Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v7 4/7] net: bcmgenet: add XDP_TX support
From: Nicolai Buchwitz @ 2026-04-16  5:47 UTC (permalink / raw)
  To: netdev
  Cc: Justin Chen, Simon Horman, Mohsin Bashir, Doug Berger,
	Florian Fainelli, Broadcom internal kernel review list,
	Andrew Lunn, Eric Dumazet, Paolo Abeni, Nicolai Buchwitz,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Stanislav Fomichev, linux-kernel, bpf
In-Reply-To: <20260416054743.1289191-1-nb@tipi-net.de>

Implement XDP_TX using ring 16 (DESC_INDEX), the hardware default
descriptor ring, dedicated to XDP TX for isolation from SKB TX queues.

Ring 16 gets 32 BDs carved from ring 0's allocation. TX completion is
piggybacked on RX NAPI poll since ring 16's INTRL2_1 bit collides with
RX ring 0, similar to how bnxt, ice, and other XDP drivers handle TX
completion within the RX poll path.

The GENET MAC has TBUF_64B_EN set globally, requiring every TX buffer
to start with a 64-byte struct status_64 (TSB). For local XDP_TX, the
TSB is prepended by moving xdp->data back by sizeof(struct status_64)
into the RSB area (unused after BPF execution) and zeroing those bytes.
For foreign frames redirected from other devices, the TSB is written
into the xdp_frame headroom.

The page_pool DMA direction is changed from DMA_FROM_DEVICE to
DMA_BIDIRECTIONAL to allow TX reuse of the existing DMA mapping.

Signed-off-by: Nicolai Buchwitz <nb@tipi-net.de>
---
 .../net/ethernet/broadcom/genet/bcmgenet.c    | 224 ++++++++++++++++--
 .../net/ethernet/broadcom/genet/bcmgenet.h    |   3 +
 2 files changed, 205 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index b09e5c3c3543..3f3682e39267 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -48,8 +48,10 @@
 
 #define GENET_Q0_RX_BD_CNT	\
 	(TOTAL_DESC - priv->hw_params->rx_queues * priv->hw_params->rx_bds_per_q)
+#define GENET_Q16_TX_BD_CNT	32
 #define GENET_Q0_TX_BD_CNT	\
-	(TOTAL_DESC - priv->hw_params->tx_queues * priv->hw_params->tx_bds_per_q)
+	(TOTAL_DESC - priv->hw_params->tx_queues * priv->hw_params->tx_bds_per_q \
+	 - GENET_Q16_TX_BD_CNT)
 
 #define RX_BUF_LENGTH		2048
 #define SKB_ALIGNMENT		32
@@ -1892,6 +1894,14 @@ static struct sk_buff *bcmgenet_free_tx_cb(struct device *dev,
 		if (cb == GENET_CB(skb)->last_cb)
 			return skb;
 
+	} else if (cb->xdpf) {
+		if (cb->xdp_dma_map)
+			dma_unmap_single(dev, dma_unmap_addr(cb, dma_addr),
+					 dma_unmap_len(cb, dma_len),
+					 DMA_TO_DEVICE);
+		dma_unmap_addr_set(cb, dma_addr, 0);
+		xdp_return_frame(cb->xdpf);
+		cb->xdpf = NULL;
 	} else if (dma_unmap_addr(cb, dma_addr)) {
 		dma_unmap_page(dev,
 			       dma_unmap_addr(cb, dma_addr),
@@ -1924,10 +1934,16 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev,
 	unsigned int pkts_compl = 0;
 	unsigned int txbds_ready;
 	unsigned int c_index;
+	struct enet_cb *tx_cb;
 	struct sk_buff *skb;
 
-	/* Clear status before servicing to reduce spurious interrupts */
-	bcmgenet_intrl2_1_writel(priv, (1 << ring->index), INTRL2_CPU_CLEAR);
+	/* Clear status before servicing to reduce spurious interrupts.
+	 * Ring DESC_INDEX (XDP TX) has no interrupt; skip the clear to
+	 * avoid clobbering RX ring 0's bit at the same position.
+	 */
+	if (ring->index != DESC_INDEX)
+		bcmgenet_intrl2_1_writel(priv, BIT(ring->index),
+					 INTRL2_CPU_CLEAR);
 
 	/* Compute how many buffers are transmitted since last xmit call */
 	c_index = bcmgenet_tdma_ring_readl(priv, ring->index, TDMA_CONS_INDEX)
@@ -1940,8 +1956,15 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev,
 
 	/* Reclaim transmitted buffers */
 	while (txbds_processed < txbds_ready) {
-		skb = bcmgenet_free_tx_cb(&priv->pdev->dev,
-					  &priv->tx_cbs[ring->clean_ptr]);
+		tx_cb = &priv->tx_cbs[ring->clean_ptr];
+		if (tx_cb->xdpf) {
+			pkts_compl++;
+			bytes_compl += tx_cb->xdp_dma_map
+				? tx_cb->xdpf->len
+				: tx_cb->xdpf->len -
+				  sizeof(struct status_64);
+		}
+		skb = bcmgenet_free_tx_cb(&priv->pdev->dev, tx_cb);
 		if (skb) {
 			pkts_compl++;
 			bytes_compl += GENET_CB(skb)->bytes_sent;
@@ -1963,8 +1986,11 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev,
 	u64_stats_add(&stats->bytes, bytes_compl);
 	u64_stats_update_end(&stats->syncp);
 
-	netdev_tx_completed_queue(netdev_get_tx_queue(dev, ring->index),
-				  pkts_compl, bytes_compl);
+	/* Ring DESC_INDEX (XDP TX) has no netdev TX queue; skip BQL */
+	if (ring->index != DESC_INDEX)
+		netdev_tx_completed_queue(netdev_get_tx_queue(dev,
+							      ring->index),
+					  pkts_compl, bytes_compl);
 
 	return txbds_processed;
 }
@@ -2041,6 +2067,9 @@ static void bcmgenet_tx_reclaim_all(struct net_device *dev)
 	do {
 		bcmgenet_tx_reclaim(dev, &priv->tx_rings[i++], true);
 	} while (i <= priv->hw_params->tx_queues && netif_is_multiqueue(dev));
+
+	/* Also reclaim XDP TX ring */
+	bcmgenet_tx_reclaim(dev, &priv->xdp_tx_ring, true);
 }
 
 /* Reallocate the SKB to put enough headroom in front of it and insert
@@ -2297,11 +2326,96 @@ static struct sk_buff *bcmgenet_xdp_build_skb(struct bcmgenet_rx_ring *ring,
 	return skb;
 }
 
+static bool bcmgenet_xdp_xmit_frame(struct bcmgenet_priv *priv,
+				     struct xdp_frame *xdpf, bool dma_map)
+{
+	struct bcmgenet_tx_ring *ring = &priv->xdp_tx_ring;
+	struct device *kdev = &priv->pdev->dev;
+	struct enet_cb *tx_cb_ptr;
+	dma_addr_t mapping;
+	unsigned int dma_len;
+	u32 len_stat;
+
+	spin_lock(&ring->lock);
+
+	if (ring->free_bds < 1) {
+		spin_unlock(&ring->lock);
+		return false;
+	}
+
+	tx_cb_ptr = bcmgenet_get_txcb(priv, ring);
+
+	if (dma_map) {
+		void *tsb_start;
+
+		/* The GENET MAC has TBUF_64B_EN set globally, so hardware
+		 * expects a 64-byte TSB prefix on every TX buffer.  For
+		 * redirected frames (ndo_xdp_xmit) we prepend a zeroed TSB
+		 * using the frame's headroom.
+		 */
+		if (unlikely(xdpf->headroom < sizeof(struct status_64))) {
+			bcmgenet_put_txcb(priv, ring);
+			spin_unlock(&ring->lock);
+			return false;
+		}
+
+		tsb_start = xdpf->data - sizeof(struct status_64);
+		memset(tsb_start, 0, sizeof(struct status_64));
+
+		dma_len = xdpf->len + sizeof(struct status_64);
+		mapping = dma_map_single(kdev, tsb_start, dma_len,
+					 DMA_TO_DEVICE);
+		if (dma_mapping_error(kdev, mapping)) {
+			tx_cb_ptr->skb = NULL;
+			tx_cb_ptr->xdpf = NULL;
+			bcmgenet_put_txcb(priv, ring);
+			spin_unlock(&ring->lock);
+			return false;
+		}
+	} else {
+		struct page *page = virt_to_page(xdpf->data);
+
+		/* For local XDP_TX the caller already prepended the TSB
+		 * into xdpf->data/len, so dma_len == xdpf->len.
+		 */
+		dma_len = xdpf->len;
+		mapping = page_pool_get_dma_addr(page) +
+			  sizeof(*xdpf) + xdpf->headroom;
+		dma_sync_single_for_device(kdev, mapping, dma_len,
+					   DMA_BIDIRECTIONAL);
+	}
+
+	dma_unmap_addr_set(tx_cb_ptr, dma_addr, mapping);
+	dma_unmap_len_set(tx_cb_ptr, dma_len, dma_len);
+	tx_cb_ptr->skb = NULL;
+	tx_cb_ptr->xdpf = xdpf;
+	tx_cb_ptr->xdp_dma_map = dma_map;
+
+	len_stat = (dma_len << DMA_BUFLENGTH_SHIFT) |
+		   (priv->hw_params->qtag_mask << DMA_TX_QTAG_SHIFT) |
+		   DMA_TX_APPEND_CRC | DMA_SOP | DMA_EOP;
+
+	dmadesc_set(priv, tx_cb_ptr->bd_addr, mapping, len_stat);
+
+	ring->free_bds--;
+	ring->prod_index++;
+	ring->prod_index &= DMA_P_INDEX_MASK;
+
+	bcmgenet_tdma_ring_writel(priv, ring->index, ring->prod_index,
+				  TDMA_PROD_INDEX);
+
+	spin_unlock(&ring->lock);
+
+	return true;
+}
+
 static unsigned int bcmgenet_run_xdp(struct bcmgenet_rx_ring *ring,
 				     struct bpf_prog *prog,
 				     struct xdp_buff *xdp,
 				     struct page *rx_page)
 {
+	struct bcmgenet_priv *priv = ring->priv;
+	struct xdp_frame *xdpf;
 	unsigned int act;
 
 	if (!prog)
@@ -2312,14 +2426,42 @@ static unsigned int bcmgenet_run_xdp(struct bcmgenet_rx_ring *ring,
 	switch (act) {
 	case XDP_PASS:
 		return XDP_PASS;
+	case XDP_TX:
+		/* Prepend a zeroed TSB (Transmit Status Block).  The GENET
+		 * MAC has TBUF_64B_EN set globally, so hardware expects every
+		 * TX buffer to begin with a 64-byte struct status_64.  Back
+		 * up xdp->data into the RSB area (which is no longer needed
+		 * after the BPF program ran) and zero it.
+		 */
+		if (xdp->data - xdp->data_hard_start <
+		    sizeof(struct status_64) + sizeof(struct xdp_frame)) {
+			page_pool_put_full_page(ring->page_pool, rx_page,
+						true);
+			return XDP_DROP;
+		}
+		xdp->data -= sizeof(struct status_64);
+		xdp->data_meta -= sizeof(struct status_64);
+		memset(xdp->data, 0, sizeof(struct status_64));
+
+		xdpf = xdp_convert_buff_to_frame(xdp);
+		if (unlikely(!xdpf)) {
+			page_pool_put_full_page(ring->page_pool, rx_page,
+						true);
+			return XDP_DROP;
+		}
+		if (unlikely(!bcmgenet_xdp_xmit_frame(priv, xdpf, false))) {
+			xdp_return_frame_rx_napi(xdpf);
+			return XDP_DROP;
+		}
+		return XDP_TX;
 	case XDP_DROP:
 		page_pool_put_full_page(ring->page_pool, rx_page, true);
 		return XDP_DROP;
 	default:
-		bpf_warn_invalid_xdp_action(ring->priv->dev, prog, act);
+		bpf_warn_invalid_xdp_action(priv->dev, prog, act);
 		fallthrough;
 	case XDP_ABORTED:
-		trace_xdp_exception(ring->priv->dev, prog, act);
+		trace_xdp_exception(priv->dev, prog, act);
 		page_pool_put_full_page(ring->page_pool, rx_page, true);
 		return XDP_ABORTED;
 	}
@@ -2537,9 +2679,15 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcmgenet_rx_ring *ring = container_of(napi,
 			struct bcmgenet_rx_ring, napi);
+	struct bcmgenet_priv *priv = ring->priv;
 	struct dim_sample dim_sample = {};
 	unsigned int work_done;
 
+	/* Reclaim completed XDP TX frames (ring 16 has no interrupt) */
+	if (priv->xdp_tx_ring.free_bds < priv->xdp_tx_ring.size)
+		bcmgenet_tx_reclaim(priv->dev,
+				    &priv->xdp_tx_ring, false);
+
 	work_done = bcmgenet_desc_rx(ring, budget);
 
 	if (work_done < budget && napi_complete_done(napi, work_done))
@@ -2770,10 +2918,11 @@ static void bcmgenet_init_rx_coalesce(struct bcmgenet_rx_ring *ring)
 
 /* Initialize a Tx ring along with corresponding hardware registers */
 static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
+				  struct bcmgenet_tx_ring *ring,
 				  unsigned int index, unsigned int size,
-				  unsigned int start_ptr, unsigned int end_ptr)
+				  unsigned int start_ptr,
+				  unsigned int end_ptr)
 {
-	struct bcmgenet_tx_ring *ring = &priv->tx_rings[index];
 	u32 words_per_bd = WORDS_PER_BD(priv);
 	u32 flow_period_val = 0;
 
@@ -2814,8 +2963,11 @@ static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
 	bcmgenet_tdma_ring_writel(priv, index, end_ptr * words_per_bd - 1,
 				  DMA_END_ADDR);
 
-	/* Initialize Tx NAPI */
-	netif_napi_add_tx(priv->dev, &ring->napi, bcmgenet_tx_poll);
+	/* Initialize Tx NAPI for priority queues only; ring DESC_INDEX
+	 * (XDP TX) has its completions handled inline in RX NAPI.
+	 */
+	if (index != DESC_INDEX)
+		netif_napi_add_tx(priv->dev, &ring->napi, bcmgenet_tx_poll);
 }
 
 static int bcmgenet_rx_ring_create_pool(struct bcmgenet_priv *priv,
@@ -2827,7 +2979,7 @@ static int bcmgenet_rx_ring_create_pool(struct bcmgenet_priv *priv,
 		.pool_size = ring->size,
 		.nid = NUMA_NO_NODE,
 		.dev = &priv->pdev->dev,
-		.dma_dir = DMA_FROM_DEVICE,
+		.dma_dir = DMA_BIDIRECTIONAL,
 		.offset = XDP_PACKET_HEADROOM,
 		.max_len = RX_BUF_LENGTH,
 	};
@@ -2961,6 +3113,7 @@ static int bcmgenet_tdma_disable(struct bcmgenet_priv *priv)
 
 	reg = bcmgenet_tdma_readl(priv, DMA_CTRL);
 	mask = (1 << (priv->hw_params->tx_queues + 1)) - 1;
+	mask |= BIT(DESC_INDEX);
 	mask = (mask << DMA_RING_BUF_EN_SHIFT) | DMA_EN;
 	reg &= ~mask;
 	bcmgenet_tdma_writel(priv, reg, DMA_CTRL);
@@ -3006,14 +3159,18 @@ static int bcmgenet_rdma_disable(struct bcmgenet_priv *priv)
  * with queue 1 being the highest priority queue.
  *
  * Queue 0 is the default Tx queue with
- * GENET_Q0_TX_BD_CNT = 256 - 4 * 32 = 128 descriptors.
+ * GENET_Q0_TX_BD_CNT = 256 - 4 * 32 - 32 = 96 descriptors.
+ *
+ * Ring 16 (DESC_INDEX) is used for XDP TX with
+ * GENET_Q16_TX_BD_CNT = 32 descriptors.
  *
  * The transmit control block pool is then partitioned as follows:
- * - Tx queue 0 uses tx_cbs[0..127]
- * - Tx queue 1 uses tx_cbs[128..159]
- * - Tx queue 2 uses tx_cbs[160..191]
- * - Tx queue 3 uses tx_cbs[192..223]
- * - Tx queue 4 uses tx_cbs[224..255]
+ * - Tx queue 0 uses tx_cbs[0..95]
+ * - Tx queue 1 uses tx_cbs[96..127]
+ * - Tx queue 2 uses tx_cbs[128..159]
+ * - Tx queue 3 uses tx_cbs[160..191]
+ * - Tx queue 4 uses tx_cbs[192..223]
+ * - Tx queue 16 uses tx_cbs[224..255]
  */
 static void bcmgenet_init_tx_queues(struct net_device *dev)
 {
@@ -3026,7 +3183,8 @@ static void bcmgenet_init_tx_queues(struct net_device *dev)
 
 	/* Initialize Tx priority queues */
 	for (i = 0; i <= priv->hw_params->tx_queues; i++) {
-		bcmgenet_init_tx_ring(priv, i, end - start, start, end);
+		bcmgenet_init_tx_ring(priv, &priv->tx_rings[i],
+				      i, end - start, start, end);
 		start = end;
 		end += priv->hw_params->tx_bds_per_q;
 		dma_priority[DMA_PRIO_REG_INDEX(i)] |=
@@ -3034,13 +3192,19 @@ static void bcmgenet_init_tx_queues(struct net_device *dev)
 			<< DMA_PRIO_REG_SHIFT(i);
 	}
 
+	/* Initialize ring 16 (descriptor ring) for XDP TX */
+	bcmgenet_init_tx_ring(priv, &priv->xdp_tx_ring,
+			      DESC_INDEX, GENET_Q16_TX_BD_CNT,
+			      TOTAL_DESC - GENET_Q16_TX_BD_CNT, TOTAL_DESC);
+
 	/* Set Tx queue priorities */
 	bcmgenet_tdma_writel(priv, dma_priority[0], DMA_PRIORITY_0);
 	bcmgenet_tdma_writel(priv, dma_priority[1], DMA_PRIORITY_1);
 	bcmgenet_tdma_writel(priv, dma_priority[2], DMA_PRIORITY_2);
 
-	/* Configure Tx queues as descriptor rings */
+	/* Configure Tx queues as descriptor rings, including ring 16 */
 	ring_mask = (1 << (priv->hw_params->tx_queues + 1)) - 1;
+	ring_mask |= BIT(DESC_INDEX);
 	bcmgenet_tdma_writel(priv, ring_mask, DMA_RING_CFG);
 
 	/* Enable Tx rings */
@@ -3754,6 +3918,21 @@ static void bcmgenet_get_stats64(struct net_device *dev,
 		stats->tx_dropped += tx_dropped;
 	}
 
+	/* Include XDP TX ring (DESC_INDEX) stats */
+	tx_stats = &priv->xdp_tx_ring.stats64;
+	do {
+		start = u64_stats_fetch_begin(&tx_stats->syncp);
+		tx_bytes = u64_stats_read(&tx_stats->bytes);
+		tx_packets = u64_stats_read(&tx_stats->packets);
+		tx_errors = u64_stats_read(&tx_stats->errors);
+		tx_dropped = u64_stats_read(&tx_stats->dropped);
+	} while (u64_stats_fetch_retry(&tx_stats->syncp, start));
+
+	stats->tx_bytes += tx_bytes;
+	stats->tx_packets += tx_packets;
+	stats->tx_errors += tx_errors;
+	stats->tx_dropped += tx_dropped;
+
 	for (q = 0; q <= priv->hw_params->rx_queues; q++) {
 		rx_stats = &priv->rx_rings[q].stats64;
 		do {
@@ -4257,6 +4436,7 @@ static int bcmgenet_probe(struct platform_device *pdev)
 		u64_stats_init(&priv->rx_rings[i].stats64.syncp);
 	for (i = 0; i <= priv->hw_params->tx_queues; i++)
 		u64_stats_init(&priv->tx_rings[i].stats64.syncp);
+	u64_stats_init(&priv->xdp_tx_ring.stats64.syncp);
 
 	/* libphy will determine the link state */
 	netif_carrier_off(dev);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 1459473ac1b0..8966d32efe2f 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -472,6 +472,8 @@ struct bcmgenet_rx_stats64 {
 
 struct enet_cb {
 	struct sk_buff      *skb;
+	struct xdp_frame    *xdpf;
+	bool                xdp_dma_map;
 	struct page         *rx_page;
 	unsigned int        rx_page_offset;
 	void __iomem *bd_addr;
@@ -611,6 +613,7 @@ struct bcmgenet_priv {
 	unsigned int num_tx_bds;
 
 	struct bcmgenet_tx_ring tx_rings[GENET_MAX_MQ_CNT + 1];
+	struct bcmgenet_tx_ring xdp_tx_ring;
 
 	/* receive variables */
 	void __iomem *rx_bds;
-- 
2.51.0


^ permalink raw reply related

* [PATCH net-next v7 0/7] net: bcmgenet: add XDP support
From: Nicolai Buchwitz @ 2026-04-16  5:47 UTC (permalink / raw)
  To: netdev
  Cc: Justin Chen, Simon Horman, Mohsin Bashir, Doug Berger,
	Florian Fainelli, Broadcom internal kernel review list,
	Andrew Lunn, Eric Dumazet, Paolo Abeni, Nicolai Buchwitz,
	Alexei Starovoitov, Daniel Borkmann, David S. Miller,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Stanislav Fomichev, bpf

Add XDP support to the bcmgenet driver, covering XDP_PASS, XDP_DROP,
XDP_TX, XDP_REDIRECT, and ndo_xdp_xmit.

The first patch converts the RX path from the existing kmalloc-based
allocation to page_pool, which is a prerequisite for XDP. The remaining
patches incrementally add XDP functionality and per-action statistics.

Tested on Raspberry Pi CM4 (BCM2711, bcmgenet, 1Gbps link):
- XDP_PASS: 943 Mbit/s TX, 935 Mbit/s RX (no regression vs baseline)
- XDP_PASS latency: 0.164ms avg, 0% packet loss
- XDP_DROP: all inbound traffic blocked as expected
- XDP_TX: TX counter increments (packet reflection working)
- Link flap with XDP attached: no errors
- Program swap under iperf3 load: no errors
- Upstream XDP selftests (xdp.py): pass_sb, drop_sb, tx_sb passing
- XDP-based EtherCAT master (~37 kHz cycle rate, all packet processing
  in BPF/XDP), stable over multiple days

Previous versions:
  v6: https://lore.kernel.org/netdev/20260406083536.839517-1-nb@tipi-net.de/
  v5: https://lore.kernel.org/netdev/20260328230513.415790-1-nb@tipi-net.de/
  v4: https://lore.kernel.org/netdev/20260323120539.136029-1-nb@tipi-net.de/
  v3: https://lore.kernel.org/netdev/20260319115402.353509-1-nb@tipi-net.de/
  v2: https://lore.kernel.org/netdev/20260315214914.1555777-1-nb@tipi-net.de/
  v1: https://lore.kernel.org/netdev/20260313092101.1344954-1-nb@tipi-net.de/

Changes since v6:
  - Removed GENET_XDP_HEADROOM alias, use XDP_PACKET_HEADROOM
    directly. (Jakub Kicinski)
  - Dropped redundant __GFP_NOWARN from page_pool_alloc_pages(),
    page_pool adds it automatically. (Jakub Kicinski)
  - Removed floating code block in desc_rx, moved variables to outer
    scope. (Jakub Kicinski)
  - Make bcmgenet_run_xdp() return XDP_PASS when no program is set,
    removing the if (xdp_prog) indentation from desc_rx.
    (Jakub Kicinski)

Changes since v5:
  - Refactored desc_rx: always prepare xdp_buff and use
    bcmgenet_xdp_build_skb for both XDP and non-XDP paths, treating
    no-prog as XDP_PASS. (Jakub Kicinski)
  - Removed synchronize_net() before bpf_prog_put(), RCU handles
    the grace period. (Jakub Kicinski)
  - Save status->rx_csum before running XDP program to prevent
    bpf_xdp_adjust_head from corrupting the RSB checksum.
    (Jakub Kicinski)
  - Tightened TSB headroom check to include sizeof(struct xdp_frame).
    (Jakub Kicinski)
  - Fixed reclaim gating: check for pending frames on the XDP TX ring
    instead of priv->xdp_prog, so in-flight frames are still reclaimed
    after XDP program detach. (Jakub Kicinski)
  - Removed dead len -= ETH_FCS_LEN in patch 1. (Mohsin Bashir)
  - Added patch 7: minimal ndo_change_mtu that rejects MTU values
    incompatible with XDP when a program is attached. (Mohsin Bashir,
    Florian Fainelli)

Changes since v4:
  - Fixed unused variable warning: moved tx_ring declaration from
    patch 4 to patch 5 where it is first used. (Jakub Kicinski)

Changes since v3:
  - Fixed xdp_prepare_buff() called with meta_valid=false, causing
    bcmgenet_xdp_build_skb() to compute metasize=UINT_MAX and corrupt
    skb meta_len. Now passes true. (Simon Horman)
  - Removed bcmgenet_dump_tx_queue() for ring 16 in bcmgenet_timeout().
    Ring 16 has no netdev TX queue, so netdev_get_tx_queue(dev, 16)
    accessed beyond the allocated _tx array. (Simon Horman)
  - Fixed checkpatch alignment warnings in patches 4 and 5.

Changes since v2:
  - Fixed page leak on partial bcmgenet_alloc_rx_buffers() failure:
    free already-allocated rx_cbs before destroying page pool.
    (Simon Horman)
  - Fixed GENET_Q16_TX_BD_CNT defined as 64 instead of 32.
    (Simon Horman)
  - Moved XDP TX ring to a separate struct member (xdp_tx_ring)
    instead of expanding tx_rings[] to DESC_INDEX+1. (Justin Chen)
  - Added synchronize_net() before bpf_prog_put() in XDP prog swap.
  - Removed goto drop_page inside switch; inlined page_pool_put
    calls in each failure path. (Justin Chen)
  - Removed unnecessary curly braces around case XDP_TX. (Justin Chen)
  - Moved int err hoisting from patch 2 to patch 1. (Justin Chen)
  - Kept return type on same line as function name, per driver
    convention. (Justin Chen)
  - XDP TX packets/bytes now counted in TX reclaim for standard
    network statistics.

Changes since v1:
  - Fixed tx_rings[DESC_INDEX] out-of-bounds access. Expanded array
    to DESC_INDEX+1 and initialized ring 16 with dedicated BDs.
  - Use ring 16 (hardware default descriptor ring) for XDP TX,
    isolating from normal SKB TX queues.
  - Piggyback ring 16 TX completion on RX NAPI poll (INTRL2_1 bit
    collision with RX ring 0).
  - Fixed ring 16 TX reclaim: skip INTRL2_1 clear, skip BQL
    completion, use non-destructive reclaim in RX poll path.
  - Prepend zeroed TSB before XDP TX frame data (TBUF_64B_EN requires
    64-byte struct status_64 prefix on all TX buffers).
  - Tested with upstream XDP selftests (xdp.py): pass_sb, drop_sb,
    tx_sb all passing. The multi-buffer tests (pass_mb, drop_mb,
    tx_mb) fail because bcmgenet does not support jumbo frames /
    MTU changes; I plan to add ndo_change_mtu support in a follow-up
    series.

Nicolai Buchwitz (7):
  net: bcmgenet: convert RX path to page_pool
  net: bcmgenet: register xdp_rxq_info for each RX ring
  net: bcmgenet: add basic XDP support (PASS/DROP)
  net: bcmgenet: add XDP_TX support
  net: bcmgenet: add XDP_REDIRECT and ndo_xdp_xmit support
  net: bcmgenet: add XDP statistics counters
  net: bcmgenet: reject MTU changes incompatible with XDP

 drivers/net/ethernet/broadcom/Kconfig         |   1 +
 .../net/ethernet/broadcom/genet/bcmgenet.c    | 637 +++++++++++++++---
 .../net/ethernet/broadcom/genet/bcmgenet.h    |  19 +
 3 files changed, 559 insertions(+), 98 deletions(-)

--
2.51.0


^ permalink raw reply

* [PATCH net-next v7 3/7] net: bcmgenet: add basic XDP support (PASS/DROP)
From: Nicolai Buchwitz @ 2026-04-16  5:47 UTC (permalink / raw)
  To: netdev
  Cc: Justin Chen, Simon Horman, Mohsin Bashir, Doug Berger,
	Florian Fainelli, Broadcom internal kernel review list,
	Andrew Lunn, Eric Dumazet, Paolo Abeni, Nicolai Buchwitz,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Stanislav Fomichev, linux-kernel, bpf
In-Reply-To: <20260416054743.1289191-1-nb@tipi-net.de>

Add XDP program attachment via ndo_bpf and execute XDP programs in the
RX path. XDP_PASS builds an SKB from the xdp_buff (handling
xdp_adjust_head/tail); XDP_DROP returns the page to page_pool without
SKB allocation.

XDP_TX and XDP_REDIRECT are not yet supported and return XDP_ABORTED.

Advertise NETDEV_XDP_ACT_BASIC in xdp_features.

Signed-off-by: Nicolai Buchwitz <nb@tipi-net.de>
---
 .../net/ethernet/broadcom/genet/bcmgenet.c    | 129 +++++++++++++++---
 .../net/ethernet/broadcom/genet/bcmgenet.h    |   4 +
 2 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index dd00196b9d4b..b09e5c3c3543 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -35,6 +35,8 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/phy.h>
+#include <linux/bpf_trace.h>
+#include <linux/filter.h>
 
 #include <linux/unaligned.h>
 
@@ -2273,6 +2275,56 @@ static int bcmgenet_rx_refill(struct bcmgenet_rx_ring *ring,
 	return 0;
 }
 
+static struct sk_buff *bcmgenet_xdp_build_skb(struct bcmgenet_rx_ring *ring,
+					      struct xdp_buff *xdp)
+{
+	unsigned int metasize;
+	struct sk_buff *skb;
+
+	skb = napi_build_skb(xdp->data_hard_start, PAGE_SIZE);
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_mark_for_recycle(skb);
+
+	metasize = xdp->data - xdp->data_meta;
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	__skb_put(skb, xdp->data_end - xdp->data);
+
+	if (metasize)
+		skb_metadata_set(skb, metasize);
+
+	return skb;
+}
+
+static unsigned int bcmgenet_run_xdp(struct bcmgenet_rx_ring *ring,
+				     struct bpf_prog *prog,
+				     struct xdp_buff *xdp,
+				     struct page *rx_page)
+{
+	unsigned int act;
+
+	if (!prog)
+		return XDP_PASS;
+
+	act = bpf_prog_run_xdp(prog, xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		return XDP_PASS;
+	case XDP_DROP:
+		page_pool_put_full_page(ring->page_pool, rx_page, true);
+		return XDP_DROP;
+	default:
+		bpf_warn_invalid_xdp_action(ring->priv->dev, prog, act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(ring->priv->dev, prog, act);
+		page_pool_put_full_page(ring->page_pool, rx_page, true);
+		return XDP_ABORTED;
+	}
+}
+
 /* bcmgenet_desc_rx - descriptor based rx process.
  * this could be called from bottom half, or from NAPI polling method.
  */
@@ -2282,6 +2334,7 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	struct bcmgenet_rx_stats64 *stats = &ring->stats64;
 	struct bcmgenet_priv *priv = ring->priv;
 	struct net_device *dev = priv->dev;
+	struct bpf_prog *xdp_prog;
 	struct enet_cb *cb;
 	struct sk_buff *skb;
 	u32 dma_length_status;
@@ -2292,6 +2345,8 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	unsigned int p_index, mask;
 	unsigned int discards;
 
+	xdp_prog = READ_ONCE(priv->xdp_prog);
+
 	/* Clear status before servicing to reduce spurious interrupts */
 	mask = 1 << (UMAC_IRQ1_RX_INTR_SHIFT + ring->index);
 	bcmgenet_intrl2_1_writel(priv, mask, INTRL2_CPU_CLEAR);
@@ -2323,9 +2378,12 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	       (rxpktprocessed < budget)) {
 		struct status_64 *status;
 		struct page *rx_page;
+		unsigned int xdp_act;
 		unsigned int rx_off;
-		__be16 rx_csum;
+		struct xdp_buff xdp;
+		__be16 rx_csum = 0;
 		void *hard_start;
+		int pkt_len;
 
 		cb = &priv->rx_cbs[ring->read_ptr];
 
@@ -2402,30 +2460,34 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 			goto next;
 		} /* error packet */
 
-		/* Build SKB from the page - data starts at hard_start,
-		 * frame begins after RSB(64) + pad(2) = 66 bytes.
+		pkt_len = len - GENET_RSB_PAD;
+		if (priv->crc_fwd_en)
+			pkt_len -= ETH_FCS_LEN;
+
+		/* Save rx_csum before XDP runs - an XDP program
+		 * could overwrite the RSB via bpf_xdp_adjust_head.
 		 */
-		skb = napi_build_skb(hard_start, PAGE_SIZE - XDP_PACKET_HEADROOM);
-		if (unlikely(!skb)) {
-			BCMGENET_STATS64_INC(stats, dropped);
-			page_pool_put_full_page(ring->page_pool, rx_page,
-						true);
-			goto next;
-		}
+		if (dev->features & NETIF_F_RXCSUM)
+			rx_csum = (__force __be16)(status->rx_csum & 0xffff);
 
-		skb_mark_for_recycle(skb);
+		xdp_init_buff(&xdp, PAGE_SIZE, &ring->xdp_rxq);
+		xdp_prepare_buff(&xdp, page_address(rx_page),
+				 GENET_RX_HEADROOM, pkt_len, true);
 
-		/* Reserve the RSB + pad, then set the data length */
-		skb_reserve(skb, GENET_RSB_PAD);
-		__skb_put(skb, len - GENET_RSB_PAD);
+		xdp_act = bcmgenet_run_xdp(ring, xdp_prog, &xdp, rx_page);
+		if (xdp_act != XDP_PASS)
+			goto next;
 
-		if (priv->crc_fwd_en) {
-			skb_trim(skb, skb->len - ETH_FCS_LEN);
+		skb = bcmgenet_xdp_build_skb(ring, &xdp);
+		if (unlikely(!skb)) {
+			BCMGENET_STATS64_INC(stats, dropped);
+			page_pool_put_full_page(ring->page_pool,
+						rx_page, true);
+			goto next;
 		}
 
 		/* Set up checksum offload */
 		if (dev->features & NETIF_F_RXCSUM) {
-			rx_csum = (__force __be16)(status->rx_csum & 0xffff);
 			if (rx_csum) {
 				skb->csum = (__force __wsum)ntohs(rx_csum);
 				skb->ip_summed = CHECKSUM_COMPLETE;
@@ -3743,6 +3805,37 @@ static int bcmgenet_change_carrier(struct net_device *dev, bool new_carrier)
 	return 0;
 }
 
+static int bcmgenet_xdp_setup(struct net_device *dev,
+			      struct netdev_bpf *xdp)
+{
+	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	struct bpf_prog *prog = xdp->prog;
+
+	if (prog && dev->mtu > PAGE_SIZE - GENET_RX_HEADROOM -
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) {
+		NL_SET_ERR_MSG_MOD(xdp->extack,
+				   "MTU too large for single-page XDP buffer");
+		return -EOPNOTSUPP;
+	}
+
+	old_prog = xchg(&priv->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int bcmgenet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return bcmgenet_xdp_setup(dev, xdp);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops bcmgenet_netdev_ops = {
 	.ndo_open		= bcmgenet_open,
 	.ndo_stop		= bcmgenet_close,
@@ -3754,6 +3847,7 @@ static const struct net_device_ops bcmgenet_netdev_ops = {
 	.ndo_set_features	= bcmgenet_set_features,
 	.ndo_get_stats64	= bcmgenet_get_stats64,
 	.ndo_change_carrier	= bcmgenet_change_carrier,
+	.ndo_bpf		= bcmgenet_xdp,
 };
 
 /* GENET hardware parameters/characteristics */
@@ -4056,6 +4150,7 @@ static int bcmgenet_probe(struct platform_device *pdev)
 			 NETIF_F_RXCSUM;
 	dev->hw_features |= dev->features;
 	dev->vlan_features |= dev->features;
+	dev->xdp_features = NETDEV_XDP_ACT_BASIC;
 
 	netdev_sw_irq_coalesce_default_on(dev);
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 82a6d29f481d..1459473ac1b0 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,6 +16,7 @@
 #include <linux/dim.h>
 #include <linux/ethtool.h>
 #include <net/page_pool/helpers.h>
+#include <linux/bpf.h>
 #include <net/xdp.h>
 
 #include "../unimac.h"
@@ -671,6 +672,9 @@ struct bcmgenet_priv {
 	u8 sopass[SOPASS_MAX];
 
 	struct bcmgenet_mib_counters mib;
+
+	/* XDP */
+	struct bpf_prog *xdp_prog;
 };
 
 static inline bool bcmgenet_has_40bits(struct bcmgenet_priv *priv)
-- 
2.51.0


^ permalink raw reply related

* [PATCH RFC] xfrm: enforce SPI uniqueness for inbound SAs only
From: Antony Antony @ 2026-04-16  5:44 UTC (permalink / raw)
  To: Steffen Klassert, Herbert Xu, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Aakash Kumar S, Yan Yan
  Cc: Abed Mohammad Kamaluddin, Nathan Harold, netdev, devel,
	Antony Antony

Per RFC 4301 section 4.4.2.1, the SPI is selected by the receiving
end, which is interpreted as making SPI uniqueness an inbound-only
requirement.

Commit 94f39804d891 ("xfrm: Duplicate SPI Handling") introduced
xfrm_state_lookup_spi_proto() to fix duplicate SPI allocation for
inbound SAs with different destination addresses.  However, it enforces
global uniqueness by (spi, proto) across all states regardless of
direction, which causes SPI allocation to fail for outbound SAs when
the same (spi, proto) is already in use by an inbound SA.

When x->dir == XFRM_SA_DIR_IN, enforce SPI uniqueness via
xfrm_state_lookup_input_spi() (renamed from
xfrm_state_lookup_spi_proto()), which now only matches inbound SAs.
SAs created via PF_KEY, without a direction, or with XFRM_SA_DIR_OUT
fall back to the pre-94f39804d891, RFC 2401-style lookup by
(daddr, spi, proto).

Reported-by: Yan Yan <evitayan@google.com>
Fixes: 94f39804d891 ("xfrm: Duplicate SPI Handling")
Signed-off-by: Antony Antony <antony.antony@secunet.com>
---
 net/xfrm/xfrm_state.c | 16 ++++++++++++++--
 net/xfrm/xfrm_user.c  |  6 +++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1748d374abca..b1ec95141512 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1698,15 +1698,21 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
 }
 EXPORT_SYMBOL(xfrm_state_lookup_byspi);
 
-static struct xfrm_state *xfrm_state_lookup_spi_proto(struct net *net, __be32 spi, u8 proto)
+static struct xfrm_state *xfrm_state_lookup_input_spi(struct net *net,
+						      __be32 spi, u8 proto,
+						      u8 dir)
 {
 	struct xfrm_state *x;
 	unsigned int i;
 
 	for (i = 0; i <= net->xfrm.state_hmask; i++) {
 		hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_byspi, net) + i, byspi) {
+			if (x->dir != dir)
+				continue;
+
 			if (x->id.spi == spi && x->id.proto == proto)
 				return x;
+
 		}
 	}
 	return NULL;
@@ -2578,6 +2584,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
 	struct xfrm_state *x0;
 	int err = -ENOENT;
 	u32 range = high - low + 1;
+	u32 mark = x->mark.v & x->mark.m;
 	__be32 newspi = 0;
 
 	spin_lock_bh(&x->lock);
@@ -2599,7 +2606,12 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
 		newspi = htonl(spi);
 
 		spin_lock_bh(&net->xfrm.xfrm_state_lock);
-		x0 = xfrm_state_lookup_spi_proto(net, newspi, x->id.proto);
+		if (x->dir == XFRM_SA_DIR_IN)
+			x0 = xfrm_state_lookup_input_spi(net, newspi,
+							 x->id.proto, x->dir);
+		else
+			x0 = xfrm_state_lookup(net, mark, &x->id.daddr, newspi,
+					       x->id.proto, x->props.family);
 		if (!x0) {
 			x->id.spi = newspi;
 			h = xfrm_spi_hash(net, &x->id.daddr, newspi, x->id.proto, x->props.family);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d56450f61669..f9db2d2c392b 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1883,13 +1883,13 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto out_noput;
 	}
 
+	if (attrs[XFRMA_SA_DIR])
+		x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);
+
 	err = xfrm_alloc_spi(x, p->min, p->max, extack);
 	if (err)
 		goto out;
 
-	if (attrs[XFRMA_SA_DIR])
-		x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);
-
 	resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
 	if (IS_ERR(resp_skb)) {
 		err = PTR_ERR(resp_skb);

---
base-commit: 426c355742f02cf743b347d9d7dbdc1bfbfa31ef
change-id: 20260330-alloc-spi-dir-3b6e2f4b34e9

Best regards,
--  
Antony Antony <antony.antony@secunet.com>


^ permalink raw reply related

* Re: [PATCH v3 net] ax25: fix OOB read after address header strip in ax25_rcv()
From: Ashutosh Desai @ 2026-04-16  5:39 UTC (permalink / raw)
  To: David Laight
  Cc: netdev, linux-hams, jreuter, davem, edumazet, kuba, pabeni, horms,
	stable, linux-kernel
In-Reply-To: <20260415085921.757b48a0@pumpkin>

On Wed, 15 Apr 2026 08:59:21 +0100, David Laight wrote:
> Is it just worth linearising the skb on entry to all this code?

Thanks for the feedback, David.

skb_linearize() on entry is a nice idea for simplifying sanity checks
overall, but it wouldn't fix this particular bug on its own - the issue
is skb->len dropping to zero after skb_pull(), not non-linear data. We'd
still need a length check regardless. pskb_may_pull(skb, 2) handles both
in one call.

That said, linearizing on entry to ax25_rcv() as a cleanup to simplify
future checks sounds worthwhile - happy to send that as a separate
net-next patch.

^ permalink raw reply

* Re: [PATCH net 1/1] 8021q: free cleared egress QoS mappings safely
From: Yuan Tan @ 2026-04-16  5:35 UTC (permalink / raw)
  To: Simon Horman, Ren Wei
  Cc: netdev, andrew+netdev, davem, edumazet, kuba, pabeni, kees,
	yifanwucs, tomapufckgml, bird, ylong030
In-Reply-To: <20260415151545.GM772670@horms.kernel.org>


On 4/15/26 08:15, Simon Horman wrote:
> On Mon, Apr 13, 2026 at 05:07:20PM +0800, Ren Wei wrote:
>> From: Longxuan Yu <ylong030@ucr.edu>
>>
>> vlan_dev_set_egress_priority() leaves cleared egress priority mapping
>> nodes in the hash until device teardown. Repeated set/clear cycles with
>> distinct skb priorities therefore allocate an unbounded number of
>> vlan_priority_tci_mapping objects and leak memory.
>>
>> Delete mappings when vlan_prio is cleared instead of keeping
>> tombstones. The TX fast path and reporting paths walk the lists without
>> RTNL, so convert the egress mapping lists to RCU-protected pointers and
>> defer freeing removed nodes until after a grace period.
>>
>> Cc: stable@kernel.org
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>> Reported-by: Yifan Wu <yifanwucs@gmail.com>
>> Reported-by: Juefei Pu <tomapufckgml@gmail.com>
>> Co-developed-by: Yuan Tan <yuantan098@gmail.com>
>> Signed-off-by: Yuan Tan <yuantan098@gmail.com>
>> Suggested-by: Xin Liu <bird@lzu.edu.cn>
>> Signed-off-by: Longxuan Yu <ylong030@ucr.edu>
>> Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
>> ---
>>  include/linux/if_vlan.h  | 23 +++++++++++--------
>>  net/8021q/vlan_dev.c     | 48 +++++++++++++++++++++++-----------------
>>  net/8021q/vlan_netlink.c |  9 +++-----
>>  net/8021q/vlanproc.c     | 12 ++++++----
>>  4 files changed, 53 insertions(+), 39 deletions(-)
> There is a lot of change here. And I'd suggest splitting the patch up into
> (at least) two patches:
>
> 1. Convert mappings to use RCU
> 2. Fix bug
>
> As is, the bug fix itself is difficult to isolate amongst the other changes.
>
> Also, AI generated review suggests that this bug was introduced by commit
> b020cb488586 ("[VLAN]: Keep track of number of QoS mappings"). If so,
> it would be appropriate to use that commit in the Fixes tag.
>
Thank you very much for your review and suggestions. We will try to
revise it in this direction.
May I ask whether we should include your “Suggested-by” tag in the patch?


^ permalink raw reply

* Re: [RFC PATCH 4/4] nfs: allow P2PDMA in direct I/O path
From: Christoph Hellwig @ 2026-04-16  5:29 UTC (permalink / raw)
  To: Pranjal Shrivastava
  Cc: Christoph Hellwig, trond.myklebust, anna, davem, kuba, edumazet,
	pabeni, chuck.lever, jlayton, tom, okorniev, neil, dai.ngo,
	linux-nfs, netdev
In-Reply-To: <ad6c5fI0HsHkUbKH@google.com>

On Tue, Apr 14, 2026 at 08:00:37PM +0000, Pranjal Shrivastava wrote:
> > Please split the conversion to iov_iter_extract_pages into a separate
> > preparation patch, or even a separate series.  That is a long overdue
> > change that fixes potential data corruption in XFS.
> > 
> 
> Sure, I'll send out a series with the migration to 
> iov_iter_extract_pages, should I club this with the pin-aware + folios
> for direct I/O or send it as a separate series?

I think combining all this sounds fine.  I'd just do the P2P separately
as it is bound to get quite a bit more complicated.


^ permalink raw reply

* Re: [RFC PATCH 3/4] nfs: make nfs_page pin-aware
From: Christoph Hellwig @ 2026-04-16  5:28 UTC (permalink / raw)
  To: Pranjal Shrivastava
  Cc: Christoph Hellwig, trond.myklebust, anna, davem, kuba, edumazet,
	pabeni, chuck.lever, jlayton, tom, okorniev, neil, dai.ngo,
	linux-nfs, netdev
In-Reply-To: <ad6cVbDGy3alQ2uK@google.com>

On Tue, Apr 14, 2026 at 07:58:13PM +0000, Pranjal Shrivastava wrote:
> > > +			req = nfs_page_create_from_page(dreq->ctx, pagevec[i], false,
> > >  							pgbase, pos, req_len);
> > >
> > 
> > A lot of this code reads pretty odd as it's overflowing the lines.
> > 
> 
> Ahh, my bad. For some reason even checkpatch didn't catch this, I'll fix
> this here and everywhere else.

checkpatch is unfortunately completely broken :(  It misses lots of
important bits, but at the same time has completely incoherent and crazy
warnings.

^ permalink raw reply

* Re: [PATCH net v2 2/2] bnge: remove unsupported backing store type
From: Vikas Gupta @ 2026-04-16  5:22 UTC (permalink / raw)
  To: Przemek Kitszel
  Cc: dharmender.garg, netdev, davem, edumazet, kuba, pabeni,
	andrew+netdev, horms, linux-kernel, vsrama-krishna.nemani,
	bhargava.marreddy, rajashekar.hudumula, ajit.khaparde,
	rahul-rg.gupta
In-Reply-To: <b2735cbf-34ac-4ad8-b524-2aa0f57511f8@intel.com>

[-- Attachment #1: Type: text/plain, Size: 2548 bytes --]

On Thu, Apr 16, 2026 at 9:24 AM Przemek Kitszel
<przemyslaw.kitszel@intel.com> wrote:
>
> On 4/15/26 17:16, Vikas Gupta wrote:
> > The backing store type, BNGE_CTX_MRAV, is not applicable in Thor Ultra
> > devices. Remove it from the backing store configuration, as the firmware
>
> I guess the removed code was needed for previous devices, what is the
> impact for them?

This driver does not support previous devices. Thor Ultra devices have
split MRAV into two separate contexts, MR and AV. Support for them will
be added in a future patch series.

>
> > will not populate entities in this backing store type, due to which the
> > driver load fails.
> >
> > Fixes: 29c5b358f385 ("bng_en: Add backing store support")
> > Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
> > Reviewed-by: Dharmender Garg <dharmender.garg@broadcom.com>
> > ---
> >   drivers/net/ethernet/broadcom/bnge/bnge_rmem.c | 16 ----------------
> >   1 file changed, 16 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
> > index 94f15e08a88c..b066ee887a09 100644
> > --- a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
> > +++ b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
> > @@ -324,7 +324,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
> >       u32 l2_qps, qp1_qps, max_qps;
> >       u32 ena, entries_sp, entries;
> >       u32 srqs, max_srqs, min;
> > -     u32 num_mr, num_ah;
> >       u32 extra_srqs = 0;
> >       u32 extra_qps = 0;
> >       u32 fast_qpmd_qps;
> > @@ -390,21 +389,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
> >       if (!bnge_is_roce_en(bd))
> >               goto skip_rdma;
> >
> > -     ctxm = &ctx->ctx_arr[BNGE_CTX_MRAV];
> > -     /* 128K extra is needed to accommodate static AH context
> > -      * allocation by f/w.
> > -      */
> > -     num_mr = min_t(u32, ctxm->max_entries / 2, 1024 * 256);
> > -     num_ah = min_t(u32, num_mr, 1024 * 128);
> > -     ctxm->split_entry_cnt = BNGE_CTX_MRAV_AV_SPLIT_ENTRY + 1;
> > -     if (!ctxm->mrav_av_entries || ctxm->mrav_av_entries > num_ah)
> > -             ctxm->mrav_av_entries = num_ah;
> > -
> > -     rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, num_mr + num_ah, 2);
> > -     if (rc)
> > -             return rc;
> > -     ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV;
> > -
> >       ctxm = &ctx->ctx_arr[BNGE_CTX_TIM];
> >       rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, l2_qps + qp1_qps + extra_qps, 1);
> >       if (rc)
>

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5465 bytes --]

^ permalink raw reply

* Re: [PATCH v4] nfc: hci: fix out-of-bounds read in HCP header parsing
From: Ashutosh Desai @ 2026-04-16  5:21 UTC (permalink / raw)
  To: Simon Horman; +Cc: netdev, kuba, edumazet, davem, pabeni, linux-kernel
In-Reply-To: <20260415162641.GO772670@horms.kernel.org>

Apologies for the noise, fixed whitespace damage and sent v5.

^ permalink raw reply

* [PATCH v5 net] nfc: hci: fix out-of-bounds read in HCP header parsing
From: Ashutosh Desai @ 2026-04-16  5:15 UTC (permalink / raw)
  To: netdev
  Cc: kuba, edumazet, davem, pabeni, horms, stable, linux-kernel,
	Ashutosh Desai

nfc_hci_recv_from_llc() and nci_hci_data_received_cb() cast skb->data
to struct hcp_packet and read the message header byte without checking
that enough data is present in the linear sk_buff area. A malicious NFC
peer can send a 1-byte HCP frame that passes through the SHDLC layer
and reaches these functions, causing an out-of-bounds heap read.

Fix this by adding pskb_may_pull() before each cast to ensure the full
2-byte HCP header is pulled into the linear area before it is accessed.

Fixes: 8b8d2e08bf0d ("NFC: HCI support")
Fixes: 11f54f228643 ("NFC: nci: Add HCI over NCI protocol support")
Cc: stable@vger.kernel.org
Signed-off-by: Ashutosh Desai <ashutoshdesai993@gmail.com>
---
V4 -> V5: fix whitespace damage
V3 -> V4: add Fixes tags
V2 -> V3: drop redundant checks from nfc_hci_msg_rx_work/nci_hci_msg_rx_work;
          remove incorrect Suggested-by tag
V1 -> V2: use pskb_may_pull() instead of skb->len check

v4: https://lore.kernel.org/netdev/177614425081.3600288.2536320552978506086@gmail.com/
v3: https://lore.kernel.org/netdev/20260413024329.3293075-1-ashutoshdesai993@gmail.com/
v2: https://lore.kernel.org/netdev/20260409150825.2217133-1-ashutoshdesai993@gmail.com/
v1: https://lore.kernel.org/netdev/20260408223113.2009304-1-ashutoshdesai993@gmail.com/

 net/nfc/hci/core.c | 5 +++++
 net/nfc/nci/hci.c  | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index 0d33c81a15fe..cd9cf6c94a50 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -904,6 +904,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb)
 	 * unblock waiting cmd context. Otherwise, enqueue to dispatch
 	 * in separate context where handler can also execute command.
 	 */
+	if (!pskb_may_pull(hcp_skb, NFC_HCI_HCP_HEADER_LEN)) {
+		kfree_skb(hcp_skb);
+		return;
+	}
+
 	packet = (struct hcp_packet *)hcp_skb->data;
 	type = HCP_MSG_GET_TYPE(packet->message.header);
 	if (type == NFC_HCI_HCP_RESPONSE) {
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index 40ae8e5a7ec7..6e633da257d1 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -482,6 +482,11 @@ void nci_hci_data_received_cb(void *context,
 	 * unblock waiting cmd context. Otherwise, enqueue to dispatch
 	 * in separate context where handler can also execute command.
 	 */
+	if (!pskb_may_pull(hcp_skb, NCI_HCI_HCP_HEADER_LEN)) {
+		kfree_skb(hcp_skb);
+		return;
+	}
+
 	packet = (struct nci_hcp_packet *)hcp_skb->data;
 	type = NCI_HCP_MSG_GET_TYPE(packet->message.header);
 	if (type == NCI_HCI_HCP_RESPONSE) {
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH iwl-net] ice: fix infinite recursion in ice_cfg_tx_topo via ice_init_dev_hw
From: Przemek Kitszel @ 2026-04-16  4:36 UTC (permalink / raw)
  To: Jacob Keller, Simon Horman, Petr Oros
  Cc: netdev, Tony Nguyen, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Aleksandr Loktionov,
	Nikolay Aleksandrov, Daniel Zahka, Paul Greenwalt, Dave Ertman,
	Michal Swiatkowski, intel-wired-lan, linux-kernel
In-Reply-To: <f30ad78e-1eb9-4c9d-9034-c8873966de66@intel.com>

On 4/15/26 23:22, Jacob Keller wrote:
> On 4/15/2026 9:30 AM, Simon Horman wrote:
>> On Mon, Apr 13, 2026 at 09:14:20PM +0200, Petr Oros wrote:
>>> On certain E810 configurations where firmware supports Tx scheduler
>>> topology switching (tx_sched_topo_comp_mode_en), ice_cfg_tx_topo()
>>> may need to apply a new 5-layer or 9-layer topology from the DDP
>>> package. If the AQ command to set the topology fails (e.g. due to
>>> invalid DDP data or firmware limitations), the global configuration
>>> lock must still be cleared via a CORER reset.
>>>
>>> Commit 86aae43f21cf ("ice: don't leave device non-functional if Tx
>>> scheduler config fails") correctly fixed this by refactoring
>>> ice_cfg_tx_topo() to always trigger CORER after acquiring the global
>>> lock and re-initialize hardware via ice_init_hw() afterwards.
>>>
>>> However, commit 8a37f9e2ff40 ("ice: move ice_deinit_dev() to the end
>>> of deinit paths") later moved ice_init_dev_hw() into ice_init_hw(),
>>> breaking the reinit path introduced by 86aae43f21cf. This creates an
>>> infinite recursive call chain:
>>>
>>>    ice_init_hw()
>>>      ice_init_dev_hw()
>>>        ice_cfg_tx_topo()         # topology change needed
>>>          ice_deinit_hw()
>>>          ice_init_hw()           # reinit after CORER
>>>            ice_init_dev_hw()     # recurse
>>>              ice_cfg_tx_topo()
>>>                ...               # stack overflow
>>>
>>> Fix by moving ice_init_dev_hw() back out of ice_init_hw() and calling
>>> it explicitly from ice_probe() and ice_devlink_reinit_up(). The third
>>> caller, ice_cfg_tx_topo(), intentionally does not need ice_init_dev_hw()

ice_cfg_tx_topo() stops calling ice_init_dev_hw(), that is the real
change that patch does, OK

>>> during its reinit, it only needs the core HW reinitialization. This
>>> breaks the recursion cleanly without adding flags or guards.
>>>
>>> The deinit ordering changes from commit 8a37f9e2ff40 ("ice: move
>>> ice_deinit_dev() to the end of deinit paths") which fixed slow rmmod
>>> are preserved, only the init-side placement of ice_init_dev_hw() is
>>> reverted.
>>>
>>> Fixes: 8a37f9e2ff40 ("ice: move ice_deinit_dev() to the end of deinit paths")
>>> Signed-off-by: Petr Oros <poros@redhat.com>
>>
>> Hi Petr,
>>
>> I don't intend to delay this patch.
>> But could you follow-up by looking over the AI generated
>> review of this patch on sashiko.dev?
>>
>> Thanks!
> 
> I'll take a look as well. I recently included this fix in Intel Wired
> LAN update last night, so hopefully nothing too problematic...
> 
> Sashiko says:
> 
>> While this code wasn't introduced by this patch, the restructuring makes it
>> more visible: can this cause a use-after-free if the nested hardware
>> initialization fails?
>> If ice_cfg_tx_topo() triggers a topology change, it performs a CORER reset
>> followed by an unroll (ice_deinit_hw) and re-initialization (ice_init_hw). If
>> that nested ice_init_hw() fails, its unroll path frees hw->port_info and
>> destroys control queues and mutexes.

here is a talk about "prerequisite for the problem"

>> Because ice_init_dev_hw() returns void, it swallows the -ENODEV error and

and here is about code that Petr just removes, IOW, does not apply


Plausible sounding comments, yeah, I hope we will not drown in the sea
of AI content :(

for the patch:
I have tested that it does not break my test suite (I was the one who
started touching ice_init_hw() and friends), and both the code and the
human-written commit message look good,

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>

thank you for fixing the code after me!

^ permalink raw reply

* Re: [PATCH nf] netfilter: nf_tables: use RCU-safe list primitives for basechain hook list
From: Xiang Mei @ 2026-04-16  4:30 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Weiming Shi, Florian Westphal, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Phil Sutter, Simon Horman,
	netfilter-devel, coreteam, netdev, linux-kernel
In-Reply-To: <ad_C1f2cW5-kctHi@chamomile>

On Wed, Apr 15, 2026 at 9:55 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> On Fri, Apr 10, 2026 at 06:13:22PM +0800, Weiming Shi wrote:
> > NFT_MSG_GETCHAIN runs as an NFNL_CB_RCU callback, so chain dumps
> > traverse basechain->hook_list under rcu_read_lock() without holding
> > commit_mutex. Meanwhile, nft_delchain_hook() mutates that same live
> > hook_list with plain list_move() and list_splice(), and the commit/abort
> > paths splice hooks back with plain list_splice(). None of these are
> > RCU-safe list operations.
> >
> > A concurrent GETCHAIN dump can observe partially updated list pointers,
> > follow them into stack-local or transaction-private list heads, and
> > crash when container_of() produces a bogus struct nft_hook pointer.
>
> For the record, v1 of proposed series to fix this is here:
>
> https://patchwork.ozlabs.org/project/netfilter-devel/list/?series=499757

Hi Pablo,

Thanks for working on this.
If this addresses the issue I originally reported, could you please
consider adding:
Reported-by: Xiang Mei <xmei5@asu.edu>

Thanks,
Xiang

^ permalink raw reply

* Re: [PATCH net 10/13] i40e: fix napi_enable/disable skipping ringless q_vectors
From: Przemek Kitszel @ 2026-04-16  4:20 UTC (permalink / raw)
  To: Jacob Keller, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: netdev, Aleksandr Loktionov, stable, Sunitha Mekala,
	Maciej Fijalkowski
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-10-852f38e7da39@intel.com>

On 4/15/26 07:48, Jacob Keller wrote:
> From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> 
> After ethtool -L reduces the queue count, i40e_napi_disable_all() sets
> NAPI_STATE_SCHED on all q_vectors, then i40e_vsi_map_rings_to_vectors()
> clears ring pointers on the excess ones.  i40e_napi_enable_all() skips
> those with:
> 
> 	if (q_vector->rx.ring || q_vector->tx.ring)
> 		napi_enable(&q_vector->napi);
> 
> leaving them on dev->napi_list with NAPI_STATE_SCHED permanently set.
> 
> Writing to /sys/class/net/<iface>/threaded calls napi_stop_kthread()
> on every entry in dev->napi_list.  The function loops on msleep(20)
> waiting for NAPI_STATE_SCHED to clear -- which never happens for the
> stale q_vectors.  The task hangs in D state forever; a concurrent write
> deadlocks on dev->lock held by the first.
> 
> Commit 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no
> rings") added the guard to prevent a divide-by-zero in i40e_napi_poll()
> when epoll busy-poll iterated all device NAPIs (4.x era). Since
> 7adc3d57fe2b ("net: Introduce preferred busy-polling"), from v5.11,
> napi_busy_loop() polls by napi_id keyed to the socket, so ringless
> q_vectors are never selected.  i40e_msix_clean_rings() also independently
> avoids scheduling NAPI for them.  The guard is safe to remove.
> 
> Add an early return in i40e_napi_poll() for num_ringpairs == 0 so the
> function is self-defending against a NULL tx.ring dereference at the
> WB_ON_ITR check, should the NAPI ever fire through an unexpected path.
> 
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/intel-wired-lan/20260316133100.6054a11f@kernel.org/

Maciej developed a better fix for the problem, and he explicitly asked
to not include this patch. Please drop it from this series.

Maciej's fix:
https://lore.kernel.org/intel-wired-lan/20260414121405.631092-1-maciej.fijalkowski@intel.com/T/#u

ask for reject:
https://lore.kernel.org/intel-wired-lan/PH0PR11MB75223C8A00C3183C5082A096A0252@PH0PR11MB7522.namprd11.prod.outlook.com/T/#mbac55f7219d7855a2e5d1527904b2da43ad080cb

> Fixes: 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no rings")
> Cc: stable@vger.kernel.org
> Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> Tested-by: Sunitha Mekala <sunithax.d.mekala@intel.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>   drivers/net/ethernet/intel/i40e/i40e_main.c | 28 ++++++++++++++++------------
>   drivers/net/ethernet/intel/i40e/i40e_txrx.c | 10 ++++++++++
>   2 files changed, 26 insertions(+), 12 deletions(-)
> 


^ permalink raw reply

* Re: [PATCH v11 net-next 2/5] psp: add new netlink cmd for dev-assoc and dev-disassoc
From: Wei Wang @ 2026-04-16  3:55 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, Jakub Kicinski, Daniel Zahka, Willem de Bruijn, David Wei,
	Andrew Lunn, David S . Miller, Eric Dumazet, Simon Horman,
	Wei Wang
In-Reply-To: <bc8ef831-a4ea-4d85-ab7c-a287c9b80e61@redhat.com>

On Mon, Apr 13, 2026 at 3:37 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
>
>
> On 4/9/26 1:14 AM, Wei Wang wrote:
> > From: Wei Wang <weibunny@fb.com>
> >
> > The main purpose of this cmd is to be able to associate a
> > non-psp-capable device (e.g. veth or netkit) with a psp device.
> > One use case is if we create a pair of veth/netkit, and assign 1 end
> > inside a netns, while leaving the other end within the default netns,
> > with a real PSP device, e.g. netdevsim or a physical PSP-capable NIC.
> > With this command, we could associate the veth/netkit inside the netns
> > with PSP device, so the virtual device could act as PSP-capable device
> > to initiate PSP connections, and performs PSP encryption/decryption on
> > the real PSP device.
> >
> > Signed-off-by: Wei Wang <weibunny@fb.com>
> > Reviewed-by: Daniel Zahka <daniel.zahka@gmail.com>
> > ---
> >  Documentation/netlink/specs/psp.yaml |  67 +++++-
> >  include/net/psp/types.h              |  15 ++
> >  include/uapi/linux/psp.h             |  13 ++
> >  net/psp/psp-nl-gen.c                 |  32 +++
> >  net/psp/psp-nl-gen.h                 |   2 +
> >  net/psp/psp_main.c                   |  20 ++
> >  net/psp/psp_nl.c                     | 325 ++++++++++++++++++++++++++-
> >  7 files changed, 462 insertions(+), 12 deletions(-)
> >
> > diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml
> > index c54e1202cbe0..3d1b7223e084 100644
> > --- a/Documentation/netlink/specs/psp.yaml
> > +++ b/Documentation/netlink/specs/psp.yaml
> > @@ -13,6 +13,17 @@ definitions:
> >                hdr0-aes-gmac-128, hdr0-aes-gmac-256]
> >
> >  attribute-sets:
> > +  -
> > +    name: assoc-dev-info
> > +    attributes:
> > +      -
> > +        name: ifindex
> > +        doc: ifindex of an associated network device.
> > +        type: u32
> > +      -
> > +        name: nsid
> > +        doc: Network namespace ID of the associated device.
> > +        type: s32
> >    -
> >      name: dev
> >      attributes:
> > @@ -24,7 +35,9 @@ attribute-sets:
> >            min: 1
> >        -
> >          name: ifindex
> > -        doc: ifindex of the main netdevice linked to the PSP device.
> > +        doc: |
> > +          ifindex of the main netdevice linked to the PSP device,
> > +          or the ifindex to associate with the PSP device.
> >          type: u32
> >        -
> >          name: psp-versions-cap
> > @@ -38,6 +51,28 @@ attribute-sets:
> >          type: u32
> >          enum: version
> >          enum-as-flags: true
> > +      -
> > +        name: assoc-list
> > +        doc: List of associated virtual devices.
> > +        type: nest
> > +        nested-attributes: assoc-dev-info
> > +        multi-attr: true
> > +      -
> > +        name: nsid
> > +        doc: |
> > +          Network namespace ID for the device to associate/disassociate.
> > +          Optional for dev-assoc and dev-disassoc; if not present, the
> > +          device is looked up in the caller's network namespace.
> > +        type: s32
> > +      -
> > +        name: by-association
> > +        doc: |
> > +          Flag indicating the PSP device is an associated device from a
> > +          different network namespace.
> > +          Present when in associated namespace, absent when in primary/host
> > +          namespace.
> > +        type: flag
> > +
> >    -
> >      name: assoc
> >      attributes:
> > @@ -170,6 +205,8 @@ operations:
> >              - ifindex
> >              - psp-versions-cap
> >              - psp-versions-ena
> > +            - assoc-list
> > +            - by-association
> >          pre: psp-device-get-locked
> >          post: psp-device-unlock
> >        dump:
> > @@ -279,6 +316,34 @@ operations:
> >          post: psp-device-unlock
> >        dump:
> >          reply: *stats-all
> > +    -
> > +      name: dev-assoc
> > +      doc: Associate a network device with a PSP device.
> > +      attribute-set: dev
> > +      do:
> > +        request:
> > +          attributes:
> > +            - id
> > +            - ifindex
> > +            - nsid
> > +        reply:
> > +          attributes: []
> > +        pre: psp-device-get-locked
> > +        post: psp-device-unlock
> > +    -
> > +      name: dev-disassoc
> > +      doc: Disassociate a network device from a PSP device.
> > +      attribute-set: dev
> > +      do:
> > +        request:
> > +          attributes:
> > +            - id
> > +            - ifindex
> > +            - nsid
> > +        reply:
> > +          attributes: []
> > +        pre: psp-device-get-locked
> > +        post: psp-device-unlock
> >
> >  mcast-groups:
> >    list:
> > diff --git a/include/net/psp/types.h b/include/net/psp/types.h
> > index 25a9096d4e7d..4bd432ed107a 100644
> > --- a/include/net/psp/types.h
> > +++ b/include/net/psp/types.h
> > @@ -5,6 +5,7 @@
> >
> >  #include <linux/mutex.h>
> >  #include <linux/refcount.h>
> > +#include <net/net_trackers.h>
> >
> >  struct netlink_ext_ack;
> >
> > @@ -43,9 +44,22 @@ struct psp_dev_config {
> >       u32 versions;
> >  };
> >
> > +/**
> > + * struct psp_assoc_dev - wrapper for associated net_device
> > + * @dev_list: list node for psp_dev::assoc_dev_list
> > + * @assoc_dev: the associated net_device
> > + * @dev_tracker: tracker for the net_device reference
> > + */
> > +struct psp_assoc_dev {
> > +     struct list_head dev_list;
> > +     struct net_device *assoc_dev;
> > +     netdevice_tracker dev_tracker;
> > +};
> > +
> >  /**
> >   * struct psp_dev - PSP device struct
> >   * @main_netdev: original netdevice of this PSP device
> > + * @assoc_dev_list: list of psp_assoc_dev entries associated with this PSP device
> >   * @ops:     driver callbacks
> >   * @caps:    device capabilities
> >   * @drv_priv:        driver priv pointer
> > @@ -67,6 +81,7 @@ struct psp_dev_config {
> >   */
> >  struct psp_dev {
> >       struct net_device *main_netdev;
> > +     struct list_head assoc_dev_list;
> >
> >       struct psp_dev_ops *ops;
> >       struct psp_dev_caps *caps;
> > diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h
> > index a3a336488dc3..1c8899cd4da5 100644
> > --- a/include/uapi/linux/psp.h
> > +++ b/include/uapi/linux/psp.h
> > @@ -17,11 +17,22 @@ enum psp_version {
> >       PSP_VERSION_HDR0_AES_GMAC_256,
> >  };
> >
> > +enum {
> > +     PSP_A_ASSOC_DEV_INFO_IFINDEX = 1,
> > +     PSP_A_ASSOC_DEV_INFO_NSID,
> > +
> > +     __PSP_A_ASSOC_DEV_INFO_MAX,
> > +     PSP_A_ASSOC_DEV_INFO_MAX = (__PSP_A_ASSOC_DEV_INFO_MAX - 1)
> > +};
> > +
> >  enum {
> >       PSP_A_DEV_ID = 1,
> >       PSP_A_DEV_IFINDEX,
> >       PSP_A_DEV_PSP_VERSIONS_CAP,
> >       PSP_A_DEV_PSP_VERSIONS_ENA,
> > +     PSP_A_DEV_ASSOC_LIST,
> > +     PSP_A_DEV_NSID,
> > +     PSP_A_DEV_BY_ASSOCIATION,
> >
> >       __PSP_A_DEV_MAX,
> >       PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1)
> > @@ -74,6 +85,8 @@ enum {
> >       PSP_CMD_RX_ASSOC,
> >       PSP_CMD_TX_ASSOC,
> >       PSP_CMD_GET_STATS,
> > +     PSP_CMD_DEV_ASSOC,
> > +     PSP_CMD_DEV_DISASSOC,
> >
> >       __PSP_CMD_MAX,
> >       PSP_CMD_MAX = (__PSP_CMD_MAX - 1)
> > diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
> > index 1f5e73e7ccc1..114299c64423 100644
> > --- a/net/psp/psp-nl-gen.c
> > +++ b/net/psp/psp-nl-gen.c
> > @@ -53,6 +53,20 @@ static const struct nla_policy psp_get_stats_nl_policy[PSP_A_STATS_DEV_ID + 1] =
> >       [PSP_A_STATS_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
> >  };
> >
> > +/* PSP_CMD_DEV_ASSOC - do */
> > +static const struct nla_policy psp_dev_assoc_nl_policy[PSP_A_DEV_NSID + 1] = {
> > +     [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
> > +     [PSP_A_DEV_IFINDEX] = { .type = NLA_U32, },
> > +     [PSP_A_DEV_NSID] = { .type = NLA_S32, },
> > +};
> > +
> > +/* PSP_CMD_DEV_DISASSOC - do */
> > +static const struct nla_policy psp_dev_disassoc_nl_policy[PSP_A_DEV_NSID + 1] = {
> > +     [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
> > +     [PSP_A_DEV_IFINDEX] = { .type = NLA_U32, },
> > +     [PSP_A_DEV_NSID] = { .type = NLA_S32, },
> > +};
> > +
> >  /* Ops table for psp */
> >  static const struct genl_split_ops psp_nl_ops[] = {
> >       {
> > @@ -119,6 +133,24 @@ static const struct genl_split_ops psp_nl_ops[] = {
> >               .dumpit = psp_nl_get_stats_dumpit,
> >               .flags  = GENL_CMD_CAP_DUMP,
> >       },
> > +     {
> > +             .cmd            = PSP_CMD_DEV_ASSOC,
> > +             .pre_doit       = psp_device_get_locked,
> > +             .doit           = psp_nl_dev_assoc_doit,
> > +             .post_doit      = psp_device_unlock,
> > +             .policy         = psp_dev_assoc_nl_policy,
> > +             .maxattr        = PSP_A_DEV_NSID,
> > +             .flags          = GENL_CMD_CAP_DO,
> > +     },
> > +     {
> > +             .cmd            = PSP_CMD_DEV_DISASSOC,
> > +             .pre_doit       = psp_device_get_locked,
> > +             .doit           = psp_nl_dev_disassoc_doit,
> > +             .post_doit      = psp_device_unlock,
> > +             .policy         = psp_dev_disassoc_nl_policy,
> > +             .maxattr        = PSP_A_DEV_NSID,
> > +             .flags          = GENL_CMD_CAP_DO,
>
> Sashiko notes that the above allows deleting associations, bypassing
> the netns boundaries. Do you need the ADMIN_PERM flag or explicit checks
> in the doit cb?

I think the concern is that, when this is called from an assoc_dev's
netns, it should not be allowed to delete any assoc_dev from another
assoc_dev's netns. Right?
I will add a check to only allow deletion of an assoc_dev from its own
netns (except main_dev's netns).

>
> > @@ -292,6 +455,145 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info)
> >       return err;
> >  }
> >
> > +int psp_nl_dev_assoc_doit(struct sk_buff *skb, struct genl_info *info)
> > +{
> > +     struct psp_dev *psd = info->user_ptr[0];
> > +     struct psp_assoc_dev *psp_assoc_dev;
> > +     struct net_device *assoc_dev;
> > +     struct sk_buff *rsp;
> > +     u32 assoc_ifindex;
> > +     struct net *net;
> > +     int nsid, err;
> > +
> > +     if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_IFINDEX))
> > +             return -EINVAL;
> > +
> > +     if (info->attrs[PSP_A_DEV_NSID]) {
> > +             nsid = nla_get_s32(info->attrs[PSP_A_DEV_NSID]);
> > +
> > +             net = get_net_ns_by_id(genl_info_net(info), nsid);
> > +             if (!net) {
> > +                     NL_SET_BAD_ATTR(info->extack,
> > +                                     info->attrs[PSP_A_DEV_NSID]);
> > +                     return -EINVAL;
> > +             }
> > +     } else {
> > +             net = get_net(genl_info_net(info));
> > +     }
>
> psp_nl_dev_disassoc_doit() has the same code; perhaps it would be worth
> moving it into a common helper, called via pre_doit()? It should also
> simplify the cleanup paths.
>

Ack.

> > +
> > +     psp_assoc_dev = kzalloc(sizeof(*psp_assoc_dev), GFP_KERNEL);
> > +     if (!psp_assoc_dev) {
> > +             err = -ENOMEM;
> > +             goto alloc_err;
> > +     }
> > +
> > +     assoc_ifindex = nla_get_u32(info->attrs[PSP_A_DEV_IFINDEX]);
> > +     assoc_dev = netdev_get_by_index(net, assoc_ifindex,
> > +                                     &psp_assoc_dev->dev_tracker,
> > +                                     GFP_KERNEL);
> > +     if (!assoc_dev) {
> > +             NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_DEV_IFINDEX]);
> > +             err = -ENODEV;
> > +             goto assoc_dev_err;
> > +     }
> > +
> > +     /* Check if device is already associated with a PSP device */
> > +     if (cmpxchg(&assoc_dev->psp_dev, NULL, RCU_INITIALIZER(psd))) {
> > +             NL_SET_ERR_MSG(info->extack,
> > +                            "Device already associated with a PSP device");
> > +             err = -EBUSY;
> > +             goto cmpxchg_err;
> > +     }
> > +
> > +     psp_assoc_dev->assoc_dev = assoc_dev;
> > +     rsp = psp_nl_reply_new(info);
> > +     if (!rsp) {
> > +             err = -ENOMEM;
> > +             goto rsp_err;
> > +     }
> > +
> > +     list_add_tail(&psp_assoc_dev->dev_list, &psd->assoc_dev_list);
>
> Sashiko says:
>
> ---
> list_add_tail(&psp_assoc_dev->dev_list, &psd->assoc_dev_list);
> There doesn't seem to be a limit on the number of devices that can be
> associated with a single PSP device.
> If a user repeatedly associates devices, could the generated netlink message
> in psp_nl_dev_fill() exceed the maximum allowed size (GENLMSG_DEFAULT_SIZE),
> causing it to fail with -EMSGSIZE and permanently break PSP_CMD_DEV_GET
> and management notifications for the device?
> --

Ack. Will enforce a max allowed number on the assoc_dev_list to fit
into GENLMSG_DEFAULT_SIZE.

>
> /P
>

^ permalink raw reply

* [PATCH net 3/3] octeontx2-af: npc: cn20k: Return error when defrag rollback free fails
From: Ratheesh Kannoth @ 2026-04-16  3:53 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: sgoutham, davem, edumazet, kuba, pabeni, andrew+netdev,
	dan.carpenter, Ratheesh Kannoth, Dan Carpenter
In-Reply-To: <20260416035352.333808-1-rkannoth@marvell.com>

In npc_defrag_alloc_free_slots(), the fail_free_alloc rollback loop frees
previously allocated MCAM entries after a partial allocation failure.  If
__npc_subbank_free() fails, we break out of the loop but rc was still zero
from the successful npc_mcam_idx_2_subbank_idx() lookup, so the function
incorrectly returned success.  Set rc to -EFAULT so the failure is visible
to callers.

CC: Dan Carpenter <error27@gmail.com>
Link: https://lore.kernel.org/netdev/adjNJEpILRZATB2N@stanley.mountain/
Fixes: 645c6e3c1999 ("octeontx2-af: npc: cn20k: virtual index support")
Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
index 2fcd0ee2b1e1..df192729ac1d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
@@ -3541,6 +3541,7 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu,
 			dev_err(rvu->dev,
 				"%s: Error to free mcam idx=%u\n",
 				__func__, save[i]);
+			rc = -EFAULT;
 			break;
 		}
 	}
-- 
2.43.0


^ permalink raw reply related

* [PATCH net 0/3] octeontx2-af: Fix smatch reported errors
From: Ratheesh Kannoth @ 2026-04-16  3:53 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: sgoutham, davem, edumazet, kuba, pabeni, andrew+netdev,
	dan.carpenter, Ratheesh Kannoth

This series tightens error handling in the Marvell OcteonTX2 AF CN20K NPC
layer: MCAM paths now respect npc_mcam_idx_2_key_type() failures, debugfs
setup follows the usual "optional, do not fail init" convention, and
defrag rollback reports failure when freeing slots after a partial
allocation does not complete cleanly.

Patch 1 returns early when resolving an MCAM index to a key type fails,
so enable, configure, copy, and read paths do not program or interpret
hardware with a stale or undefined key type.

Patch 2 removes dentry and allocation-failure checks around
debugfs_create_file() in npc_cn20k_debugfs_init().  Debugfs entries are
diagnostic; callers should not abort probe or subsystem init when they
cannot be created (including when debugfs is disabled).

Patch 3 sets the error code when __npc_subbank_free() fails inside the
rollback loop in npc_defrag_alloc_free_slots(), so the function does not
return success after a failed cleanup.

Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>

Ratheesh Kannoth (3):
  octeontx2-af: npc: cn20k: Handle npc_mcam_idx_2_key_type() failures
  octeontx2-af: npc: cn20k: Drop debugfs_create_file() error checks in
    init
  octeontx2-af: npc: cn20k: Return error when defrag rollback free fails

---
 .../marvell/octeontx2/af/cn20k/debugfs.c      | 33 ++++++-------------
 .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 21 +++++++++---
 2 files changed, 26 insertions(+), 28 deletions(-)

--
2.43.0

^ permalink raw reply

* [PATCH net 2/3] octeontx2-af: npc: cn20k: Drop debugfs_create_file() error checks in init
From: Ratheesh Kannoth @ 2026-04-16  3:53 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: sgoutham, davem, edumazet, kuba, pabeni, andrew+netdev,
	dan.carpenter, Ratheesh Kannoth, Dan Carpenter
In-Reply-To: <20260416035352.333808-1-rkannoth@marvell.com>

debugfs is not intended to be checked for allocation failures the way
other kernel APIs are: callers should not fail probe or subsystem init
because a debugfs node could not be created, including when debugfs is
disabled in Kconfig.  Replacing NULL checks with IS_ERR() checks is
similarly wrong for optional debugfs.

Remove dentry checks and -EFAULT returns from npc_cn20k_debugfs_init().
https://staticthinking.wordpress.com/2023/07/24/debugfs-functions-are-not-supposed-to-be-checked/

CC: Dan Carpenter <error27@gmail.com>
Link: https://lore.kernel.org/netdev/adjNGPWKMOk3KgWL@stanley.mountain/
Fixes: 528530dff56b ("octeontx2-af: npc: cn20k: add debugfs support")
Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
---
 .../marvell/octeontx2/af/cn20k/debugfs.c      | 33 ++++++-------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c
index 3debf2fae1a4..6f13296303cb 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c
@@ -249,34 +249,21 @@ DEFINE_SHOW_ATTRIBUTE(npc_defrag);
 int npc_cn20k_debugfs_init(struct rvu *rvu)
 {
 	struct npc_priv_t *npc_priv = npc_priv_get();
-	struct dentry *npc_dentry;
 
-	npc_dentry = debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc,
-					 npc_priv, &npc_mcam_layout_fops);
+	debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc,
+			    npc_priv, &npc_mcam_layout_fops);
 
-	if (!npc_dentry)
-		return -EFAULT;
+	debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc,
+			    rvu, &npc_mcam_default_fops);
 
-	npc_dentry = debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc,
-					 rvu, &npc_mcam_default_fops);
+	debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc,
+			    npc_priv, &npc_vidx2idx_map_fops);
 
-	if (!npc_dentry)
-		return -EFAULT;
+	debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc,
+			    npc_priv, &npc_idx2vidx_map_fops);
 
-	npc_dentry = debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc,
-					 npc_priv, &npc_vidx2idx_map_fops);
-	if (!npc_dentry)
-		return -EFAULT;
-
-	npc_dentry = debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc,
-					 npc_priv, &npc_idx2vidx_map_fops);
-	if (!npc_dentry)
-		return -EFAULT;
-
-	npc_dentry = debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc,
-					 npc_priv, &npc_defrag_fops);
-	if (!npc_dentry)
-		return -EFAULT;
+	debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc,
+			    npc_priv, &npc_defrag_fops);
 
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH net 1/3] octeontx2-af: npc: cn20k: Handle npc_mcam_idx_2_key_type() failures
From: Ratheesh Kannoth @ 2026-04-16  3:53 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: sgoutham, davem, edumazet, kuba, pabeni, andrew+netdev,
	dan.carpenter, Ratheesh Kannoth, Dan Carpenter
In-Reply-To: <20260416035352.333808-1-rkannoth@marvell.com>

npc_mcam_idx_2_key_type() can fail; ignoring its return value left
kw_type unchecked in MCAM enable, configure, copy, and read paths.
Return early on error so we do not program or interpret MCAM state
with an invalid key type.

CC: Dan Carpenter <error27@gmail.com>
Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs")
Link: https://lore.kernel.org/netdev/adiQJvuKlEhq2ILx@stanley.mountain/
Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
---
 .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
index 7291fdb89b03..2fcd0ee2b1e1 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
@@ -808,7 +808,9 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr,
 	u64 cfg, hw_prio;
 	u8 kw_type;
 
-	npc_mcam_idx_2_key_type(rvu, index, &kw_type);
+	if (npc_mcam_idx_2_key_type(rvu, index, &kw_type))
+		return;
+
 	if (kw_type == NPC_MCAM_KEY_X2) {
 		cfg = rvu_read64(rvu, blkaddr,
 				 NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx,
@@ -1052,10 +1054,12 @@ void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index,
 	int kw = 0;
 	u8 kw_type;
 
+	if (npc_mcam_idx_2_key_type(rvu, index, &kw_type))
+		return;
+
 	/* Disable before mcam entry update */
 	npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, false);
 
-	npc_mcam_idx_2_key_type(rvu, index, &kw_type);
 	/* CAM1 takes the comparison value and
 	 * CAM0 specifies match for a bit in key being '0' or '1' or 'dontcare'.
 	 * CAM1<n> = 0 & CAM0<n> = 1 => match if key<n> = 0
@@ -1132,8 +1136,13 @@ void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest)
 
 	dbank = npc_get_bank(mcam, dest);
 	sbank = npc_get_bank(mcam, src);
-	npc_mcam_idx_2_key_type(rvu, src, &src_kwtype);
-	npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype);
+
+	if (npc_mcam_idx_2_key_type(rvu, src, &src_kwtype))
+		return;
+
+	if (npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype))
+		return;
+
 	if (src_kwtype != dest_kwtype)
 		return;
 
@@ -1188,7 +1197,8 @@ void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index,
 	int kw = 0, bank;
 	u8 kw_type;
 
-	npc_mcam_idx_2_key_type(rvu, index, &kw_type);
+	if (npc_mcam_idx_2_key_type(rvu, index, &kw_type))
+		return;
 
 	bank = npc_get_bank(mcam, index);
 	index &= (mcam->banksize - 1);
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net v2 2/2] bnge: remove unsupported backing store type
From: Przemek Kitszel @ 2026-04-16  3:54 UTC (permalink / raw)
  To: Vikas Gupta, dharmender.garg
  Cc: netdev, davem, edumazet, kuba, pabeni, andrew+netdev, horms,
	linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, rahul-rg.gupta
In-Reply-To: <20260415151621.1104956-3-vikas.gupta@broadcom.com>

On 4/15/26 17:16, Vikas Gupta wrote:
> The backing store type, BNGE_CTX_MRAV, is not applicable in Thor Ultra
> devices. Remove it from the backing store configuration, as the firmware

I guess the removed code was needed for previous devices, what is the
impact for them?

> will not populate entities in this backing store type, due to which the
> driver load fails.
> 
> Fixes: 29c5b358f385 ("bng_en: Add backing store support")
> Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
> Reviewed-by: Dharmender Garg <dharmender.garg@broadcom.com>
> ---
>   drivers/net/ethernet/broadcom/bnge/bnge_rmem.c | 16 ----------------
>   1 file changed, 16 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
> index 94f15e08a88c..b066ee887a09 100644
> --- a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
> +++ b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
> @@ -324,7 +324,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
>   	u32 l2_qps, qp1_qps, max_qps;
>   	u32 ena, entries_sp, entries;
>   	u32 srqs, max_srqs, min;
> -	u32 num_mr, num_ah;
>   	u32 extra_srqs = 0;
>   	u32 extra_qps = 0;
>   	u32 fast_qpmd_qps;
> @@ -390,21 +389,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
>   	if (!bnge_is_roce_en(bd))
>   		goto skip_rdma;
>   
> -	ctxm = &ctx->ctx_arr[BNGE_CTX_MRAV];
> -	/* 128K extra is needed to accommodate static AH context
> -	 * allocation by f/w.
> -	 */
> -	num_mr = min_t(u32, ctxm->max_entries / 2, 1024 * 256);
> -	num_ah = min_t(u32, num_mr, 1024 * 128);
> -	ctxm->split_entry_cnt = BNGE_CTX_MRAV_AV_SPLIT_ENTRY + 1;
> -	if (!ctxm->mrav_av_entries || ctxm->mrav_av_entries > num_ah)
> -		ctxm->mrav_av_entries = num_ah;
> -
> -	rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, num_mr + num_ah, 2);
> -	if (rc)
> -		return rc;
> -	ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV;
> -
>   	ctxm = &ctx->ctx_arr[BNGE_CTX_TIM];
>   	rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, l2_qps + qp1_qps + extra_qps, 1);
>   	if (rc)


^ permalink raw reply

* [PATCH net-next v1 2/2] net: add DEBUG_NET_WARN_ON_ONCE for negative transport offset
From: Jiayuan Chen @ 2026-04-16  3:46 UTC (permalink / raw)
  To: netdev
  Cc: Jiayuan Chen, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, David Ahern, Pravin B Shelar,
	Tom Herbert, linux-kernel
In-Reply-To: <20260416034610.8873-1-jiayuan.chen@linux.dev>

skb_transport_offset() can silently return a negative
value when the transport_header becomes stale after tunnel
decapsulation. A negative offset is never valid — it means
transport_header points before skb->data, which will cause unsigned
wraparound in any caller that assigns the result to an unsigned
variable.

Add a DEBUG_NET_WARN_ON_ONCE(off < 0) check so that such cases are
caught early in CONFIG_DEBUG_NET=y builds (e.g., syzkaller, kernel test
bots) with a full stack trace pointing to the caller, rather than
silently propagating a bogus offset until something crashes downstream.

Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2bcf78a4de7b..0b1aeacc25f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3234,7 +3234,10 @@ static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
 
 static inline int skb_transport_offset(const struct sk_buff *skb)
 {
-	return skb_transport_header(skb) - skb->data;
+	int off = skb_transport_header(skb) - skb->data;
+
+	DEBUG_NET_WARN_ON_ONCE(off < 0);
+	return off;
 }
 
 static inline u32 skb_network_header_len(const struct sk_buff *skb)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net v1 1/2] net: tunnel: fix stale transport header after GRE/TEB decap
From: Jiayuan Chen @ 2026-04-16  3:46 UTC (permalink / raw)
  To: netdev
  Cc: Jiayuan Chen, syzbot+83181a31faf9455499c5, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	David Ahern, Pravin B Shelar, Tom Herbert, linux-kernel

syzbot reported a BUG.

I found that after GRE decapsulation in gretap/ip6gretap paths, the
transport_header becomes stale with a negative offset. The sequence is:

1. Before decap, transport_header points to the outer L4 (GRE) header.
2. __iptunnel_pull_header() calls skb_pull_rcsum() to advance skb->data
   past the GRE header, but does not update transport_header.
3. For TEB (gretap/ip6gretap), eth_type_trans() in ip_tunnel_rcv() /
   __ip6_tnl_rcv() further pulls ETH_HLEN (14 bytes) from skb->data.

After these two pulls, skb->data has moved forward while transport_header
still points to the old (now behind skb->data) position, resulting in a
negative skb_transport_offset(): typically -4 after GRE pull alone, or
-18 after GRE + inner Ethernet pull.

In the normal case where the inner frame is a recognizable protocol
(e.g., IPv4/TCP), inet_gro_receive() in net/ipv4/af_inet.c corrects the
transport_header via skb_set_transport_header() during GRO processing.
However, if the inner frame cannot be parsed (e.g., eth_type_trans()
classifies it as ETH_P_802_2 due to a zero/invalid inner Ethernet
header), no GRO callback resets the transport_header, and the stale
offset persists into __netif_receive_skb_core().

When this stale offset is combined with contradictory GSO metadata (e.g.,
SKB_GSO_TCPV4 injected via virtio_net_hdr from a tun device),
qdisc_pkt_len_segs_init() trusts the negative offset: the unsigned
wraparound makes pskb_may_pull() effectively a no-op, and __tcp_hdrlen()
then reads from an invalid memory location, causing a use-after-free.

Fix this by introducing iptunnel_rebuild_transport_header() which resets
and re-probes the transport header after tunnel decapsulation. If the
transport header cannot be rebuilt and the skb carries GSO metadata, the
inconsistent GSO fields are cleared to prevent downstream consumers from
trusting stale offsets.

reproducer: https://gist.github.com/mrpre/5ba943fd86367af748b70de99263da4b

Link: https://syzkaller.appspot.com/bug?extid=83181a31faf9455499c5
Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.")
Fixes: 0d3c703a9d17 ("ipv6: Cleanup IPv6 tunnel receive path")
Reported-by: syzbot+83181a31faf9455499c5@syzkaller.appspotmail.com
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
 include/net/ip_tunnels.h | 14 ++++++++++++++
 net/ipv4/ip_tunnel.c     |  2 ++
 net/ipv6/ip6_tunnel.c    |  2 ++
 3 files changed, 18 insertions(+)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index d708b66e55cd..f160d82e6196 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -662,6 +662,20 @@ static inline int iptunnel_pull_offloads(struct sk_buff *skb)
 	return 0;
 }
 
+static inline void iptunnel_rebuild_transport_header(struct sk_buff *skb)
+{
+	skb->transport_header = (typeof(skb->transport_header))~0U;
+	skb_probe_transport_header(skb);
+
+	if (!skb_transport_header_was_set(skb) && skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+		shinfo->gso_type = 0;
+		shinfo->gso_size = 0;
+		shinfo->gso_segs = 0;
+	}
+}
+
 static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
 {
 	if (pkt_len > 0) {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 50d0f5fe4e4c..c46be68cfafa 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -445,6 +445,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	if (tun_dst)
 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
 
+	iptunnel_rebuild_transport_header(skb);
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 46bc06506470..f95348cf3c77 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -879,6 +879,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
 	if (tun_dst)
 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
 
+	iptunnel_rebuild_transport_header(skb);
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net] sctp: fix OOB write to userspace in sctp_getsockopt_peer_auth_chunks
From: Michael Bommarito @ 2026-04-16  3:19 UTC (permalink / raw)
  To: linux-sctp, Marcelo Ricardo Leitner, Xin Long
  Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, netdev, linux-kernel, stable

sctp_getsockopt_peer_auth_chunks() checks that the caller's optval
buffer is large enough for the peer AUTH chunk list with

    if (len < num_chunks)
            return -EINVAL;

but then writes num_chunks bytes to p->gauth_chunks, which lives
at offset offsetof(struct sctp_authchunks, gauth_chunks) == 8
inside optval.  The check is missing the sizeof(struct
sctp_authchunks) = 8-byte header.  When the caller supplies
len == num_chunks (for any num_chunks > 0) the test passes but
copy_to_user() writes sizeof(struct sctp_authchunks) = 8 bytes
past the declared buffer.

The sibling function sctp_getsockopt_local_auth_chunks() at the
next line already has the correct check:

    if (len < sizeof(struct sctp_authchunks) + num_chunks)
            return -EINVAL;

Align the peer variant with its sibling.

Reproducer confirms on v7.0-13-generic: an unprivileged userspace
caller that opens a loopback SCTP association with AUTH enabled,
queries num_chunks with a short optval, then issues the real
getsockopt with len == num_chunks and sentinel bytes painted past
the buffer observes those sentinel bytes overwritten with the
peer's AUTH chunk type.  The bytes written are under the peer's
control but land in the caller's own userspace; this is not a
kernel memory corruption, but it is a kernel-side contract
violation that can silently corrupt adjacent userspace data.

Fixes: 65b07e5d0d09 ("[SCTP]: API updates to suport SCTP-AUTH extensions.")
Cc: stable@vger.kernel.org
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
---
 net/sctp/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 05fb00c9c335..f5d442753dc9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7033,7 +7033,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
 
 	/* See if the user provided enough room for all the data */
 	num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr);
-	if (len < num_chunks)
+	if (len < sizeof(struct sctp_authchunks) + num_chunks)
 		return -EINVAL;
 
 	if (copy_to_user(to, ch->chunks, num_chunks))
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v5] openvswitch: cap upcall PID array size and pre-size vport replies
From: Weiming Shi @ 2026-04-16  2:46 UTC (permalink / raw)
  To: Aaron Conole, Eelco Chaudron, Ilya Maximets, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Pravin B Shelar, Thomas Graf, Alex Wang, netdev,
	dev, Xiang Mei, Weiming Shi

The vport netlink reply helpers allocate a fixed-size skb with
nlmsg_new(NLMSG_DEFAULT_SIZE, ...) but serialize the full upcall PID
array via ovs_vport_get_upcall_portids().  Since
ovs_vport_set_upcall_portids() accepts any non-zero multiple of
sizeof(u32) with no upper bound, a CAP_NET_ADMIN user can install a PID
array large enough to overflow the reply buffer, causing nla_put() to
fail with -EMSGSIZE and hitting BUG_ON(err < 0).  On systems with
unprivileged user namespaces enabled (e.g., Ubuntu default), this is
reachable via unshare -Urn since OVS vport mutation operations use
GENL_UNS_ADMIN_PERM.

 kernel BUG at net/openvswitch/datapath.c:2414!
 Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI
 CPU: 1 UID: 0 PID: 65 Comm: poc Not tainted 7.0.0-rc7-00195-geb216e422044 #1
 RIP: 0010:ovs_vport_cmd_set+0x34c/0x400
 Call Trace:
  <TASK>
  genl_family_rcv_msg_doit (net/netlink/genetlink.c:1116)
  genl_rcv_msg (net/netlink/genetlink.c:1194)
  netlink_rcv_skb (net/netlink/af_netlink.c:2550)
  genl_rcv (net/netlink/genetlink.c:1219)
  netlink_unicast (net/netlink/af_netlink.c:1344)
  netlink_sendmsg (net/netlink/af_netlink.c:1894)
  __sys_sendto (net/socket.c:2206)
  __x64_sys_sendto (net/socket.c:2209)
  do_syscall_64 (arch/x86/entry/syscall_64.c:63)
  entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
  </TASK>
 Kernel panic - not syncing: Fatal exception

Reject attempts to set more PIDs than nr_cpu_ids in
ovs_vport_set_upcall_portids(), and pre-compute the worst-case reply
size in ovs_vport_cmd_msg_size() based on that bound, similar to the
existing ovs_dp_cmd_msg_size().  nr_cpu_ids matches the cap already
used by the per-CPU dispatch configuration on the datapath side
(ovs_dp_cmd_fill_info() serialises at most nr_cpu_ids PIDs), so the
two sides stay consistent.

Fixes: 5cd667b0a456 ("openvswitch: Allow each vport to have an array of 'port_id's.")
Reported-by: Xiang Mei <xmei5@asu.edu>
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
---
v5 (per Ilya):
- Add blank lines before multi-line comment blocks in
  ovs_vport_cmd_msg_size() for readability.
- Drop parenthetical from the OVS_VPORT_ATTR_UPCALL_PID comment.
- Add lore links for previous versions.
v4: https://lore.kernel.org/netdev/20260415125121.110874-2-bestswngs@gmail.com
- Use nr_cpu_ids instead of num_possible_cpus() for consistency with
  the per-CPU dispatch on the datapath side.
- Annotate ovs_vport_cmd_msg_size() per-attribute; split nested sums.
v3: https://lore.kernel.org/netdev/20260413035514.2113886-3-bestswngs@gmail.com
- Cap at num_possible_cpus(); add ovs_vport_cmd_msg_size(); keep
  BUG_ON(); fix Fixes tag.
v2: https://lore.kernel.org/netdev/20260411141448.1479933-3-bestswngs@gmail.com
- Dynamically size reply skb; drop WARN_ON_ONCE, return plain errors.
v1: https://lore.kernel.org/netdev/20260411055915.1224902-2-bestswngs@gmail.com
---
 net/openvswitch/datapath.c | 35 +++++++++++++++++++++++++++++++++--
 net/openvswitch/vport.c    |  3 +++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e209099218b4..bbbde50fc649 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2184,9 +2184,40 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 	return err;
 }
 
+static size_t ovs_vport_cmd_msg_size(void)
+{
+	size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
+
+	msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_PORT_NO */
+	msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_TYPE */
+	msgsize += nla_total_size(IFNAMSIZ);    /* OVS_VPORT_ATTR_NAME */
+	msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_IFINDEX */
+	msgsize += nla_total_size(sizeof(s32)); /* OVS_VPORT_ATTR_NETNSID */
+
+	/* OVS_VPORT_ATTR_STATS */
+	msgsize += nla_total_size_64bit(sizeof(struct ovs_vport_stats));
+
+	/* OVS_VPORT_ATTR_UPCALL_STATS(OVS_VPORT_UPCALL_ATTR_SUCCESS +
+	 *                             OVS_VPORT_UPCALL_ATTR_FAIL)
+	 */
+	msgsize += nla_total_size(nla_total_size_64bit(sizeof(u64)) +
+				  nla_total_size_64bit(sizeof(u64)));
+
+	/* OVS_VPORT_ATTR_UPCALL_PID */
+	msgsize += nla_total_size(nr_cpu_ids * sizeof(u32));
+
+	/* OVS_VPORT_ATTR_OPTIONS(OVS_TUNNEL_ATTR_DST_PORT +
+	 *                        OVS_TUNNEL_ATTR_EXTENSION(OVS_VXLAN_EXT_GBP))
+	 */
+	msgsize += nla_total_size(nla_total_size(sizeof(u16)) +
+				  nla_total_size(nla_total_size(0)));
+
+	return msgsize;
+}
+
 static struct sk_buff *ovs_vport_cmd_alloc_info(void)
 {
-	return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	return genlmsg_new(ovs_vport_cmd_msg_size(), GFP_KERNEL);
 }
 
 /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
@@ -2196,7 +2227,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
 	struct sk_buff *skb;
 	int retval;
 
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	skb = ovs_vport_cmd_alloc_info();
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 23f629e94a36..56b2e2d1a749 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -406,6 +406,9 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids)
 	if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
 		return -EINVAL;
 
+	if (nla_len(ids) / sizeof(u32) > nr_cpu_ids)
+		return -EINVAL;
+
 	old = ovsl_dereference(vport->upcall_portids);
 
 	vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids),
-- 
2.43.0


^ permalink raw reply related

* [PATCH iwl-next v2 3/3] igc: add support for forcing link speed without autonegotiation
From: KhaiWenTan @ 2026-04-16  1:55 UTC (permalink / raw)
  To: anthony.l.nguyen, przemyslaw.kitszel, andrew+netdev, davem,
	edumazet, kuba, pabeni
  Cc: intel-wired-lan, netdev, linux-kernel, faizal.abdul.rahim,
	hong.aun.looi, khai.wen.tan, Faizal Rahim, Looi, KhaiWenTan
In-Reply-To: <20260416015520.6090-1-khai.wen.tan@linux.intel.com>

From: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>

Allow users to force 10/100 Mb/s link speed and duplex via ethtool
when autonegotiation is disabled. Previously, the driver rejected
these requests with "Force mode currently not supported.".

Forcing at 1000 Mb/s and 2500 Mb/s is not supported.

Reviewed-by: Looi, Hong Aun <hong.aun.looi@intel.com>
Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
Signed-off-by: KhaiWenTan <khai.wen.tan@linux.intel.com>
---
 drivers/net/ethernet/intel/igc/igc_base.c    |  35 ++++-
 drivers/net/ethernet/intel/igc/igc_defines.h |   9 +-
 drivers/net/ethernet/intel/igc/igc_ethtool.c | 131 +++++++++++++------
 drivers/net/ethernet/intel/igc/igc_hw.h      |   9 ++
 drivers/net/ethernet/intel/igc/igc_mac.c     |  10 ++
 drivers/net/ethernet/intel/igc/igc_main.c    |   2 +-
 drivers/net/ethernet/intel/igc/igc_phy.c     |  65 ++++++++-
 drivers/net/ethernet/intel/igc/igc_phy.h     |   1 +
 8 files changed, 211 insertions(+), 51 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c
index 1613b562d17c..ab9120a3127f 100644
--- a/drivers/net/ethernet/intel/igc/igc_base.c
+++ b/drivers/net/ethernet/intel/igc/igc_base.c
@@ -114,11 +114,35 @@ static s32 igc_setup_copper_link_base(struct igc_hw *hw)
 	u32 ctrl;

 	ctrl = rd32(IGC_CTRL);
-	ctrl |= IGC_CTRL_SLU;
-	ctrl &= ~(IGC_CTRL_FRCSPD | IGC_CTRL_FRCDPX);
-	wr32(IGC_CTRL, ctrl);
-
-	ret_val = igc_setup_copper_link(hw);
+	ctrl &= ~(IGC_CTRL_FRCSPD | IGC_CTRL_FRCDPX |
+		  IGC_CTRL_SPEED_MASK | IGC_CTRL_FD);
+
+	if (hw->mac.autoneg_enabled) {
+		ctrl |= IGC_CTRL_SLU;
+		wr32(IGC_CTRL, ctrl);
+		ret_val = igc_setup_copper_link(hw);
+	} else {
+		ctrl |= IGC_CTRL_SLU | IGC_CTRL_FRCSPD | IGC_CTRL_FRCDPX;
+
+		switch (hw->mac.forced_speed_duplex) {
+		case IGC_FORCED_10H:
+			ctrl |= IGC_CTRL_SPEED_10;
+			break;
+		case IGC_FORCED_10F:
+			ctrl |= IGC_CTRL_SPEED_10 | IGC_CTRL_FD;
+			break;
+		case IGC_FORCED_100H:
+			ctrl |= IGC_CTRL_SPEED_100;
+			break;
+		case IGC_FORCED_100F:
+			ctrl |= IGC_CTRL_SPEED_100 | IGC_CTRL_FD;
+			break;
+		default:
+			return -IGC_ERR_CONFIG;
+		}
+		wr32(IGC_CTRL, ctrl);
+		ret_val = igc_setup_copper_link(hw);
+	}

 	return ret_val;
 }
@@ -443,6 +467,7 @@ static const struct igc_phy_operations igc_phy_ops_base = {
 	.reset			= igc_phy_hw_reset,
 	.read_reg		= igc_read_phy_reg_gpy,
 	.write_reg		= igc_write_phy_reg_gpy,
+	.force_speed_duplex	= igc_force_speed_duplex,
 };

 const struct igc_info igc_base_info = {
diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
index 9482ab11f050..3f504751c2d9 100644
--- a/drivers/net/ethernet/intel/igc/igc_defines.h
+++ b/drivers/net/ethernet/intel/igc/igc_defines.h
@@ -129,10 +129,13 @@
 #define IGC_ERR_SWFW_SYNC		13

 /* Device Control */
+#define IGC_CTRL_FD		BIT(0)  /* Full Duplex */
 #define IGC_CTRL_RST		0x04000000  /* Global reset */
-
 #define IGC_CTRL_PHY_RST	0x80000000  /* PHY Reset */
 #define IGC_CTRL_SLU		0x00000040  /* Set link up (Force Link) */
+#define IGC_CTRL_SPEED_MASK	GENMASK(10, 8)
+#define IGC_CTRL_SPEED_10	FIELD_PREP(IGC_CTRL_SPEED_MASK, 0)
+#define IGC_CTRL_SPEED_100	FIELD_PREP(IGC_CTRL_SPEED_MASK, 1)
 #define IGC_CTRL_FRCSPD		0x00000800  /* Force Speed */
 #define IGC_CTRL_FRCDPX		0x00001000  /* Force Duplex */
 #define IGC_CTRL_VME		0x40000000  /* IEEE VLAN mode enable */
@@ -673,6 +676,10 @@
 #define IGC_GEN_POLL_TIMEOUT	1920

 /* PHY Control Register */
+#define MII_CR_SPEED_MASK	(BIT(6) | BIT(13))
+#define MII_CR_SPEED_10		0x0000	/* SSM=0, SSL=0: 10 Mb/s */
+#define MII_CR_SPEED_100	BIT(13)	/* SSM=0, SSL=1: 100 Mb/s */
+#define MII_CR_DUPLEX_EN	BIT(8)	/* 0 = Half Duplex, 1 = Full Duplex */
 #define MII_CR_RESTART_AUTO_NEG	0x0200  /* Restart auto negotiation */
 #define MII_CR_POWER_DOWN	0x0800  /* Power down */
 #define MII_CR_AUTO_NEG_EN	0x1000  /* Auto Neg Enable */
diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index cfcbf2fdad6e..5bd37d1be168 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -1914,44 +1914,58 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
 	ethtool_link_ksettings_add_link_mode(cmd, supported, TP);
 	ethtool_link_ksettings_add_link_mode(cmd, advertising, TP);

-	/* advertising link modes */
-	if (hw->phy.autoneg_advertised & ADVERTISE_10_HALF)
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, 10baseT_Half);
-	if (hw->phy.autoneg_advertised & ADVERTISE_10_FULL)
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, 10baseT_Full);
-	if (hw->phy.autoneg_advertised & ADVERTISE_100_HALF)
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, 100baseT_Half);
-	if (hw->phy.autoneg_advertised & ADVERTISE_100_FULL)
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, 100baseT_Full);
-	if (hw->phy.autoneg_advertised & ADVERTISE_1000_FULL)
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, 1000baseT_Full);
-	if (hw->phy.autoneg_advertised & ADVERTISE_2500_FULL)
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, 2500baseT_Full);
-
 	/* set autoneg settings */
 	ethtool_link_ksettings_add_link_mode(cmd, supported, Autoneg);
-	ethtool_link_ksettings_add_link_mode(cmd, advertising, Autoneg);
+	if (hw->mac.autoneg_enabled) {
+		ethtool_link_ksettings_add_link_mode(cmd, advertising, Autoneg);
+		cmd->base.autoneg = AUTONEG_ENABLE;
+
+		/* advertising link modes only apply when autoneg is on */
+		if (hw->phy.autoneg_advertised & ADVERTISE_10_HALF)
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     10baseT_Half);
+		if (hw->phy.autoneg_advertised & ADVERTISE_10_FULL)
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     10baseT_Full);
+		if (hw->phy.autoneg_advertised & ADVERTISE_100_HALF)
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     100baseT_Half);
+		if (hw->phy.autoneg_advertised & ADVERTISE_100_FULL)
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     100baseT_Full);
+		if (hw->phy.autoneg_advertised & ADVERTISE_1000_FULL)
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     1000baseT_Full);
+		if (hw->phy.autoneg_advertised & ADVERTISE_2500_FULL)
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     2500baseT_Full);
+
+		/* Set pause flow control advertising */
+		switch (hw->fc.requested_mode) {
+		case igc_fc_full:
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     Pause);
+			break;
+		case igc_fc_rx_pause:
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     Pause);
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     Asym_Pause);
+			break;
+		case igc_fc_tx_pause:
+			ethtool_link_ksettings_add_link_mode(cmd, advertising,
+							     Asym_Pause);
+			break;
+		default:
+			break;
+		}
+	} else {
+		cmd->base.autoneg = AUTONEG_DISABLE;
+	}

-	/* Set pause flow control settings */
+	/* Pause is always supported */
 	ethtool_link_ksettings_add_link_mode(cmd, supported, Pause);

-	switch (hw->fc.requested_mode) {
-	case igc_fc_full:
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause);
-		break;
-	case igc_fc_rx_pause:
-		ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause);
-		ethtool_link_ksettings_add_link_mode(cmd, advertising,
-						     Asym_Pause);
-		break;
-	case igc_fc_tx_pause:
-		ethtool_link_ksettings_add_link_mode(cmd, advertising,
-						     Asym_Pause);
-		break;
-	default:
-		break;
-	}
-
 	status = pm_runtime_suspended(&adapter->pdev->dev) ?
 		 0 : rd32(IGC_STATUS);

@@ -1983,7 +1997,6 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
 		cmd->base.duplex = DUPLEX_UNKNOWN;
 	}
 	cmd->base.speed = speed;
-	cmd->base.autoneg = AUTONEG_ENABLE;

 	/* MDI-X => 2; MDI =>1; Invalid =>0 */
 	if (hw->phy.media_type == igc_media_type_copper)
@@ -2000,6 +2013,41 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
 	return 0;
 }

+/**
+ * igc_handle_autoneg_disabled - Configure forced speed/duplex settings
+ * @adapter: private driver structure
+ * @speed: requested speed (must be SPEED_10 or SPEED_100)
+ * @duplex: requested duplex
+ *
+ * Records forced speed/duplex when autoneg is disabled.
+ * Caller must validate speed before calling this function.
+ */
+static void igc_handle_autoneg_disabled(struct igc_adapter *adapter, u32 speed,
+					u8 duplex)
+{
+	struct igc_mac_info *mac = &adapter->hw.mac;
+
+	switch (speed) {
+	case SPEED_10:
+		mac->forced_speed_duplex = (duplex == DUPLEX_FULL) ?
+			IGC_FORCED_10F : IGC_FORCED_10H;
+		break;
+	case SPEED_100:
+		mac->forced_speed_duplex = (duplex == DUPLEX_FULL) ?
+			IGC_FORCED_100F : IGC_FORCED_100H;
+		break;
+	default:
+		WARN_ONCE(1, "Unsupported speed %u\n", speed);
+		return;
+	}
+
+	mac->autoneg_enabled = false;
+
+	/* Half-duplex cannot support flow control per IEEE 802.3 */
+	if (duplex == DUPLEX_HALF)
+		adapter->hw.fc.requested_mode = igc_fc_none;
+}
+
 /**
  * igc_handle_autoneg_enabled - Configure autonegotiation advertisement
  * @adapter: private driver structure
@@ -2038,6 +2086,7 @@ static void igc_handle_autoneg_enabled(struct igc_adapter *adapter,
 						  10baseT_Half))
 		advertised |= ADVERTISE_10_HALF;

+	hw->mac.autoneg_enabled = true;
 	hw->phy.autoneg_advertised = advertised;
 	if (adapter->fc_autoneg)
 		hw->fc.requested_mode = igc_fc_default;
@@ -2071,14 +2120,20 @@ igc_ethtool_set_link_ksettings(struct net_device *netdev,
 		}
 	}

+	if (cmd->base.autoneg == AUTONEG_DISABLE &&
+	    cmd->base.speed != SPEED_10 && cmd->base.speed != SPEED_100) {
+		netdev_info(dev, "Unsupported speed for forced link\n");
+		return -EINVAL;
+	}
+
 	while (test_and_set_bit(__IGC_RESETTING, &adapter->state))
 		usleep_range(1000, 2000);

-	if (cmd->base.autoneg == AUTONEG_ENABLE) {
+	if (cmd->base.autoneg == AUTONEG_ENABLE)
 		igc_handle_autoneg_enabled(adapter, cmd);
-	} else {
-		netdev_info(dev, "Force mode currently not supported\n");
-	}
+	else
+		igc_handle_autoneg_disabled(adapter, cmd->base.speed,
+					    cmd->base.duplex);

 	/* MDI-X => 2; MDI => 1; Auto => 3 */
 	if (cmd->base.eth_tp_mdix_ctrl) {
diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h
index 86ab8f566f44..62aaee55668a 100644
--- a/drivers/net/ethernet/intel/igc/igc_hw.h
+++ b/drivers/net/ethernet/intel/igc/igc_hw.h
@@ -73,6 +73,13 @@ struct igc_info {

 extern const struct igc_info igc_base_info;

+enum igc_forced_speed_duplex {
+	IGC_FORCED_10H,
+	IGC_FORCED_10F,
+	IGC_FORCED_100H,
+	IGC_FORCED_100F,
+};
+
 struct igc_mac_info {
 	struct igc_mac_operations ops;

@@ -93,6 +100,8 @@ struct igc_mac_info {
 	bool arc_subsystem_valid;

 	bool get_link_status;
+	bool autoneg_enabled;
+	enum igc_forced_speed_duplex forced_speed_duplex;
 };

 struct igc_nvm_operations {
diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c
index 142beb9ae557..8bbb6d5581c7 100644
--- a/drivers/net/ethernet/intel/igc/igc_mac.c
+++ b/drivers/net/ethernet/intel/igc/igc_mac.c
@@ -446,6 +446,16 @@ s32 igc_config_fc_after_link_up(struct igc_hw *hw)
 	u16 speed, duplex;
 	s32 ret_val = 0;

+	/* When autoneg is disabled, force the MAC flow control settings
+	 * to match the "fc" parameter.
+	 */
+	if (!hw->mac.autoneg_enabled) {
+		ret_val = igc_force_mac_fc(hw);
+		if (ret_val)
+			hw_dbg("Error forcing flow control settings\n");
+		goto out;
+	}
+
 	/* In auto-neg, we need to check and see if Auto-Neg has completed,
 	 * and if so, how the PHY and link partner has flow control
 	 * configured.
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 72bc5128d8b8..437e1d1ef1e4 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -7298,7 +7298,7 @@ static int igc_probe(struct pci_dev *pdev,
 	/* Initialize link properties that are user-changeable */
 	adapter->fc_autoneg = true;
 	hw->phy.autoneg_advertised = 0xaf;
-
+	hw->mac.autoneg_enabled = true;
 	hw->fc.requested_mode = igc_fc_default;
 	hw->fc.current_mode = igc_fc_default;

diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c
index 6c4d204aecfa..4cf737fb3b21 100644
--- a/drivers/net/ethernet/intel/igc/igc_phy.c
+++ b/drivers/net/ethernet/intel/igc/igc_phy.c
@@ -494,12 +494,20 @@ s32 igc_setup_copper_link(struct igc_hw *hw)
 	s32 ret_val = 0;
 	bool link;

-	/* Setup autoneg and flow control advertisement and perform
-	 * autonegotiation.
-	 */
-	ret_val = igc_copper_link_autoneg(hw);
-	if (ret_val)
-		goto out;
+	if (hw->mac.autoneg_enabled) {
+		/* Setup autoneg and flow control advertisement and perform
+		 * autonegotiation.
+		 */
+		ret_val = igc_copper_link_autoneg(hw);
+		if (ret_val)
+			goto out;
+	} else {
+		ret_val = hw->phy.ops.force_speed_duplex(hw);
+		if (ret_val) {
+			hw_dbg("Error Forcing Speed/Duplex\n");
+			goto out;
+		}
+	}

 	/* Check link status. Wait up to 100 microseconds for link to become
 	 * valid.
@@ -778,3 +786,48 @@ u16 igc_read_phy_fw_version(struct igc_hw *hw)

 	return gphy_version;
 }
+
+/**
+ * igc_force_speed_duplex - Force PHY speed and duplex settings
+ * @hw: pointer to the HW structure
+ *
+ * Programs the GPY PHY control register to disable autonegotiation
+ * and force the speed/duplex indicated by hw->mac.forced_speed_duplex.
+ */
+s32 igc_force_speed_duplex(struct igc_hw *hw)
+{
+	struct igc_phy_info *phy = &hw->phy;
+	u16 phy_ctrl;
+	s32 ret_val;
+
+	ret_val = phy->ops.read_reg(hw, PHY_CONTROL, &phy_ctrl);
+	if (ret_val)
+		return ret_val;
+
+	phy_ctrl &= ~(MII_CR_SPEED_MASK | MII_CR_DUPLEX_EN |
+		      MII_CR_AUTO_NEG_EN | MII_CR_RESTART_AUTO_NEG);
+
+	switch (hw->mac.forced_speed_duplex) {
+	case IGC_FORCED_10H:
+		phy_ctrl |= MII_CR_SPEED_10;
+		break;
+	case IGC_FORCED_10F:
+		phy_ctrl |= MII_CR_SPEED_10 | MII_CR_DUPLEX_EN;
+		break;
+	case IGC_FORCED_100H:
+		phy_ctrl |= MII_CR_SPEED_100;
+		break;
+	case IGC_FORCED_100F:
+		phy_ctrl |= MII_CR_SPEED_100 | MII_CR_DUPLEX_EN;
+		break;
+	default:
+		return -IGC_ERR_CONFIG;
+	}
+
+	ret_val = phy->ops.write_reg(hw, PHY_CONTROL, phy_ctrl);
+	if (ret_val)
+		return ret_val;
+
+	hw->mac.get_link_status = true;
+	return 0;
+}
diff --git a/drivers/net/ethernet/intel/igc/igc_phy.h b/drivers/net/ethernet/intel/igc/igc_phy.h
index 832a7e359f18..d37a89174826 100644
--- a/drivers/net/ethernet/intel/igc/igc_phy.h
+++ b/drivers/net/ethernet/intel/igc/igc_phy.h
@@ -18,5 +18,6 @@ void igc_power_down_phy_copper(struct igc_hw *hw);
 s32 igc_write_phy_reg_gpy(struct igc_hw *hw, u32 offset, u16 data);
 s32 igc_read_phy_reg_gpy(struct igc_hw *hw, u32 offset, u16 *data);
 u16 igc_read_phy_fw_version(struct igc_hw *hw);
+s32 igc_force_speed_duplex(struct igc_hw *hw);

 #endif
--
2.43.0


^ permalink raw reply related


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.