Re: [PATCH RFC net-next v2 3/8] cadence: macb: Add page pool support handle multi-descriptor frame rx

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Paolo Valerio <pvalerio@redhat.com>
To: "Théo Lebrun" <theo.lebrun@bootlin.com>, netdev@vger.kernel.org
Cc: "Nicolas Ferre" <nicolas.ferre@microchip.com>,
	"Claudiu Beznea" <claudiu.beznea@tuxon.dev>,
	"Andrew Lunn" <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>,
	"Eric Dumazet" <edumazet@google.com>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Paolo Abeni" <pabeni@redhat.com>,
	"Lorenzo Bianconi" <lorenzo@kernel.org>,
	"Théo Lebrun" <theo.lebrun@bootlin.com>
Subject: Re: [PATCH RFC net-next v2 3/8] cadence: macb: Add page pool support handle multi-descriptor frame rx
Date: Mon, 12 Jan 2026 15:16:24 +0100	[thread overview]
Message-ID: <87jyxmor0n.fsf@redhat.com> (raw)
In-Reply-To: <DFJBNAK0H1KV.1HVW5GR7V4Q2B@bootlin.com>

On 08 Jan 2026 at 04:43:43 PM, Théo Lebrun <theo.lebrun@bootlin.com> wrote:

> On Sun Dec 21, 2025 at 12:51 AM CET, Paolo Valerio wrote:
>> Use the page pool allocator for the data buffers and enable skb recycling
>> support, instead of relying on netdev_alloc_skb allocating the entire skb
>> during the refill.
>>
>> The patch also add support for receiving network frames that span multiple
>> DMA descriptors in the Cadence MACB/GEM Ethernet driver.
>>
>> The patch removes the requirement that limited frame reception to
>> a single descriptor (RX_SOF && RX_EOF), also avoiding potential
>> contiguous multi-page allocation for large frames.
>>
>> Signed-off-by: Paolo Valerio <pvalerio@redhat.com>
>> ---
>>  drivers/net/ethernet/cadence/Kconfig     |   1 +
>>  drivers/net/ethernet/cadence/macb.h      |   5 +
>>  drivers/net/ethernet/cadence/macb_main.c | 345 +++++++++++++++--------
>>  3 files changed, 235 insertions(+), 116 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig
>> index 5b2a461dfd28..ae500f717433 100644
>> --- a/drivers/net/ethernet/cadence/Kconfig
>> +++ b/drivers/net/ethernet/cadence/Kconfig
>> @@ -25,6 +25,7 @@ config MACB
>>  	depends on PTP_1588_CLOCK_OPTIONAL
>>  	select PHYLINK
>>  	select CRC32
>> +	select PAGE_POOL
>>  	help
>>  	  The Cadence MACB ethernet interface is found on many Atmel AT32 and
>>  	  AT91 parts.  This driver also supports the Cadence GEM (Gigabit
>> diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
>> index 3b184e9ac771..45c04157f153 100644
>> --- a/drivers/net/ethernet/cadence/macb.h
>> +++ b/drivers/net/ethernet/cadence/macb.h
>> @@ -14,6 +14,8 @@
>>  #include <linux/interrupt.h>
>>  #include <linux/phy/phy.h>
>>  #include <linux/workqueue.h>
>> +#include <net/page_pool/helpers.h>
>> +#include <net/xdp.h>
>
> nit: `#include <net/xdp.h>` is not needed yet.
>

ack

>>  
>>  #define MACB_GREGS_NBR 16
>>  #define MACB_GREGS_VERSION 2
>> @@ -1266,6 +1268,8 @@ struct macb_queue {
>>  	void			*rx_buffers;
>>  	struct napi_struct	napi_rx;
>>  	struct queue_stats stats;
>> +	struct page_pool	*page_pool;
>> +	struct sk_buff		*skb;
>>  };
>>  
>>  struct ethtool_rx_fs_item {
>> @@ -1289,6 +1293,7 @@ struct macb {
>>  	struct macb_dma_desc	*rx_ring_tieoff;
>>  	dma_addr_t		rx_ring_tieoff_dma;
>>  	size_t			rx_buffer_size;
>> +	size_t			rx_headroom;
>>  
>>  	unsigned int		rx_ring_size;
>>  	unsigned int		tx_ring_size;
>> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
>> index b4e2444b2e95..9e1efc1f56d8 100644
>> --- a/drivers/net/ethernet/cadence/macb_main.c
>> +++ b/drivers/net/ethernet/cadence/macb_main.c
>> @@ -1249,14 +1249,22 @@ static int macb_tx_complete(struct macb_queue *queue, int budget)
>>  	return packets;
>>  }
>>  
>> -static int gem_rx_refill(struct macb_queue *queue)
>> +static int gem_total_rx_buffer_size(struct macb *bp)
>> +{
>> +	return SKB_HEAD_ALIGN(bp->rx_buffer_size + bp->rx_headroom);
>> +}
>
> nit: something closer to a buffer size, either `unsigned int` or
> `size_t`, sounds better than an int return type.
>

will do

>> +
>> +static int gem_rx_refill(struct macb_queue *queue, bool napi)
>>  {
>>  	unsigned int		entry;
>> -	struct sk_buff		*skb;
>>  	dma_addr_t		paddr;
>> +	void			*data;
>>  	struct macb *bp = queue->bp;
>>  	struct macb_dma_desc *desc;
>> +	struct page *page;
>> +	gfp_t gfp_alloc;
>>  	int err = 0;
>> +	int offset;
>>  
>>  	while (CIRC_SPACE(queue->rx_prepared_head, queue->rx_tail,
>>  			bp->rx_ring_size) > 0) {
>> @@ -1268,25 +1276,20 @@ static int gem_rx_refill(struct macb_queue *queue)
>>  		desc = macb_rx_desc(queue, entry);
>>  
>>  		if (!queue->rx_buff[entry]) {
>> -			/* allocate sk_buff for this free entry in ring */
>> -			skb = netdev_alloc_skb(bp->dev, bp->rx_buffer_size);
>> -			if (unlikely(!skb)) {
>> +			gfp_alloc = napi ? GFP_ATOMIC : GFP_KERNEL;
>> +			page = page_pool_alloc_frag(queue->page_pool, &offset,
>> +						    gem_total_rx_buffer_size(bp),
>> +						    gfp_alloc | __GFP_NOWARN);
>> +			if (!page) {
>>  				netdev_err(bp->dev,
>> -					   "Unable to allocate sk_buff\n");
>> +					   "Unable to allocate page\n");
>>  				err = -ENOMEM;
>>  				break;
>>  			}
>>  
>> -			/* now fill corresponding descriptor entry */
>> -			paddr = dma_map_single(&bp->pdev->dev, skb->data,
>> -					       bp->rx_buffer_size,
>> -					       DMA_FROM_DEVICE);
>> -			if (dma_mapping_error(&bp->pdev->dev, paddr)) {
>> -				dev_kfree_skb(skb);
>> -				break;
>> -			}
>> -
>> -			queue->rx_buff[entry] = skb;
>> +			paddr = page_pool_get_dma_addr(page) + XDP_PACKET_HEADROOM + offset;
>> +			data = page_address(page) + offset;
>> +			queue->rx_buff[entry] = data;
>>  
>>  			if (entry == bp->rx_ring_size - 1)
>>  				paddr |= MACB_BIT(RX_WRAP);
>> @@ -1296,20 +1299,6 @@ static int gem_rx_refill(struct macb_queue *queue)
>>  			 */
>>  			dma_wmb();
>>  			macb_set_addr(bp, desc, paddr);
>> -
>> -			/* Properly align Ethernet header.
>> -			 *
>> -			 * Hardware can add dummy bytes if asked using the RBOF
>> -			 * field inside the NCFGR register. That feature isn't
>> -			 * available if hardware is RSC capable.
>> -			 *
>> -			 * We cannot fallback to doing the 2-byte shift before
>> -			 * DMA mapping because the address field does not allow
>> -			 * setting the low 2/3 bits.
>> -			 * It is 3 bits if HW_DMA_CAP_PTP, else 2 bits.
>> -			 */
>> -			if (!(bp->caps & MACB_CAPS_RSC))
>> -				skb_reserve(skb, NET_IP_ALIGN);
>>  		} else {
>>  			desc->ctrl = 0;
>>  			dma_wmb();
>> @@ -1353,14 +1342,19 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>  	struct macb *bp = queue->bp;
>>  	unsigned int		len;
>>  	unsigned int		entry;
>> -	struct sk_buff		*skb;
>>  	struct macb_dma_desc	*desc;
>> +	int			data_len;
>>  	int			count = 0;
>> +	void			*buff_head;
>> +	struct skb_shared_info	*shinfo;
>> +	struct page		*page;
>> +	int			nr_frags;
>
> nit: you add 5 new stack variables, maybe you could apply reverse xmas
> tree while at it. You do it for the loop body in [5/8].
>

sure

>> +
>>  
>>  	while (count < budget) {
>>  		u32 ctrl;
>>  		dma_addr_t addr;
>> -		bool rxused;
>> +		bool rxused, first_frame;
>>  
>>  		entry = macb_rx_ring_wrap(bp, queue->rx_tail);
>>  		desc = macb_rx_desc(queue, entry);
>> @@ -1374,6 +1368,12 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>  		if (!rxused)
>>  			break;
>>  
>> +		if (!(bp->caps & MACB_CAPS_RSC))
>> +			addr += NET_IP_ALIGN;
>> +
>> +		dma_sync_single_for_cpu(&bp->pdev->dev,
>> +					addr, bp->rx_buffer_size,
>> +					page_pool_get_dma_dir(queue->page_pool));
>>  		/* Ensure ctrl is at least as up-to-date as rxused */
>>  		dma_rmb();
>>  
>> @@ -1382,58 +1382,118 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>  		queue->rx_tail++;
>>  		count++;
>>  
>> -		if (!(ctrl & MACB_BIT(RX_SOF) && ctrl & MACB_BIT(RX_EOF))) {
>> -			netdev_err(bp->dev,
>> -				   "not whole frame pointed by descriptor\n");
>> -			bp->dev->stats.rx_dropped++;
>> -			queue->stats.rx_dropped++;
>> -			break;
>> -		}
>> -		skb = queue->rx_buff[entry];
>> -		if (unlikely(!skb)) {
>> +		buff_head = queue->rx_buff[entry];
>> +		if (unlikely(!buff_head)) {
>>  			netdev_err(bp->dev,
>>  				   "inconsistent Rx descriptor chain\n");
>>  			bp->dev->stats.rx_dropped++;
>>  			queue->stats.rx_dropped++;
>>  			break;
>>  		}
>> -		/* now everything is ready for receiving packet */
>> -		queue->rx_buff[entry] = NULL;
>> +
>> +		first_frame = ctrl & MACB_BIT(RX_SOF);
>>  		len = ctrl & bp->rx_frm_len_mask;
>>  
>> -		netdev_vdbg(bp->dev, "gem_rx %u (len %u)\n", entry, len);
>> +		if (len) {
>> +			data_len = len;
>> +			if (!first_frame)
>> +				data_len -= queue->skb->len;
>> +		} else {
>> +			data_len = bp->rx_buffer_size;
>> +		}
>
> Why deal with the `!len` case? How can it occur? User guide doesn't hint
> that. It would mean we would grab uninitialised bytes as we assume len
> is the max buffer size.
>

Good point. After taking a second look, !len may not be the most reliable
way to check this.
From the datasheet, status signals are only valid (with some exceptions)
when MACB_BIT(RX_EOF) is set. As a side effect, len is always zero on my
hw for frames without the EOF bit, but it's probably better to just rely
on MACB_BIT(RX_EOF) instead of reading something that may end up being
unreliable.

>> +
>> +		if (first_frame) {
>> +			queue->skb = napi_build_skb(buff_head, gem_total_rx_buffer_size(bp));
>> +			if (unlikely(!queue->skb)) {
>> +				netdev_err(bp->dev,
>> +					   "Unable to allocate sk_buff\n");
>> +				goto free_frags;
>> +			}
>> +
>> +			/* Properly align Ethernet header.
>> +			 *
>> +			 * Hardware can add dummy bytes if asked using the RBOF
>> +			 * field inside the NCFGR register. That feature isn't
>> +			 * available if hardware is RSC capable.
>> +			 *
>> +			 * We cannot fallback to doing the 2-byte shift before
>> +			 * DMA mapping because the address field does not allow
>> +			 * setting the low 2/3 bits.
>> +			 * It is 3 bits if HW_DMA_CAP_PTP, else 2 bits.
>> +			 */
>> +			skb_reserve(queue->skb, bp->rx_headroom);
>> +			skb_mark_for_recycle(queue->skb);
>> +			skb_put(queue->skb, data_len);
>> +			queue->skb->protocol = eth_type_trans(queue->skb, bp->dev);
>> +
>> +			skb_checksum_none_assert(queue->skb);
>> +			if (bp->dev->features & NETIF_F_RXCSUM &&
>> +			    !(bp->dev->flags & IFF_PROMISC) &&
>> +			    GEM_BFEXT(RX_CSUM, ctrl) & GEM_RX_CSUM_CHECKED_MASK)
>> +				queue->skb->ip_summed = CHECKSUM_UNNECESSARY;
>> +		} else {
>> +			if (!queue->skb) {
>> +				netdev_err(bp->dev,
>> +					   "Received non-starting frame while expecting it\n");
>> +				goto free_frags;
>> +			}
>> +
>> +			shinfo = skb_shinfo(queue->skb);
>> +			page = virt_to_head_page(buff_head);
>> +			nr_frags = shinfo->nr_frags;
>> +
>> +			if (nr_frags >= ARRAY_SIZE(shinfo->frags))
>> +				goto free_frags;
>>  
>> -		skb_put(skb, len);
>> -		dma_unmap_single(&bp->pdev->dev, addr,
>> -				 bp->rx_buffer_size, DMA_FROM_DEVICE);
>> +			skb_add_rx_frag(queue->skb, nr_frags, page,
>> +					buff_head - page_address(page) + bp->rx_headroom,
>> +					data_len, gem_total_rx_buffer_size(bp));
>> +		}
>> +
>> +		/* now everything is ready for receiving packet */
>> +		queue->rx_buff[entry] = NULL;
>>  
>> -		skb->protocol = eth_type_trans(skb, bp->dev);
>> -		skb_checksum_none_assert(skb);
>> -		if (bp->dev->features & NETIF_F_RXCSUM &&
>> -		    !(bp->dev->flags & IFF_PROMISC) &&
>> -		    GEM_BFEXT(RX_CSUM, ctrl) & GEM_RX_CSUM_CHECKED_MASK)
>> -			skb->ip_summed = CHECKSUM_UNNECESSARY;
>> +		netdev_vdbg(bp->dev, "%s %u (len %u)\n", __func__, entry, data_len);
>>  
>> -		bp->dev->stats.rx_packets++;
>> -		queue->stats.rx_packets++;
>> -		bp->dev->stats.rx_bytes += skb->len;
>> -		queue->stats.rx_bytes += skb->len;
>> +		if (ctrl & MACB_BIT(RX_EOF)) {
>> +			bp->dev->stats.rx_packets++;
>> +			queue->stats.rx_packets++;
>> +			bp->dev->stats.rx_bytes += queue->skb->len;
>> +			queue->stats.rx_bytes += queue->skb->len;
>>  
>> -		gem_ptp_do_rxstamp(bp, skb, desc);
>> +			gem_ptp_do_rxstamp(bp, queue->skb, desc);
>>  
>>  #if defined(DEBUG) && defined(VERBOSE_DEBUG)
>> -		netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
>> -			    skb->len, skb->csum);
>> -		print_hex_dump(KERN_DEBUG, " mac: ", DUMP_PREFIX_ADDRESS, 16, 1,
>> -			       skb_mac_header(skb), 16, true);
>> -		print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_ADDRESS, 16, 1,
>> -			       skb->data, 32, true);
>> +			netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
>> +				    queue->skb->len, queue->skb->csum);
>> +			print_hex_dump(KERN_DEBUG, " mac: ", DUMP_PREFIX_ADDRESS, 16, 1,
>> +				       skb_mac_header(queue->skb), 16, true);
>> +			print_hex_dump(KERN_DEBUG, "buff_head: ", DUMP_PREFIX_ADDRESS, 16, 1,
>> +				       queue->skb->buff_head, 32, true);
>>  #endif
>
> nit: while you are at it, maybe replace with print_hex_dump_debug()?
>

sure

>>  
>> -		napi_gro_receive(napi, skb);
>> +			napi_gro_receive(napi, queue->skb);
>> +			queue->skb = NULL;
>> +		}
>> +
>> +		continue;
>> +
>> +free_frags:
>> +		if (queue->skb) {
>> +			dev_kfree_skb(queue->skb);
>> +			queue->skb = NULL;
>> +		} else {
>> +			page_pool_put_full_page(queue->page_pool,
>> +						virt_to_head_page(buff_head),
>> +						false);
>> +		}
>> +
>> +		bp->dev->stats.rx_dropped++;
>> +		queue->stats.rx_dropped++;
>> +		queue->rx_buff[entry] = NULL;
>>  	}
>>  
>> -	gem_rx_refill(queue);
>> +	gem_rx_refill(queue, true);
>>  
>>  	return count;
>>  }
>> @@ -2367,12 +2427,25 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  	return ret;
>>  }
>>  
>> -static void macb_init_rx_buffer_size(struct macb *bp, size_t size)
>> +static void macb_init_rx_buffer_size(struct macb *bp, unsigned int mtu)
>>  {
>> +	int overhead;
>
> nit: Maybe `unsigned int` or `size_t` rather than `int`?
>

ack

>> +	size_t size;
>> +
>>  	if (!macb_is_gem(bp)) {
>>  		bp->rx_buffer_size = MACB_RX_BUFFER_SIZE;
>>  	} else {
>> -		bp->rx_buffer_size = size;
>> +		size = mtu + ETH_HLEN + ETH_FCS_LEN;
>> +		if (!(bp->caps & MACB_CAPS_RSC))
>> +			size += NET_IP_ALIGN;
>
> NET_IP_ALIGN looks like it is accounted for twice, once in
> bp->rx_headroom and once in bp->rx_buffer_size. This gets fixed in
> [5/8] where gem_max_rx_data_size() gets introduced.
>

ah, right

>> +
>> +		bp->rx_buffer_size = SKB_DATA_ALIGN(size);
>> +		if (gem_total_rx_buffer_size(bp) > PAGE_SIZE) {
>> +			overhead = bp->rx_headroom +
>> +				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> +			bp->rx_buffer_size = rounddown(PAGE_SIZE - overhead,
>> +						       RX_BUFFER_MULTIPLE);
>> +		}
>
> I've seen your comment in [0/8]. Do you have any advice on how to test
> this clamping? All I can think of is to either configure a massive MTU
> or, more easily, cheat with the headroom.
>

I normally test the set with 4k PAGE_SIZE and, as you said, setting the
mtu to something bigger than that. This is still possible with 8k pages
(given .jumbo_max_len = 10240).


> Also, should we warn? It means MTU-sized packets will be received in
> fragments. It will work but is probably unexpected by users and a
> slowdown reason that users might want to know about.
>

I'm not sure about the warning as I don't see this as a user level detail.
For debugging purpose, I guess we should be fine the last print out (even
better once extended with your suggestion). Of course, feel free to disagree.

> --
>
> nit: while in macb_init_rx_buffer_size(), can you tweak the debug line
> from mtu & rx_buffer_size to also have rx_headroom and total? So that
> we have everything available to understand what is going on buffer size
> wise. Something like:
>
> -       netdev_dbg(bp->dev, "mtu [%u] rx_buffer_size [%zu]\n",
> -                  bp->dev->mtu, bp->rx_buffer_size);
> +       netdev_info(bp->dev, "mtu [%u] rx_buffer_size [%zu] rx_headroom [%zu] total [%u]\n",
> +                   bp->dev->mtu, bp->rx_buffer_size, bp->rx_headroom,
> +                   gem_total_rx_buffer_size(bp));
>
> Thanks,
>
> --
> Théo Lebrun, Bootlin
> Embedded Linux and Kernel engineering
> https://bootlin.com

next prev parent reply	other threads:[~2026-01-12 14:16 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-20 23:51 [PATCH RFC net-next v2 0/8] net: macb: Add XDP support and page pool integration Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 1/8] net: macb: move Rx buffers alloc from link up to open Paolo Valerio
2026-01-08 15:24   ` Théo Lebrun
2026-01-12 14:14     ` Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 2/8] net: macb: rename rx_skbuff into rx_buff Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 3/8] cadence: macb: Add page pool support handle multi-descriptor frame rx Paolo Valerio
2026-01-08 15:43   ` Théo Lebrun
2026-01-12 14:16     ` Paolo Valerio [this message]
2026-01-12 18:43       ` Paolo Valerio
2026-01-13 10:35         ` Théo Lebrun
2026-01-13 19:30           ` Paolo Valerio
2026-01-13 10:43       ` Théo Lebrun
2025-12-20 23:51 ` [PATCH RFC net-next v2 4/8] cadence: macb: use the current queue number for stats Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 5/8] cadence: macb: add XDP support for gem Paolo Valerio
2026-01-08 15:49   ` Théo Lebrun
2026-01-12 14:17     ` Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 6/8] cadence: macb: make macb_tx_skb generic Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 7/8] cadence: macb: make tx path skb agnostic Paolo Valerio
2025-12-20 23:51 ` [PATCH RFC net-next v2 8/8] cadence: macb: introduce xmit support Paolo Valerio
2026-01-08 15:54   ` Théo Lebrun
2026-01-12 14:17     ` Paolo Valerio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87jyxmor0n.fsf@redhat.com \
    --to=pvalerio@redhat.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=claudiu.beznea@tuxon.dev \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=lorenzo@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=nicolas.ferre@microchip.com \
    --cc=pabeni@redhat.com \
    --cc=theo.lebrun@bootlin.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.