public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next,v4] net: mana: Force full-page RX buffers via ethtool private flag
@ 2026-03-30 21:01 Dipayaan Roy
  2026-03-30 22:47 ` Jakub Kicinski
  0 siblings, 1 reply; 4+ messages in thread
From: Dipayaan Roy @ 2026-03-30 21:01 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, leitao, kees, dipayanroy

On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
allocation in the RX refill path can cause 15-20% throughput
regression under high connection counts (>16 TCP streams).

Add an ethtool private flag "full-page-rx" that allows the user to
force one RX buffer per page, bypassing the page_pool fragment path.
This restores line-rate (180+ Gbps) performance on affected platforms.

Usage:
  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag must be explicitly
enabled by the user or a udev rule.
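
For reference, a udev rule that enables the flag automatically when the
device appears might look like the following sketch. The match values
(driver name, rule filename) are illustrative and would need adjusting
for the actual environment:

```
# /etc/udev/rules.d/99-mana-full-page-rx.rules (illustrative sketch)
# Run ethtool once when a MANA netdev is added; $name expands to the
# interface name of the matched device.
ACTION=="add", SUBSYSTEM=="net", DRIVERS=="mana", \
    RUN+="/usr/sbin/ethtool --set-priv-flags $name full-page-rx on"
```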

The existing single-buffer-per-page logic for XDP and jumbo frames is
consolidated into a new helper mana_use_single_rxbuf_per_page().

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
Changes in v4:
  - Dropped the SMBIOS string parsing and added an ethtool priv flag
    to reconfigure the queues with full-page RX buffers.
Changes in v3:
  - changed u8* to char*
Changes in v2:
  - separate reading string index and the string, remove inline.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 159 +++++++++++++++---
 include/net/mana/mana.h                       |   8 +
 3 files changed, 159 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 49c65cc1697c..59a1626c2be1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
 	return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+	/* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
+	 * in the RX refill path (~2kB buffer) can cause significant throughput
+	 * regression under high connection counts. Allow user to force one RX
+	 * buffer per page via ethtool private flag to bypass the fragment
+	 * path.
+	 */
+	if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
+		return true;
+
+	/* For xdp and jumbo frames make sure only one packet fits per page. */
+	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+		return true;
+
+	return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 			       int mtu, u32 *datasize, u32 *alloc_size,
@@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 	/* Calculate datasize first (consistent across all cases) */
 	*datasize = mtu + ETH_HLEN;
 
-	/* For xdp and jumbo frames make sure only one packet fits per page */
-	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
 		if (mana_xdp_get(apc)) {
 			*headroom = XDP_PACKET_HEADROOM;
 			*alloc_size = PAGE_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 6a4b42fe0944..9f7393b71a34 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -133,58 +133,91 @@ static const struct mana_stats_desc mana_phy_stats[] = {
 	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
 };
 
+static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
+	[MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 
-	if (stringset != ETH_SS_STATS)
+	switch (stringset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(mana_eth_stats) +
+		       ARRAY_SIZE(mana_phy_stats) +
+		       ARRAY_SIZE(mana_hc_stats)  +
+		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	case ETH_SS_PRIV_FLAGS:
+		return MANA_PRIV_FLAG_MAX;
+	default:
 		return -EINVAL;
+	}
+}
+
+static void mana_get_strings_priv_flags(u8 **data)
+{
+	int i;
 
-	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
-			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
+		ethtool_puts(data, mana_priv_flags[i]);
 }
 
-static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 {
-	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	int i, j;
 
-	if (stringset != ETH_SS_STATS)
-		return;
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
-		ethtool_puts(&data, mana_eth_stats[i].name);
+		ethtool_puts(data, mana_eth_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
-		ethtool_puts(&data, mana_hc_stats[i].name);
+		ethtool_puts(data, mana_hc_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
-		ethtool_puts(&data, mana_phy_stats[i].name);
+		ethtool_puts(data, mana_phy_stats[i].name);
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "rx_%d_packets", i);
-		ethtool_sprintf(&data, "rx_%d_bytes", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
-		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+		ethtool_sprintf(data, "rx_%d_packets", i);
+		ethtool_sprintf(data, "rx_%d_bytes", i);
+		ethtool_sprintf(data, "rx_%d_xdp_drop", i);
+		ethtool_sprintf(data, "rx_%d_xdp_tx", i);
+		ethtool_sprintf(data, "rx_%d_xdp_redirect", i);
+		ethtool_sprintf(data, "rx_%d_pkt_len0_err", i);
 		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
-			ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
+			ethtool_sprintf(data,
+					"rx_%d_coalesced_cqe_%d",
+					i,
+					j + 2);
 	}
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "tx_%d_packets", i);
-		ethtool_sprintf(&data, "tx_%d_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_xdp_xmit", i);
-		ethtool_sprintf(&data, "tx_%d_tso_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_csum_partial", i);
-		ethtool_sprintf(&data, "tx_%d_mana_map_err", i);
+		ethtool_sprintf(data, "tx_%d_packets", i);
+		ethtool_sprintf(data, "tx_%d_bytes", i);
+		ethtool_sprintf(data, "tx_%d_xdp_xmit", i);
+		ethtool_sprintf(data, "tx_%d_tso_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_bytes", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i);
+		ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_csum_partial", i);
+		ethtool_sprintf(data, "tx_%d_mana_map_err", i);
+	}
+}
+
+static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	switch (stringset) {
+	case ETH_SS_PRIV_FLAGS:
+		mana_get_strings_priv_flags(&data);
+		break;
+
+	case ETH_SS_STATS:
+		mana_get_strings_stats(apc, &data);
+		break;
 	}
 }
 
@@ -573,6 +606,74 @@ static int mana_get_link_ksettings(struct net_device *ndev,
 	return 0;
 }
 
+static u32 mana_get_priv_flags(struct net_device *ndev)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	return apc->priv_flags;
+}
+
+static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+	u32 changed = apc->priv_flags ^ priv_flags;
+	u32 old_priv_flags = apc->priv_flags;
+	bool schedule_port_reset = false;
+	int err = 0;
+
+	if (!changed)
+		return 0;
+
+	/* Reject unknown bits */
+	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
+		return -EINVAL;
+
+	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
+		apc->priv_flags = priv_flags;
+
+		if (!apc->port_is_up) {
+			/* Port is down, flag updated to apply on next up
+			 * so just return.
+			 */
+			return 0;
+		}
+
+		/* Pre-allocate buffers to prevent failure in mana_attach
+		 * later
+		 */
+		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+		if (err) {
+			netdev_err(ndev,
+				   "Insufficient memory for new allocations\n");
+			apc->priv_flags = old_priv_flags;
+			return err;
+		}
+
+		err = mana_detach(ndev, false);
+		if (err) {
+			netdev_err(ndev, "mana_detach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+			goto out;
+		}
+
+		err = mana_attach(ndev);
+		if (err) {
+			netdev_err(ndev, "mana_attach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+			schedule_port_reset = true;
+		}
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(apc);
+
+	if (err && schedule_port_reset)
+		queue_work(apc->ac->per_port_queue_reset_wq,
+			   &apc->queue_reset_work);
+
+	return err;
+}
+
 const struct ethtool_ops mana_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
 	.get_ethtool_stats	= mana_get_ethtool_stats,
@@ -591,4 +692,6 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.set_ringparam          = mana_set_ringparam,
 	.get_link_ksettings	= mana_get_link_ksettings,
 	.get_link		= ethtool_op_get_link,
+	.get_priv_flags		= mana_get_priv_flags,
+	.set_priv_flags		= mana_set_priv_flags,
 };
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 3336688fed5e..fd87e3d6c1f4 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -30,6 +30,12 @@ enum TRI_STATE {
 	TRI_STATE_TRUE = 1
 };
 
+/* MANA ethtool private flag bit positions */
+enum mana_priv_flag_bits {
+	MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
+	MANA_PRIV_FLAG_MAX,
+};
+
 /* Number of entries for hardware indirection table must be in power of 2 */
 #define MANA_INDIRECT_TABLE_MAX_SIZE 512
 #define MANA_INDIRECT_TABLE_DEF_SIZE 64
@@ -531,6 +537,8 @@ struct mana_port_context {
 	u32 rxbpre_headroom;
 	u32 rxbpre_frag_count;
 
+	u32 priv_flags;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH net-next,v4] net: mana: Force full-page RX buffers via ethtool private flag
  2026-03-30 21:01 [PATCH net-next,v4] net: mana: Force full-page RX buffers via ethtool private flag Dipayaan Roy
@ 2026-03-30 22:47 ` Jakub Kicinski
  2026-04-05  3:14   ` Dipayaan Roy
  0 siblings, 1 reply; 4+ messages in thread
From: Jakub Kicinski @ 2026-03-30 22:47 UTC (permalink / raw)
  To: Dipayaan Roy
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, leitao, kees, dipayanroy

On Mon, 30 Mar 2026 14:01:54 -0700 Dipayaan Roy wrote:
> On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
> allocation in the RX refill path can cause 15-20% throughput
> regression under high connection counts (>16 TCP streams).

Did you investigate what makes such a difference exactly?
As I said I suspect there are some improvements we could
make in the page pool fragmentation logic that could yield
similar wins without bothering the user.

> Add an ethtool private flag "full-page-rx" that allows the user to
> force one RX buffer per page, bypassing the page_pool fragment path.
> This restores line-rate(180+ Gbps) performance on affected platforms.
> 
> Usage:
>   ethtool --set-priv-flags eth0 full-page-rx on
> 
> There is no behavioral change by default. The flag must be explicitly
> enabled by the user or udev rule.
> 
> The existing single-buffer-per-page logic for XDP and jumbo frames is
> consolidated into a new helper mana_use_single_rxbuf_per_page().

ethtool -g rx-buf-len could also fit the bill but I guess this is more
of a hack / workaround than legit config so no strong preference.

> -static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
> +static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
>  {
> -	struct mana_port_context *apc = netdev_priv(ndev);
>  	unsigned int num_queues = apc->num_queues;
>  	int i, j;
>  
> -	if (stringset != ETH_SS_STATS)
> -		return;
>  	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
> -		ethtool_puts(&data, mana_eth_stats[i].name);
> +		ethtool_puts(data, mana_eth_stats[i].name);
>  
>  	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
> -		ethtool_puts(&data, mana_hc_stats[i].name);
> +		ethtool_puts(data, mana_hc_stats[i].name);
>  
>  	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
> -		ethtool_puts(&data, mana_phy_stats[i].name);
> +		ethtool_puts(data, mana_phy_stats[i].name);
>  
>  	for (i = 0; i < num_queues; i++) {
> -		ethtool_sprintf(&data, "rx_%d_packets", i);
> -		ethtool_sprintf(&data, "rx_%d_bytes", i);
> -		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
> -		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
> -		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
> -		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
> +		ethtool_sprintf(data, "rx_%d_packets", i);

Please factor out the noisy, no-op prep work into a separate patch for
ease of review
-- 
pw-bot: cr

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH net-next,v4] net: mana: Force full-page RX buffers via ethtool private flag
  2026-03-30 22:47 ` Jakub Kicinski
@ 2026-04-05  3:14   ` Dipayaan Roy
  2026-04-06 17:51     ` Jakub Kicinski
  0 siblings, 1 reply; 4+ messages in thread
From: Dipayaan Roy @ 2026-04-05  3:14 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, leitao, kees, dipayanroy

On Mon, Mar 30, 2026 at 03:47:55PM -0700, Jakub Kicinski wrote:
> On Mon, 30 Mar 2026 14:01:54 -0700 Dipayaan Roy wrote:
> > On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
> > allocation in the RX refill path can cause 15-20% throughput
> > regression under high connection counts (>16 TCP streams).
> 
> Did you investigate what makes such a difference exactly?
> As I said I suspect there are some improvements we could
> make in the page pool fragmentation logic that could yield
> similar wins without bothering the user.
>
I collected the perf numbers and shared the analysis below.
> > Add an ethtool private flag "full-page-rx" that allows the user to
> > force one RX buffer per page, bypassing the page_pool fragment path.
> > This restores line-rate(180+ Gbps) performance on affected platforms.
> > 
> > Usage:
> >   ethtool --set-priv-flags eth0 full-page-rx on
> > 
> > There is no behavioral change by default. The flag must be explicitly
> > enabled by the user or udev rule.
> > 
> > The existing single-buffer-per-page logic for XDP and jumbo frames is
> > consolidated into a new helper mana_use_single_rxbuf_per_page().
> 
> ethtool -g rx-buf-len could also fit the bill but I guess this is more
> of a hack / workaround than legit config so no strong preference.
> 
OK, I'd prefer to stay with the private flag.
> > -static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
> > +static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
> >  {
> > -	struct mana_port_context *apc = netdev_priv(ndev);
> >  	unsigned int num_queues = apc->num_queues;
> >  	int i, j;
> >  
> > -	if (stringset != ETH_SS_STATS)
> > -		return;
> >  	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
> > -		ethtool_puts(&data, mana_eth_stats[i].name);
> > +		ethtool_puts(data, mana_eth_stats[i].name);
> >  
> >  	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
> > -		ethtool_puts(&data, mana_hc_stats[i].name);
> > +		ethtool_puts(data, mana_hc_stats[i].name);
> >  
> >  	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
> > -		ethtool_puts(&data, mana_phy_stats[i].name);
> > +		ethtool_puts(data, mana_phy_stats[i].name);
> >  
> >  	for (i = 0; i < num_queues; i++) {
> > -		ethtool_sprintf(&data, "rx_%d_packets", i);
> > -		ethtool_sprintf(&data, "rx_%d_bytes", i);
> > -		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
> > -		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
> > -		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
> > -		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
> > +		ethtool_sprintf(data, "rx_%d_packets", i);
> 
> Please factor out the noisy, no-op prep work into a separate patch for
> ease of review
Ack, will split it into 2 separate patches in v5.
> -- 
> pw-bot: cr

Hi Jakub,

I did some perf analysis on the ARM64 platform for which we want to
have this work around of full page rx buffers:

test: ntttcp with 48 tcp connections
perf: perf record -ag --call-graph dwarf -C 0-33 -- sleep 32

Page pool overhead summary:
(fragment-based RX buffers vs full-page RX buffers on the same ARM64
platform)

  Function                        Fragment   Full-page   Delta
  -----------------------------   --------   ---------   -----
  napi_pp_put_page                  3.93%      0.85%    +3.08%
  page_pool_alloc_frag_netmem       1.93%         —     +1.93%
  Total page_pool overhead          5.86%      0.85%    +5.01%

In fragment mode, napi_pp_put_page performs an atomic decrement of
the shared page refcount on every packet free. This single operation
accounts for ~3% more CPU than in full-page mode, where the page is
sole-owned and the atomic is skipped entirely. Additionally,
page_pool_alloc_frag_netmem adds ~2% overhead on the allocation
path for fragments.

Further annotation of the hot page pool functions in fragment mode
shows:

napi_pp_put_page:

    0.09 :   ffff80008117c240:       b       ffff80008117c268 <napi_pp_put_page+0x68>
         : 64               ATOMIC64_FETCH_OP(        , al, op, asm_op, "memory")
         :
         : 66               ATOMIC64_FETCH_OPS(andnot, ldclr)
         : 67               ATOMIC64_FETCH_OPS(or, ldset)
         : 68               ATOMIC64_FETCH_OPS(xor, ldeor)
         : 69               ATOMIC64_FETCH_OPS(add, ldadd)
    0.00 :   ffff80008117c244:       mov     x3, #0xffffffffffffffff   // #-1
    0.08 :   ffff80008117c248:       add     x0, x2, #0x28
    0.06 :   ffff80008117c24c:       ldaddal x3, x3, [x0]
         : 73               }
         :
         : 75               ATOMIC64_OP_ADD_SUB_RETURN(_relaxed)
         : 76               ATOMIC64_OP_ADD_SUB_RETURN(_acquire)
         : 77               ATOMIC64_OP_ADD_SUB_RETURN(_release)
         : 78               ATOMIC64_OP_ADD_SUB_RETURN(        )
   88.09 :   ffff80008117c250:       sub     x3, x3, #0x1
         :
         : 81               return 0;
         : 82               }
88% of this function's cycles stall on the sub that depends on
ldaddal.


page_pool_alloc_frag_netmem:

         : 151              ATOMIC64_FETCH_OPS(add, ldadd)
    0.00 :   ffff8000811fd40c:       add     x1, x21, #0x28
    0.14 :   ffff8000811fd410:       ldaddal x0, x1, [x1]
         : 154              }
         :
         : 156              ATOMIC64_OP_ADD_SUB_RETURN(_relaxed)
         : 157              ATOMIC64_OP_ADD_SUB_RETURN(_acquire)
         : 158              ATOMIC64_OP_ADD_SUB_RETURN(_release)
         : 159              ATOMIC64_OP_ADD_SUB_RETURN(        )
   75.09 :   ffff8000811fd414:       add     x0, x0, x1
         : 161              WARN_ON(ret < 0);
    0.16 :   ffff8000811fd418:       cmp     x0, #0x0
    0.00 :   ffff8000811fd41c:       b.lt    ffff8000811fd394 <page_pool_alloc_frag_netmem+0xb4>  // b.tstop


75% of this function's cycles stall on the same pattern.


Full comparison (top functions, >0.5%):

Fragment mode:                          Full-page mode:
--------------                          ---------------
 15.88%  __wake_up_sync_key             13.66%  __wake_up_sync_key
  9.66%  default_idle_call              10.41%  default_idle_call
  8.38%  handle_softirqs                 8.89%  handle_softirqs
  3.93%  napi_pp_put_page       ←        0.85%  napi_pp_put_page
  3.18%  tcp_gro_receive                 3.43%  tcp_gro_receive
  1.93%  page_pool_alloc_frag   ←           —
     —                                   1.14%  page_pool_recycle_in_cache
     —                                   1.06%  page_pool_put_unrefed_netmem
  0.93%  napi_build_skb                  1.24%  napi_build_skb
  0.56%  __build_skb_around              1.46%  __build_skb_around

In full-page RX buffer mode, 'napi_pp_put_page' took just 0.85% on
the same ARM64 platform.

Comparison with another platform (x86):

To confirm this behaviour is specific to this ARM64 platform, I
collected the same data on an x86 VM (Intel, 192 vCPUs, same 200 Gbps
MANA NIC). There, both full-page and fragment RX buffer modes achieve
an identical ~182 Gbps.

x86 fragment mode:                      x86 full-page mode:
------------------                      -------------------
 61.69%  pv_native_safe_halt            50.91%  pv_native_safe_halt
  4.17%  _raw_spin_unlock_irqrestore     6.19%  _raw_spin_unlock_irqrestore
  3.95%  handle_softirqs                 4.02%  handle_softirqs
  2.51%  _copy_to_iter                   2.53%  _copy_to_iter
  0.60%  napi_pp_put_page                  —    napi_pp_put_page (<0.5%)

On x86, napi_pp_put_page is only 0.60% in fragment mode (vs 3.93% in
the ARM64 platform data shared earlier).

Note: I did not have a different ARM64 platform available to run the
comparison on.

From the above data, this seems to be an issue specific to this ARM64
platform.


Regards

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH net-next,v4] net: mana: Force full-page RX buffers via ethtool private flag
  2026-04-05  3:14   ` Dipayaan Roy
@ 2026-04-06 17:51     ` Jakub Kicinski
  0 siblings, 0 replies; 4+ messages in thread
From: Jakub Kicinski @ 2026-04-06 17:51 UTC (permalink / raw)
  To: Dipayaan Roy
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, leitao, kees, dipayanroy

On Sat, 4 Apr 2026 20:14:35 -0700 Dipayaan Roy wrote:
>   Function                        Fragment   Full-page   Delta
>   -----------------------------   --------   ---------   -----
>   napi_pp_put_page                  3.93%      0.85%    +3.08%
>   page_pool_alloc_frag_netmem       1.93%         —     +1.93%
>   Total page_pool overhead          5.86%      0.85%    +5.01%


Thanks for the analysis. Presumably recycling the full page is
cheaper because page_pool_put_unrefed_netmem() hits the fastpath,
i.e. page_pool_napi_local() returns true?

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-04-06 17:51 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-30 21:01 [PATCH net-next,v4] net: mana: Force full-page RX buffers via ethtool private flag Dipayaan Roy
2026-03-30 22:47 ` Jakub Kicinski
2026-04-05  3:14   ` Dipayaan Roy
2026-04-06 17:51     ` Jakub Kicinski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox