* [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems.
@ 2026-03-11 4:00 Dipayaan Roy
2026-03-14 19:50 ` Jakub Kicinski
0 siblings, 1 reply; 4+ messages in thread
From: Dipayaan Roy @ 2026-03-11 4:00 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
On certain systems configured with 4K PAGE_SIZE, utilizing page_pool
fragments for RX buffers results in a significant throughput regression.
Profiling reveals that this regression correlates with high overhead in the
fragment allocation and reference counting paths on these specific
platforms, rendering the multi-buffer-per-page strategy counterproductive.
To mitigate this, bypass the page_pool fragment path and force a single RX
packet per page allocation when all the following conditions are met:
1. The system is configured with a 4K PAGE_SIZE.
2. A processor-specific quirk is detected via SMBIOS Type 4 data.
This approach restores expected line-rate performance by ensuring
predictable RX refill behavior on affected hardware.
There is no behavioral change for systems using larger page sizes
(16K/64K), or platforms where this processor-specific quirk does not
apply.
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
Changes in v3:
- changed u8* to char*
Changes in v2:
- separate reading string index and the string, remove inline.
---
---
.../net/ethernet/microsoft/mana/gdma_main.c | 133 ++++++++++++++++++
drivers/net/ethernet/microsoft/mana/mana_en.c | 23 ++-
include/net/mana/gdma.h | 9 ++
3 files changed, 163 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index aef8612b73cb..05fecc00a90c 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -9,6 +9,7 @@
#include <linux/msi.h>
#include <linux/irqdomain.h>
#include <linux/export.h>
+#include <linux/dmi.h>
#include <net/mana/mana.h>
#include <net/mana/hw_channel.h>
@@ -1959,6 +1960,128 @@ static bool mana_is_pf(unsigned short dev_id)
return dev_id == MANA_PF_DEVICE_ID;
}
+/*
+ * Table for Processor Version strings found from SMBIOS Type 4 information,
+ * for processors that need to force the single RX buffer per page quirk for
+ * meeting line rate performance with ARM64 + 4K pages.
+ * Note: These strings are exactly matched with version fetched from SMBIOS.
+ */
+static const char * const mana_single_rxbuf_per_page_quirk_tbl[] = {
+ "Cobalt 200",
+};
+
+/* On some systems with 4K PAGE_SIZE, page_pool RX fragments can
+ * trigger a throughput regression. Hence identify those processors
+ * from the extracted SMBIOS table and apply the quirk to force one
+ * RX buffer per page to avoid the fragment allocation/refcounting
+ * overhead in the RX refill path for those processors only.
+ */
+static bool mana_needs_single_rxbuf_per_page(struct gdma_context *gc)
+{
+ int i = 0;
+ const char *ver = gc->processor_version;
+
+ if (!ver)
+ return false;
+
+ if (PAGE_SIZE != SZ_4K)
+ return false;
+
+ while (i < ARRAY_SIZE(mana_single_rxbuf_per_page_quirk_tbl)) {
+ if (!strcmp(ver, mana_single_rxbuf_per_page_quirk_tbl[i]))
+ return true;
+ i++;
+ }
+
+ return false;
+}
+
+static void mana_get_proc_ver_strno(const struct dmi_header *hdr, void *data)
+{
+ struct gdma_context *gc = data;
+ const u8 *d = (const u8 *)hdr;
+
+ /* We are only looking for Type 4: Processor Information */
+ if (hdr->type != SMBIOS_TYPE_4_PROCESSOR_INFO)
+ return;
+
+ /* Ensure the record is long enough to contain the Processor Version
+ * field
+ */
+ if (hdr->length <= SMBIOS_TYPE4_PROC_VERSION_OFFSET)
+ return;
+
+ /* The 'Processor Version' string is located at index pointed by
+ * SMBIOS_TYPE4_PROC_VERSION_OFFSET. Make a copy of the index.
+ * There could be multiple Type 4 tables so read and store the
+ * processor version index found the first time.
+ */
+ if (gc->proc_ver_strno)
+ return;
+
+ gc->proc_ver_strno = d[SMBIOS_TYPE4_PROC_VERSION_OFFSET];
+}
+
+static const char *mana_dmi_string_nosave(const struct dmi_header *hdr, u8 s)
+{
+ const char *bp = (const char *)hdr + hdr->length;
+
+ if (!s)
+ return NULL;
+
+ /* String numbers start at 1 */
+ while (--s > 0 && *bp)
+ bp += strlen(bp) + 1;
+
+ if (!*bp)
+ return NULL;
+
+ return bp;
+}
+
+static void mana_fetch_proc_ver_string(const struct dmi_header *hdr,
+ void *data)
+{
+ struct gdma_context *gc = data;
+ const char *ver;
+
+ /* We are only looking for Type 4: Processor Information */
+ if (hdr->type != SMBIOS_TYPE_4_PROCESSOR_INFO)
+ return;
+
+ /* Extract proc version found the first time only */
+ if (!gc->proc_ver_strno || gc->processor_version)
+ return;
+
+ ver = mana_dmi_string_nosave(hdr, gc->proc_ver_strno);
+ if (ver)
+ gc->processor_version = kstrdup(ver, GFP_KERNEL);
+}
+
+/* Check and initialize all processor optimizations/quirks here */
+static bool mana_init_processor_optimization(struct gdma_context *gc)
+{
+ bool opt_initialized = false;
+
+ gc->proc_ver_strno = 0;
+ gc->processor_version = NULL;
+
+ dmi_walk(mana_get_proc_ver_strno, gc);
+ if (!gc->proc_ver_strno)
+ return false;
+
+ dmi_walk(mana_fetch_proc_ver_string, gc);
+ if (!gc->processor_version)
+ return false;
+
+ if (mana_needs_single_rxbuf_per_page(gc)) {
+ gc->force_full_page_rx_buffer = true;
+ opt_initialized = true;
+ }
+
+ return opt_initialized;
+}
+
static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct gdma_context *gc;
@@ -2013,6 +2136,11 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
gc->mana_pci_debugfs = debugfs_create_dir(pci_slot_name(pdev->slot),
mana_debugfs_root);
+ if (mana_init_processor_optimization(gc))
+ dev_info(&pdev->dev,
+ "Processor specific optimization initialized on: %s\n",
+ gc->processor_version);
+
err = mana_gd_setup(pdev);
if (err)
goto unmap_bar;
@@ -2055,6 +2183,8 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
pci_iounmap(pdev, bar0_va);
free_gc:
pci_set_drvdata(pdev, NULL);
+ kfree(gc->processor_version);
+ gc->processor_version = NULL;
vfree(gc);
release_region:
pci_release_regions(pdev);
@@ -2110,6 +2240,9 @@ static void mana_gd_remove(struct pci_dev *pdev)
pci_iounmap(pdev, gc->bar0_va);
+ kfree(gc->processor_version);
+ gc->processor_version = NULL;
+
vfree(gc);
pci_release_regions(pdev);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a868c28c8280..38f94f7619ad 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -744,6 +744,26 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
return va;
}
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+ struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
+
+ /* On some systems with 4K PAGE_SIZE, page_pool RX fragments can
+ * trigger a throughput regression. Hence force one RX buffer per page
+ * to avoid the fragment allocation/refcounting overhead in the RX
+ * refill path for those processors only.
+ */
+ if (gc->force_full_page_rx_buffer)
+ return true;
+
+ /* For xdp and jumbo frames make sure only one packet fits per page. */
+ if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+ return true;
+
+ return false;
+}
+
/* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
int mtu, u32 *datasize, u32 *alloc_size,
@@ -754,8 +774,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
/* Calculate datasize first (consistent across all cases) */
*datasize = mtu + ETH_HLEN;
- /* For xdp and jumbo frames make sure only one packet fits per page */
- if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+ if (mana_use_single_rxbuf_per_page(apc, mtu)) {
if (mana_xdp_get(apc)) {
*headroom = XDP_PACKET_HEADROOM;
*alloc_size = PAGE_SIZE;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index ec17004b10c0..03f01496fbbf 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -9,6 +9,12 @@
#include "shm_channel.h"
+/* SMBIOS Type 4: Processor Information table */
+#define SMBIOS_TYPE_4_PROCESSOR_INFO 4
+
+/* Byte offset containing the Processor Version string number.*/
+#define SMBIOS_TYPE4_PROC_VERSION_OFFSET 0x10
+
#define GDMA_STATUS_MORE_ENTRIES 0x00000105
#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff
@@ -444,6 +450,9 @@ struct gdma_context {
struct workqueue_struct *service_wq;
unsigned long flags;
+ char *processor_version;
+ u8 proc_ver_strno;
+ bool force_full_page_rx_buffer;
};
static inline bool mana_gd_is_mana(struct gdma_dev *gd)
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems.
2026-03-11 4:00 [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems Dipayaan Roy
@ 2026-03-14 19:50 ` Jakub Kicinski
2026-03-20 18:37 ` Dipayaan Roy
0 siblings, 1 reply; 4+ messages in thread
From: Jakub Kicinski @ 2026-03-14 19:50 UTC (permalink / raw)
To: Dipayaan Roy
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
On Tue, 10 Mar 2026 21:00:49 -0700 Dipayaan Roy wrote:
> On certain systems configured with 4K PAGE_SIZE, utilizing page_pool
> fragments for RX buffers results in a significant throughput regression.
> Profiling reveals that this regression correlates with high overhead in the
> fragment allocation and reference counting paths on these specific
> platforms, rendering the multi-buffer-per-page strategy counterproductive.
Can you say more ? We could technically take two references on the page
right away if MTU is small and avoid some of the cost.
The driver doesn't seem to set skb->truesize accordingly after this
change. So you're lying to the stack about how much memory each packet
consumes. This is a blocker for the change.
> To mitigate this, bypass the page_pool fragment path and force a single RX
> packet per page allocation when all the following conditions are met:
> 1. The system is configured with a 4K PAGE_SIZE.
> 2. A processor-specific quirk is detected via SMBIOS Type 4 data.
I don't think we want the kernel to be in the business of carrying
matching on platform names and providing optimal config by default.
This sort of logic needs to live in user space or the hypervisor
(which can then pass a single bit to the driver to enable the behavior)
> This approach restores expected line-rate performance by ensuring
> predictable RX refill behavior on affected hardware.
>
> There is no behavioral change for systems using larger page sizes
> (16K/64K), or platforms where this processor-specific quirk do not
> apply.
--
pw-bot: cr
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems.
2026-03-14 19:50 ` Jakub Kicinski
@ 2026-03-20 18:37 ` Dipayaan Roy
2026-03-21 0:29 ` Jakub Kicinski
0 siblings, 1 reply; 4+ messages in thread
From: Dipayaan Roy @ 2026-03-20 18:37 UTC (permalink / raw)
To: Jakub Kicinski
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
On Sat, Mar 14, 2026 at 12:50:53PM -0700, Jakub Kicinski wrote:
> On Tue, 10 Mar 2026 21:00:49 -0700 Dipayaan Roy wrote:
> > On certain systems configured with 4K PAGE_SIZE, utilizing page_pool
> > fragments for RX buffers results in a significant throughput regression.
> > Profiling reveals that this regression correlates with high overhead in the
> > fragment allocation and reference counting paths on these specific
> > platforms, rendering the multi-buffer-per-page strategy counterproductive.
>
> Can you say more ? We could technically take two references on the page
> right away if MTU is small and avoid some of the cost.
There is a 15-20% shortfall in achieving line rate for MANA (180+ Gbps)
on a particular ARM64 SKU. The issue is only specific to this processor SKU —
not seen on other ARM64 SKUs (e.g., GB200) or x86 SKUs. Critically, the
regression only manifests beyond 16 TCP connections, which strongly indicates
it is seen when there is high contention and traffic.
no. of | rx buf backed | rx buf backed
connections | with page fragments | with full page
-------------+---------------------+---------------
4 | 139 Gbps | 138 Gbps
8 | 140 Gbps | 162 Gbps
16 | 186 Gbps | 186 Gbps
32 | 136 Gbps | 183 Gbps
48 | 159 Gbps | 185 Gbps
64 | 165 Gbps | 184 Gbps
128 | 170 Gbps | 180 Gbps
HW team is still working to RCA this hw behaviour.
Regarding "We could technically take two references on the page right
away", are you suggesting having page reference counting logic to driver
instead of relying on page pool?
>
> The driver doesn't seem to set skb->truesize accordingly after this
> change. So you're lying to the stack about how much memory each packet
> consumes. This is a blocker for the change.
>
ACK. I will send out a separate patch with a Fixes tag to fix the skb
truesize.
> > To mitigate this, bypass the page_pool fragment path and force a single RX
> > packet per page allocation when all the following conditions are met:
> > 1. The system is configured with a 4K PAGE_SIZE.
> > 2. A processor-specific quirk is detected via SMBIOS Type 4 data.
>
> I don't think we want the kernel to be in the business of carrying
> matching on platform names and providing optimal config by default.
> This sort of logic needs to live in user space or the hypervisor
> (which can then pass a single bit to the driver to enable the behavior)
>
As per our internal discussion the hypervisor cannot provide the CPU
version info (in VM as well as in bare metal offerings).
On handling it from user side are you suggesting it to introduce a new
ethtool Private Flags and have udev rules for the driver to set the private
flag and switch to full page rx buffers? Given that the wide number of distro
support this might be harder to maintain/backport.
Also the DMI parsing design was influenced by other net wireless
drivers such as /wireless/ath/ath10k/core.c. If this approach is not
acceptable for the MANA driver then we will have to take an alternate
route based on the discussion right above it.
> > This approach restores expected line-rate performance by ensuring
> > predictable RX refill behavior on affected hardware.
> >
> > There is no behavioral change for systems using larger page sizes
> > (16K/64K), or platforms where this processor-specific quirk do not
> > apply.
> --
> pw-bot: cr
Thank you for your comments Jakub, and also for pointing out the skb
truesize issue. I am sending out a separate patch to fix the skb truesize issue.
Regards
Dipayaan Roy
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems.
2026-03-20 18:37 ` Dipayaan Roy
@ 2026-03-21 0:29 ` Jakub Kicinski
0 siblings, 0 replies; 4+ messages in thread
From: Jakub Kicinski @ 2026-03-21 0:29 UTC (permalink / raw)
To: Dipayaan Roy
Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, dipayanroy
On Fri, 20 Mar 2026 11:37:36 -0700 Dipayaan Roy wrote:
> On Sat, Mar 14, 2026 at 12:50:53PM -0700, Jakub Kicinski wrote:
> > On Tue, 10 Mar 2026 21:00:49 -0700 Dipayaan Roy wrote:
> > > On certain systems configured with 4K PAGE_SIZE, utilizing page_pool
> > > fragments for RX buffers results in a significant throughput regression.
> > > Profiling reveals that this regression correlates with high overhead in the
> > > fragment allocation and reference counting paths on these specific
> > > platforms, rendering the multi-buffer-per-page strategy counterproductive.
> >
> > Can you say more ? We could technically take two references on the page
> > right away if MTU is small and avoid some of the cost.
>
> There is a 15-20% shortfall in achieving line rate for MANA (180+ Gbps)
> on a particular ARM64 SKU. The issue is only specific to this processor SKU —
> not seen on other ARM64 SKUs (e.g., GB200) or x86 SKUs. Critically, the
> regression only manifests beyond 16 TCP connections, which strongly indicates
> seen when there is high contention and traffic.
>
> no. of | rx buf backed | rx buf backed
> connections | with page fragments | with full page
> -------------+---------------------+---------------
> 4 | 139 Gbps | 138 Gbps
> 8 | 140 Gbps | 162 Gbps
> 16 | 186 Gbps | 186 Gbps
These results look a bit odd, 4 and 16 streams have the same perf,
while all other cases indeed show a delta. What I was hoping for was
a more precise attribution of the performance issue. Like perf top
showing that it's indeed the atomic ops on the refcount that stall.
> 32 | 136 Gbps | 183 Gbps
> 48 | 159 Gbps | 185 Gbps
> 64 | 165 Gbps | 184 Gbps
> 128 | 170 Gbps | 180 Gbps
>
> HW team is still working to RCA this hw behaviour.
>
> Regarding "We could technically take two references on the page right
> away", are you suggesting having page reference counting logic to driver
> instead of relying on page pool?
Yes, either that or adjust the page pool APIs.
page_pool_alloc_frag_netmem() currently sets the refcount to BIAS
which it then has to subtract later. So we get:
set(BIAS)
.. driver allocates chunks ..
sub(BIAS_MAX - pool->frag_users)
Instead of using BIAS we could make the page pool guess that the caller
will keep asking for the same frame size. So initially take
(PAGE_SIZE/size) references.
> > The driver doesn't seem to set skb->truesize accordingly after this
> > change. So you're lying to the stack about how much memory each packet
> > consumes. This is a blocker for the change.
> >
> ACK. I will send out a separate patch with fixes tag to fix the skb true
> size.
>
> > > To mitigate this, bypass the page_pool fragment path and force a single RX
> > > packet per page allocation when all the following conditions are met:
> > > 1. The system is configured with a 4K PAGE_SIZE.
> > > 2. A processor-specific quirk is detected via SMBIOS Type 4 data.
> >
> > I don't think we want the kernel to be in the business of carrying
> > matching on platform names and providing optimal config by default.
> > This sort of logic needs to live in user space or the hypervisor
> > (which can then pass a single bit to the driver to enable the behavior)
> >
> As per our internal discussion the hypervisor cannot provide the CPU
> version info(in vm as well as in bare metal offerings).
Why? I suppose it's much more effort for you but it's much more effort
for the community to carry the workaround. So..
> On handling it from user side are you suggesting it to introduce a new
> ethtool Private Flags and have udev rules for the driver to set the private
> flag and switch to full page rx buffers? Given that the wide number of distro
> support this might be harder to maintain/backport.
>
> Also the dmi parsing design was influenced by other net wireleass
> drivers as /wireless/ath/ath10k/core.c. If this approach is not
> acceptable for MANA driver then will have to take a alternate route
> based on the dsicussion right above it.
Plenty of ugly hacks in the kernel, it's no excuse.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-03-21 0:29 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-11 4:00 [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems Dipayaan Roy
2026-03-14 19:50 ` Jakub Kicinski
2026-03-20 18:37 ` Dipayaan Roy
2026-03-21 0:29 ` Jakub Kicinski
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox