[PATCH net-next] net: airoha: Add TCP LRO support

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next] net: airoha: Add TCP LRO support
@ 2025-06-10  9:12 Lorenzo Bianconi
  2025-06-10  9:34 ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Lorenzo Bianconi @ 2025-06-10  9:12 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Lorenzo Bianconi
  Cc: linux-arm-kernel, linux-mediatek, netdev

EN7581 SoC supports TCP hw Large Receive Offload (LRO) for 8 hw queues.
Introduce TCP LRO support to airoha_eth driver for RX queues 24-31.
In order to support hw TCP LRO, increase page_pool order to 5 for RX
queues 24-31.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/airoha/airoha_eth.c  | 191 +++++++++++++++++++++++++++---
 drivers/net/ethernet/airoha/airoha_eth.h  |  10 ++
 drivers/net/ethernet/airoha/airoha_regs.h |  25 +++-
 3 files changed, 210 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index a7ec609d64dee9c8e901c7eb650bb3fe144ee00a..9378ca384fe2025a40cc528714859dd59300fbcd 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -12,6 +12,7 @@
 #include <net/dst_metadata.h>
 #include <net/page_pool/helpers.h>
 #include <net/pkt_cls.h>
+#include <net/tcp.h>
 #include <uapi/linux/ppp_defs.h>
 
 #include "airoha_regs.h"
@@ -439,6 +440,40 @@ static void airoha_fe_crsn_qsel_init(struct airoha_eth *eth)
 				 CDM_CRSN_QSEL_Q1));
 }
 
+static void airoha_fe_lro_init_rx_queue(struct airoha_eth *eth, int qdma_id,
+					int lro_queue_index, int qid,
+					int nbuf, int buf_size)
+{
+	airoha_fe_rmw(eth, REG_CDM_LRO_LIMIT(qdma_id),
+		      CDM_LRO_AGG_NUM_MASK | CDM_LRO_AGG_SIZE_MASK,
+		      FIELD_PREP(CDM_LRO_AGG_NUM_MASK, nbuf) |
+		      FIELD_PREP(CDM_LRO_AGG_SIZE_MASK, buf_size));
+	airoha_fe_rmw(eth, REG_CDM_LRO_AGE_TIME(qdma_id),
+		      CDM_LRO_AGE_TIME_MASK | CDM_LRO_AGG_TIME_MASK,
+		      FIELD_PREP(CDM_LRO_AGE_TIME_MASK,
+				 AIROHA_RXQ_LRO_MAX_AGE_TIME) |
+		      FIELD_PREP(CDM_LRO_AGG_TIME_MASK,
+				 AIROHA_RXQ_LRO_MAX_AGG_TIME));
+	airoha_fe_rmw(eth, REG_CDM_LRO_RXQ(qdma_id, lro_queue_index),
+		      LRO_RXQ_MASK(lro_queue_index),
+		      qid << __ffs(LRO_RXQ_MASK(lro_queue_index)));
+	airoha_fe_set(eth, REG_CDM_LRO_EN(qdma_id), BIT(lro_queue_index));
+}
+
+static void airoha_fe_lro_disable(struct airoha_eth *eth, int qdma_id)
+{
+	int i;
+
+	airoha_fe_clear(eth, REG_CDM_LRO_LIMIT(qdma_id),
+			CDM_LRO_AGG_NUM_MASK | CDM_LRO_AGG_SIZE_MASK);
+	airoha_fe_clear(eth, REG_CDM_LRO_AGE_TIME(qdma_id),
+			CDM_LRO_AGE_TIME_MASK | CDM_LRO_AGG_TIME_MASK);
+	airoha_fe_clear(eth, REG_CDM_LRO_EN(qdma_id), LRO_RXQ_EN_MASK);
+	for (i = 0; i < AIROHA_MAX_NUM_LRO_QUEUES; i++)
+		airoha_fe_clear(eth, REG_CDM_LRO_RXQ(qdma_id, i),
+				LRO_RXQ_MASK(i));
+}
+
 static int airoha_fe_init(struct airoha_eth *eth)
 {
 	airoha_fe_maccr_init(eth);
@@ -618,9 +653,87 @@ static int airoha_qdma_get_gdm_port(struct airoha_eth *eth,
 	return port >= ARRAY_SIZE(eth->ports) ? -EINVAL : port;
 }
 
+static bool airoha_qdma_is_lro_rx_queue(struct airoha_queue *q,
+					struct airoha_qdma *qdma)
+{
+	int qid = q - &qdma->q_rx[0];
+
+	/* EN7581 SoC supports at most 8 LRO rx queues */
+	BUILD_BUG_ON(hweight32(AIROHA_RXQ_LRO_EN_MASK) >
+		     AIROHA_MAX_NUM_LRO_QUEUES);
+
+	return !!(AIROHA_RXQ_LRO_EN_MASK & BIT(qid));
+}
+
+static int airoha_qdma_lro_rx_process(struct airoha_queue *q,
+				      struct airoha_qdma_desc *desc)
+{
+	u32 msg1 = le32_to_cpu(desc->msg1), msg2 = le32_to_cpu(desc->msg2);
+	u32 th_off, tcp_ack_seq, msg3 = le32_to_cpu(desc->msg3);
+	bool ipv4 = FIELD_GET(QDMA_ETH_RXMSG_IP4_MASK, msg1);
+	bool ipv6 = FIELD_GET(QDMA_ETH_RXMSG_IP6_MASK, msg1);
+	struct sk_buff *skb = q->skb;
+	u16 tcp_win, l2_len;
+	struct tcphdr *th;
+
+	if (FIELD_GET(QDMA_ETH_RXMSG_AGG_COUNT_MASK, msg2) <= 1)
+		return 0;
+
+	if (!ipv4 && !ipv6)
+		return -EOPNOTSUPP;
+
+	l2_len = FIELD_GET(QDMA_ETH_RXMSG_L2_LEN_MASK, msg2);
+	if (ipv4) {
+		u16 agg_len = FIELD_GET(QDMA_ETH_RXMSG_AGG_LEN_MASK, msg3);
+		struct iphdr *iph = (struct iphdr *)(skb->data + l2_len);
+
+		if (iph->protocol != IPPROTO_TCP)
+			return -EOPNOTSUPP;
+
+		iph->tot_len = cpu_to_be16(agg_len);
+		iph->check = 0;
+		iph->check = ip_fast_csum((void *)iph, iph->ihl);
+		th_off = l2_len + (iph->ihl << 2);
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + l2_len);
+		u32 len, desc_ctrl = le32_to_cpu(desc->ctrl);
+
+		if (ip6h->nexthdr != NEXTHDR_TCP)
+			return -EOPNOTSUPP;
+
+		len = FIELD_GET(QDMA_DESC_LEN_MASK, desc_ctrl);
+		ip6h->payload_len = cpu_to_be16(len - l2_len - sizeof(*ip6h));
+		th_off = l2_len + sizeof(*ip6h);
+	}
+
+	tcp_win = FIELD_GET(QDMA_ETH_RXMSG_TCP_WIN_MASK, msg3);
+	tcp_ack_seq = le32_to_cpu(desc->data);
+
+	th = (struct tcphdr *)(skb->data + th_off);
+	th->ack_seq = cpu_to_be32(tcp_ack_seq);
+	th->window = cpu_to_be16(tcp_win);
+
+	/* check tcp timestamp option */
+	if (th->doff == sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) {
+		__be32 *topt = (__be32 *)(th + 1);
+
+		if (*topt == cpu_to_be32((TCPOPT_NOP << 24) |
+					 (TCPOPT_NOP << 16) |
+					 (TCPOPT_TIMESTAMP << 8) |
+					 TCPOLEN_TIMESTAMP)) {
+			u32 tcp_ts_reply = le32_to_cpu(desc->tcp_ts_reply);
+
+			put_unaligned_be32(tcp_ts_reply, topt + 2);
+		}
+	}
+
+	return 0;
+}
+
 static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
 {
 	enum dma_data_direction dir = page_pool_get_dma_dir(q->page_pool);
+	bool lro_queue = airoha_qdma_is_lro_rx_queue(q, q->qdma);
 	struct airoha_qdma *qdma = q->qdma;
 	struct airoha_eth *eth = qdma->eth;
 	int qid = q - &qdma->q_rx[0];
@@ -663,9 +776,14 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
 			__skb_put(q->skb, len);
 			skb_mark_for_recycle(q->skb);
 			q->skb->dev = port->dev;
-			q->skb->protocol = eth_type_trans(q->skb, port->dev);
 			q->skb->ip_summed = CHECKSUM_UNNECESSARY;
 			skb_record_rx_queue(q->skb, qid);
+
+			if (lro_queue && (port->dev->features & NETIF_F_LRO) &&
+			    airoha_qdma_lro_rx_process(q, desc) < 0)
+				goto free_frag;
+
+			q->skb->protocol = eth_type_trans(q->skb, port->dev);
 		} else { /* scattered frame */
 			struct skb_shared_info *shinfo = skb_shinfo(q->skb);
 			int nr_frags = shinfo->nr_frags;
@@ -751,14 +869,16 @@ static int airoha_qdma_rx_napi_poll(struct napi_struct *napi, int budget)
 }
 
 static int airoha_qdma_init_rx_queue(struct airoha_queue *q,
-				     struct airoha_qdma *qdma, int ndesc)
+				     struct airoha_qdma *qdma,
+				     int ndesc, bool lro_queue)
 {
+	int pp_order = lro_queue ? 5 : 0;
 	const struct page_pool_params pp_params = {
-		.order = 0,
-		.pool_size = 256,
+		.order = pp_order,
+		.pool_size = 256 >> pp_order,
 		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 		.dma_dir = DMA_FROM_DEVICE,
-		.max_len = PAGE_SIZE,
+		.max_len = PAGE_SIZE << pp_order,
 		.nid = NUMA_NO_NODE,
 		.dev = qdma->eth->dev,
 		.napi = &q->napi,
@@ -767,7 +887,7 @@ static int airoha_qdma_init_rx_queue(struct airoha_queue *q,
 	int qid = q - &qdma->q_rx[0], thr;
 	dma_addr_t dma_addr;
 
-	q->buf_size = PAGE_SIZE / 2;
+	q->buf_size = pp_params.max_len / (2 * (1 + lro_queue));
 	q->ndesc = ndesc;
 	q->qdma = qdma;
 
@@ -829,15 +949,18 @@ static int airoha_qdma_init_rx(struct airoha_qdma *qdma)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) {
-		int err;
+		struct airoha_queue *q = &qdma->q_rx[i];
+		bool lro_queue;
+		int err, ndesc;
 
 		if (!(RX_DONE_INT_MASK & BIT(i))) {
 			/* rx-queue not binded to irq */
 			continue;
 		}
 
-		err = airoha_qdma_init_rx_queue(&qdma->q_rx[i], qdma,
-						RX_DSCP_NUM(i));
+		lro_queue = airoha_qdma_is_lro_rx_queue(q, qdma);
+		ndesc = lro_queue ? RX_DSCP_NUM(1) : RX_DSCP_NUM(i);
+		err = airoha_qdma_init_rx_queue(q, qdma, ndesc, lro_queue);
 		if (err)
 			return err;
 	}
@@ -1870,6 +1993,46 @@ static u32 airoha_get_dsa_tag(struct sk_buff *skb, struct net_device *dev)
 #endif
 }
 
+static int airoha_dev_set_features(struct net_device *dev,
+				   netdev_features_t features)
+{
+	netdev_features_t diff = dev->features ^ features;
+	struct airoha_gdm_port *port = netdev_priv(dev);
+	struct airoha_qdma *qdma = port->qdma;
+	struct airoha_eth *eth = qdma->eth;
+	int qdma_id = qdma - &eth->qdma[0];
+
+	if (!(diff & NETIF_F_LRO))
+		return 0;
+
+	/* reset LRO configuration */
+	if (features & NETIF_F_LRO) {
+		int i, lro_queue_index = 0;
+
+		for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) {
+			struct airoha_queue *q = &qdma->q_rx[i];
+			bool lro_queue;
+
+			if (!q->ndesc)
+				continue;
+
+			lro_queue = airoha_qdma_is_lro_rx_queue(q, qdma);
+			if (!lro_queue)
+				continue;
+
+			airoha_fe_lro_init_rx_queue(eth, qdma_id,
+						    lro_queue_index, i,
+						    q->page_pool->p.pool_size,
+						    q->buf_size);
+			lro_queue_index++;
+		}
+	} else {
+		airoha_fe_lro_disable(eth, qdma_id);
+	}
+
+	return 0;
+}
+
 static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb,
 				   struct net_device *dev)
 {
@@ -2751,6 +2914,7 @@ static const struct net_device_ops airoha_netdev_ops = {
 	.ndo_stop		= airoha_dev_stop,
 	.ndo_change_mtu		= airoha_dev_change_mtu,
 	.ndo_select_queue	= airoha_dev_select_queue,
+	.ndo_set_features	= airoha_dev_set_features,
 	.ndo_start_xmit		= airoha_dev_xmit,
 	.ndo_get_stats64        = airoha_dev_get_stats64,
 	.ndo_set_mac_address	= airoha_dev_set_macaddr,
@@ -2848,12 +3012,9 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth,
 	dev->ethtool_ops = &airoha_ethtool_ops;
 	dev->max_mtu = AIROHA_MAX_MTU;
 	dev->watchdog_timeo = 5 * HZ;
-	dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
-			   NETIF_F_TSO6 | NETIF_F_IPV6_CSUM |
-			   NETIF_F_SG | NETIF_F_TSO |
-			   NETIF_F_HW_TC;
-	dev->features |= dev->hw_features;
-	dev->vlan_features = dev->hw_features;
+	dev->hw_features = AIROHA_HW_FEATURES | NETIF_F_LRO;
+	dev->features |= AIROHA_HW_FEATURES;
+	dev->vlan_features = AIROHA_HW_FEATURES;
 	dev->dev.of_node = np;
 	dev->irq = qdma->irq_banks[0].irq;
 	SET_NETDEV_DEV(dev, eth->dev);
diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h
index a970b789cf232c316e5ea27b0146493bf91c3767..bea56597af3ba0e8da3cc17e4a74b91d4f681137 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.h
+++ b/drivers/net/ethernet/airoha/airoha_eth.h
@@ -41,6 +41,16 @@
 	 (_n) == 15 ? 128 :		\
 	 (_n) ==  0 ? 1024 : 16)
 
+#define AIROHA_MAX_NUM_LRO_QUEUES	8
+#define AIROHA_RXQ_LRO_EN_MASK		0xff000000
+#define AIROHA_RXQ_LRO_MAX_AGG_TIME	100
+#define AIROHA_RXQ_LRO_MAX_AGE_TIME	2000 /* 1ms */
+
+#define AIROHA_HW_FEATURES			\
+	(NETIF_F_IP_CSUM | NETIF_F_RXCSUM |	\
+	 NETIF_F_TSO6 | NETIF_F_IPV6_CSUM |	\
+	 NETIF_F_SG | NETIF_F_TSO | NETIF_F_HW_TC)
+
 #define PSE_RSV_PAGES			128
 #define PSE_QUEUE_RSV_PAGES		64
 
diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h
index 04187eb40ec674ec5a4ccfc968bb4bd579a53095..86d320a6793b7d4ec77f823f61fa77a4c76a61a5 100644
--- a/drivers/net/ethernet/airoha/airoha_regs.h
+++ b/drivers/net/ethernet/airoha/airoha_regs.h
@@ -23,6 +23,9 @@
 #define GDM3_BASE			0x1100
 #define GDM4_BASE			0x2500
 
+#define CDM_BASE(_n)			\
+	((_n) == 1 ? CDM2_BASE : CDM1_BASE)
+
 #define GDM_BASE(_n)			\
 	((_n) == 4 ? GDM4_BASE :	\
 	 (_n) == 3 ? GDM3_BASE :	\
@@ -127,6 +130,20 @@
 #define CDM2_CRSN_QSEL_REASON_MASK(_n)	\
 	GENMASK(4 + (((_n) % 4) << 3),	(((_n) % 4) << 3))
 
+#define REG_CDM_LRO_RXQ(_n, _m)		(CDM_BASE(_n) + 0x78 + ((_m) & 0x4))
+#define LRO_RXQ_MASK(_n)		GENMASK(4 + (((_n) & 0x3) << 3), ((_n) & 0x3) << 3)
+
+#define REG_CDM_LRO_EN(_n)		(CDM_BASE(_n) + 0x80)
+#define LRO_RXQ_EN_MASK			GENMASK(7, 0)
+
+#define REG_CDM_LRO_LIMIT(_n)		(CDM_BASE(_n) + 0x84)
+#define CDM_LRO_AGG_NUM_MASK		GENMASK(23, 16)
+#define CDM_LRO_AGG_SIZE_MASK		GENMASK(15, 0)
+
+#define REG_CDM_LRO_AGE_TIME(_n)	(CDM_BASE(_n) + 0x88)
+#define CDM_LRO_AGE_TIME_MASK		GENMASK(31, 16)
+#define CDM_LRO_AGG_TIME_MASK		GENMASK(15, 0)
+
 #define REG_GDM_FWD_CFG(_n)		GDM_BASE(_n)
 #define GDM_DROP_CRC_ERR		BIT(23)
 #define GDM_IP4_CKSUM			BIT(22)
@@ -896,9 +913,15 @@
 #define QDMA_ETH_RXMSG_SPORT_MASK	GENMASK(25, 21)
 #define QDMA_ETH_RXMSG_CRSN_MASK	GENMASK(20, 16)
 #define QDMA_ETH_RXMSG_PPE_ENTRY_MASK	GENMASK(15, 0)
+/* RX MSG2 */
+#define QDMA_ETH_RXMSG_AGG_COUNT_MASK	GENMASK(31, 24)
+#define QDMA_ETH_RXMSG_L2_LEN_MASK	GENMASK(6, 0)
+/* RX MSG3 */
+#define QDMA_ETH_RXMSG_AGG_LEN_MASK	GENMASK(31, 16)
+#define QDMA_ETH_RXMSG_TCP_WIN_MASK	GENMASK(15, 0)
 
 struct airoha_qdma_desc {
-	__le32 rsv;
+	__le32 tcp_ts_reply;
 	__le32 ctrl;
 	__le32 addr;
 	__le32 data;

---
base-commit: 2c7e4a2663a1ab5a740c59c31991579b6b865a26
change-id: 20250610-airoha-eth-lro-e5fcd15fcc91

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-10  9:12 [PATCH net-next] net: airoha: Add TCP LRO support Lorenzo Bianconi
@ 2025-06-10  9:34 ` Eric Dumazet
  2025-06-10 13:39   ` Lorenzo Bianconi
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2025-06-10  9:34 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Andrew Lunn, David S. Miller, Jakub Kicinski, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

On Tue, Jun 10, 2025 at 2:12 AM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
>
> EN7581 SoC supports TCP hw Large Receive Offload (LRO) for 8 hw queues.
> Introduce TCP LRO support to airoha_eth driver for RX queues 24-31.
> In order to support hw TCP LRO, increase page_pool order to 5 for RX
> queues 24-31.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  drivers/net/ethernet/airoha/airoha_eth.c  | 191 +++++++++++++++++++++++++++---
>  drivers/net/ethernet/airoha/airoha_eth.h  |  10 ++
>  drivers/net/ethernet/airoha/airoha_regs.h |  25 +++-
>  3 files changed, 210 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
> index a7ec609d64dee9c8e901c7eb650bb3fe144ee00a..9378ca384fe2025a40cc528714859dd59300fbcd 100644
> --- a/drivers/net/ethernet/airoha/airoha_eth.c
> +++ b/drivers/net/ethernet/airoha/airoha_eth.c
> @@ -12,6 +12,7 @@
>  #include <net/dst_metadata.h>
>  #include <net/page_pool/helpers.h>
>  #include <net/pkt_cls.h>
> +#include <net/tcp.h>
>  #include <uapi/linux/ppp_defs.h>
>
>  #include "airoha_regs.h"
> @@ -439,6 +440,40 @@ static void airoha_fe_crsn_qsel_init(struct airoha_eth *eth)
>                                  CDM_CRSN_QSEL_Q1));
>  }
>
> +static void airoha_fe_lro_init_rx_queue(struct airoha_eth *eth, int qdma_id,
> +                                       int lro_queue_index, int qid,
> +                                       int nbuf, int buf_size)
> +{
> +       airoha_fe_rmw(eth, REG_CDM_LRO_LIMIT(qdma_id),
> +                     CDM_LRO_AGG_NUM_MASK | CDM_LRO_AGG_SIZE_MASK,
> +                     FIELD_PREP(CDM_LRO_AGG_NUM_MASK, nbuf) |
> +                     FIELD_PREP(CDM_LRO_AGG_SIZE_MASK, buf_size));
> +       airoha_fe_rmw(eth, REG_CDM_LRO_AGE_TIME(qdma_id),
> +                     CDM_LRO_AGE_TIME_MASK | CDM_LRO_AGG_TIME_MASK,
> +                     FIELD_PREP(CDM_LRO_AGE_TIME_MASK,
> +                                AIROHA_RXQ_LRO_MAX_AGE_TIME) |
> +                     FIELD_PREP(CDM_LRO_AGG_TIME_MASK,
> +                                AIROHA_RXQ_LRO_MAX_AGG_TIME));
> +       airoha_fe_rmw(eth, REG_CDM_LRO_RXQ(qdma_id, lro_queue_index),
> +                     LRO_RXQ_MASK(lro_queue_index),
> +                     qid << __ffs(LRO_RXQ_MASK(lro_queue_index)));
> +       airoha_fe_set(eth, REG_CDM_LRO_EN(qdma_id), BIT(lro_queue_index));
> +}
> +
> +static void airoha_fe_lro_disable(struct airoha_eth *eth, int qdma_id)
> +{
> +       int i;
> +
> +       airoha_fe_clear(eth, REG_CDM_LRO_LIMIT(qdma_id),
> +                       CDM_LRO_AGG_NUM_MASK | CDM_LRO_AGG_SIZE_MASK);
> +       airoha_fe_clear(eth, REG_CDM_LRO_AGE_TIME(qdma_id),
> +                       CDM_LRO_AGE_TIME_MASK | CDM_LRO_AGG_TIME_MASK);
> +       airoha_fe_clear(eth, REG_CDM_LRO_EN(qdma_id), LRO_RXQ_EN_MASK);
> +       for (i = 0; i < AIROHA_MAX_NUM_LRO_QUEUES; i++)
> +               airoha_fe_clear(eth, REG_CDM_LRO_RXQ(qdma_id, i),
> +                               LRO_RXQ_MASK(i));
> +}
> +
>  static int airoha_fe_init(struct airoha_eth *eth)
>  {
>         airoha_fe_maccr_init(eth);
> @@ -618,9 +653,87 @@ static int airoha_qdma_get_gdm_port(struct airoha_eth *eth,
>         return port >= ARRAY_SIZE(eth->ports) ? -EINVAL : port;
>  }
>
> +static bool airoha_qdma_is_lro_rx_queue(struct airoha_queue *q,
> +                                       struct airoha_qdma *qdma)
> +{
> +       int qid = q - &qdma->q_rx[0];
> +
> +       /* EN7581 SoC supports at most 8 LRO rx queues */
> +       BUILD_BUG_ON(hweight32(AIROHA_RXQ_LRO_EN_MASK) >
> +                    AIROHA_MAX_NUM_LRO_QUEUES);
> +
> +       return !!(AIROHA_RXQ_LRO_EN_MASK & BIT(qid));
> +}
> +
> +static int airoha_qdma_lro_rx_process(struct airoha_queue *q,
> +                                     struct airoha_qdma_desc *desc)
> +{
> +       u32 msg1 = le32_to_cpu(desc->msg1), msg2 = le32_to_cpu(desc->msg2);
> +       u32 th_off, tcp_ack_seq, msg3 = le32_to_cpu(desc->msg3);
> +       bool ipv4 = FIELD_GET(QDMA_ETH_RXMSG_IP4_MASK, msg1);
> +       bool ipv6 = FIELD_GET(QDMA_ETH_RXMSG_IP6_MASK, msg1);
> +       struct sk_buff *skb = q->skb;
> +       u16 tcp_win, l2_len;
> +       struct tcphdr *th;
> +
> +       if (FIELD_GET(QDMA_ETH_RXMSG_AGG_COUNT_MASK, msg2) <= 1)
> +               return 0;
> +
> +       if (!ipv4 && !ipv6)
> +               return -EOPNOTSUPP;
> +
> +       l2_len = FIELD_GET(QDMA_ETH_RXMSG_L2_LEN_MASK, msg2);
> +       if (ipv4) {
> +               u16 agg_len = FIELD_GET(QDMA_ETH_RXMSG_AGG_LEN_MASK, msg3);
> +               struct iphdr *iph = (struct iphdr *)(skb->data + l2_len);
> +
> +               if (iph->protocol != IPPROTO_TCP)
> +                       return -EOPNOTSUPP;
> +
> +               iph->tot_len = cpu_to_be16(agg_len);
> +               iph->check = 0;
> +               iph->check = ip_fast_csum((void *)iph, iph->ihl);
> +               th_off = l2_len + (iph->ihl << 2);
> +       } else {
> +               struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + l2_len);
> +               u32 len, desc_ctrl = le32_to_cpu(desc->ctrl);
> +
> +               if (ip6h->nexthdr != NEXTHDR_TCP)
> +                       return -EOPNOTSUPP;
> +
> +               len = FIELD_GET(QDMA_DESC_LEN_MASK, desc_ctrl);
> +               ip6h->payload_len = cpu_to_be16(len - l2_len - sizeof(*ip6h));
> +               th_off = l2_len + sizeof(*ip6h);
> +       }
> +
> +       tcp_win = FIELD_GET(QDMA_ETH_RXMSG_TCP_WIN_MASK, msg3);
> +       tcp_ack_seq = le32_to_cpu(desc->data);
> +
> +       th = (struct tcphdr *)(skb->data + th_off);
> +       th->ack_seq = cpu_to_be32(tcp_ack_seq);
> +       th->window = cpu_to_be16(tcp_win);
> +
> +       /* check tcp timestamp option */
> +       if (th->doff == sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) {
> +               __be32 *topt = (__be32 *)(th + 1);
> +
> +               if (*topt == cpu_to_be32((TCPOPT_NOP << 24) |
> +                                        (TCPOPT_NOP << 16) |
> +                                        (TCPOPT_TIMESTAMP << 8) |
> +                                        TCPOLEN_TIMESTAMP)) {
> +                       u32 tcp_ts_reply = le32_to_cpu(desc->tcp_ts_reply);
> +
> +                       put_unaligned_be32(tcp_ts_reply, topt + 2);
> +               }
> +       }
> +
> +       return 0;
> +}
> +
>  static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
>  {
>         enum dma_data_direction dir = page_pool_get_dma_dir(q->page_pool);
> +       bool lro_queue = airoha_qdma_is_lro_rx_queue(q, q->qdma);
>         struct airoha_qdma *qdma = q->qdma;
>         struct airoha_eth *eth = qdma->eth;
>         int qid = q - &qdma->q_rx[0];
> @@ -663,9 +776,14 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
>                         __skb_put(q->skb, len);
>                         skb_mark_for_recycle(q->skb);
>                         q->skb->dev = port->dev;
> -                       q->skb->protocol = eth_type_trans(q->skb, port->dev);
>                         q->skb->ip_summed = CHECKSUM_UNNECESSARY;
>                         skb_record_rx_queue(q->skb, qid);
> +
> +                       if (lro_queue && (port->dev->features & NETIF_F_LRO) &&
> +                           airoha_qdma_lro_rx_process(q, desc) < 0)
> +                               goto free_frag;
> +
> +                       q->skb->protocol = eth_type_trans(q->skb, port->dev);
>                 } else { /* scattered frame */
>                         struct skb_shared_info *shinfo = skb_shinfo(q->skb);
>                         int nr_frags = shinfo->nr_frags;
> @@ -751,14 +869,16 @@ static int airoha_qdma_rx_napi_poll(struct napi_struct *napi, int budget)
>  }
>
>  static int airoha_qdma_init_rx_queue(struct airoha_queue *q,
> -                                    struct airoha_qdma *qdma, int ndesc)
> +                                    struct airoha_qdma *qdma,
> +                                    int ndesc, bool lro_queue)
>  {
> +       int pp_order = lro_queue ? 5 : 0;
>         const struct page_pool_params pp_params = {
> -               .order = 0,
> -               .pool_size = 256,
> +               .order = pp_order,
> +               .pool_size = 256 >> pp_order,
>                 .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
>                 .dma_dir = DMA_FROM_DEVICE,
> -               .max_len = PAGE_SIZE,
> +               .max_len = PAGE_SIZE << pp_order,
>                 .nid = NUMA_NO_NODE,
>                 .dev = qdma->eth->dev,
>                 .napi = &q->napi,
> @@ -767,7 +887,7 @@ static int airoha_qdma_init_rx_queue(struct airoha_queue *q,
>         int qid = q - &qdma->q_rx[0], thr;
>         dma_addr_t dma_addr;
>
> -       q->buf_size = PAGE_SIZE / 2;
> +       q->buf_size = pp_params.max_len / (2 * (1 + lro_queue));

Tell us more... It seems small LRO packets will consume a lot of
space, incurring a small skb->len/skb->truesize ratio, and bad TCP WAN
performance.

And order-5 pages are unlikely to be available in the long run anyway.

LRO support would only make sense if the NIC is able to use multiple
order-0 pages to store the payload.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-10  9:34 ` Eric Dumazet
@ 2025-06-10 13:39   ` Lorenzo Bianconi
  2025-06-12  0:36     ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: Lorenzo Bianconi @ 2025-06-10 13:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andrew Lunn, David S. Miller, Jakub Kicinski, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

[-- Attachment #1: Type: text/plain, Size: 1705 bytes --]

On Jun 10, Eric Dumazet wrote:
> On Tue, Jun 10, 2025 at 2:12 AM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> >
[...]
> > @@ -767,7 +887,7 @@ static int airoha_qdma_init_rx_queue(struct airoha_queue *q,
> >         int qid = q - &qdma->q_rx[0], thr;
> >         dma_addr_t dma_addr;
> >
> > -       q->buf_size = PAGE_SIZE / 2;
> > +       q->buf_size = pp_params.max_len / (2 * (1 + lro_queue));
> 
> Tell us more... It seems small LRO packets will consume a lot of
> space, incurring a small skb->len/skb->truesize ratio, and bad TCP WAN
> performance.

I think the main idea is forward to hw LRO queues (queues 24-31 in this
case) just specific protocols with mostly big packets but I completely
agree we have an issue for small packets. One possible approach would be
to define a threshold (e.g. 256B) and allocate a buffer or page from the
page allocator for small packets (something similar to what mt7601u driver
is doing[0]).  What do you think?

> 
> And order-5 pages are unlikely to be available in the long run anyway.

I agree. I guess we can reduce the order to ~ 2 (something similar to
mtk_eth_soc hw LRO implementation [1]).

> 
> LRO support would only make sense if the NIC is able to use multiple
> order-0 pages to store the payload.

The hw supports splitting big packets over multiple order-0 pages if we
increase the MTU over one page size, but according to my understanding
hw LRO requires contiguous memory to work.

Regards,
Lorenzo

[0] https://github.com/torvalds/linux/blob/master/drivers/net/wireless/mediatek/mt7601u/dma.c#L146
[1] https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mediatek/mtk_eth_soc.c#L2258

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-10 13:39   ` Lorenzo Bianconi
@ 2025-06-12  0:36     ` Jakub Kicinski
  2025-06-12 21:02       ` Lorenzo Bianconi
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2025-06-12  0:36 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

On Tue, 10 Jun 2025 15:39:34 +0200 Lorenzo Bianconi wrote:
> > Tell us more... It seems small LRO packets will consume a lot of
> > space, incurring a small skb->len/skb->truesize ratio, and bad TCP WAN
> > performance.  
> 
> I think the main idea is forward to hw LRO queues (queues 24-31 in this
> case) just specific protocols with mostly big packets but I completely
> agree we have an issue for small packets. One possible approach would be
> to define a threshold (e.g. 256B) and allocate a buffer or page from the
> page allocator for small packets (something similar to what mt7601u driver
> is doing[0]).  What do you think?

I'm not Eric but FWIW 256B is not going to help much. It's best to keep
the len / truesize ratio above 50%, so with 32k buffers we're talking
about copying multiple frames.

> > And order-5 pages are unlikely to be available in the long run anyway.  
> 
> I agree. I guess we can reduce the order to ~ 2 (something similar to
> mtk_eth_soc hw LRO implementation [1]).

Would be good to test. SW GRO can "re-GRO" the partially coalesced
packets, so it's going to be diminishing returns.

> > LRO support would only make sense if the NIC is able to use multiple
> > order-0 pages to store the payload.  
> 
> The hw supports splitting big packets over multiple order-0 pages if we
> increase the MTU over one page size, but according to my understanding
> hw LRO requires contiguous memory to work.

Hm, you're already passing buffers smaller than normal TSO so
presumably having a smaller buffers will break the sessions more 
often but still work?

You mean want to steal some of the code from:
https://lore.kernel.org/all/20250421222827.283737-1-kuba@kernel.org/
and make the buffer size user-configurable. But not a requirement.
Let's at least get some understanding of the perf benefit of 
32k vs 16k or 8k
-- 
pw-bot: cr


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-12  0:36     ` Jakub Kicinski
@ 2025-06-12 21:02       ` Lorenzo Bianconi
  2025-06-12 22:57         ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: Lorenzo Bianconi @ 2025-06-12 21:02 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

[-- Attachment #1: Type: text/plain, Size: 2370 bytes --]

> On Tue, 10 Jun 2025 15:39:34 +0200 Lorenzo Bianconi wrote:
> > > Tell us more... It seems small LRO packets will consume a lot of
> > > space, incurring a small skb->len/skb->truesize ratio, and bad TCP WAN
> > > performance.  
> > 
> > I think the main idea is forward to hw LRO queues (queues 24-31 in this
> > case) just specific protocols with mostly big packets but I completely
> > agree we have an issue for small packets. One possible approach would be
> > to define a threshold (e.g. 256B) and allocate a buffer or page from the
> > page allocator for small packets (something similar to what mt7601u driver
> > is doing[0]).  What do you think?
> 
> I'm not Eric but FWIW 256B is not going to help much. It's best to keep
> the len / truesize ratio above 50%, so with 32k buffers we're talking
> about copying multiple frames.

Hi Jakub,

what I mean here is reallocate the skb if the true size is small (e.g. below
256B) in order to avoid consuming the high order page from the page_pool. Maybe
we can avoid it if reducing the page order to 2 for LRO queues provide
comparable results.

> 
> > > And order-5 pages are unlikely to be available in the long run anyway.  
> > 
> > I agree. I guess we can reduce the order to ~ 2 (something similar to
> > mtk_eth_soc hw LRO implementation [1]).
> 
> Would be good to test. SW GRO can "re-GRO" the partially coalesced
> packets, so it's going to be diminishing returns.

ack, I will do.

> 
> > > LRO support would only make sense if the NIC is able to use multiple
> > > order-0 pages to store the payload.  
> > 
> > The hw supports splitting big packets over multiple order-0 pages if we
> > increase the MTU over one page size, but according to my understanding
> > hw LRO requires contiguous memory to work.
> 
> Hm, you're already passing buffers smaller than normal TSO so
> presumably having a smaller buffers will break the sessions more 
> often but still work?

I will test it.

> 
> You mean want to steal some of the code from:
> https://lore.kernel.org/all/20250421222827.283737-1-kuba@kernel.org/

ack, I will take a look.

> and make the buffer size user-configurable. But not a requirement.
> Let's at least get some understanding of the perf benefit of 
> 32k vs 16k or 8k

ack, I will do.

Regards,
Lorenzo

> -- 
> pw-bot: cr

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-12 21:02       ` Lorenzo Bianconi
@ 2025-06-12 22:57         ` Jakub Kicinski
  2025-06-16 12:51           ` Lorenzo Bianconi
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2025-06-12 22:57 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

On Thu, 12 Jun 2025 23:02:30 +0200 Lorenzo Bianconi wrote:
> > I'm not Eric but FWIW 256B is not going to help much. It's best to keep
> > the len / truesize ratio above 50%, so with 32k buffers we're talking
> > about copying multiple frames.  
> 
> what I mean here is reallocate the skb if the true size is small (e.g. below
> 256B) in order to avoid consuming the high order page from the page_pool. Maybe
> we can avoid it if reducing the page order to 2 for LRO queues provide
> comparable results.

Hm, truesize is the buffer size, right? If the driver allocated n bytes
of memory for packets it sent up the stack, the truesizes of the skbs
it generated must add up to approximately n bytes.

So if the HW places one aggregation session per buffer, and the buffer
is 32kB -- to avoid mem use ratio < 25% you'd need to copy all sessions
smaller than 8kB?

If I'm not making sense - just ignore, I haven't looked at the rest of
the driver :)


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-12 22:57         ` Jakub Kicinski
@ 2025-06-16 12:51           ` Lorenzo Bianconi
  2025-10-31  8:42             ` Lorenzo Bianconi
  0 siblings, 1 reply; 11+ messages in thread
From: Lorenzo Bianconi @ 2025-06-16 12:51 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

[-- Attachment #1: Type: text/plain, Size: 1438 bytes --]

> On Thu, 12 Jun 2025 23:02:30 +0200 Lorenzo Bianconi wrote:
> > > I'm not Eric but FWIW 256B is not going to help much. It's best to keep
> > > the len / truesize ratio above 50%, so with 32k buffers we're talking
> > > about copying multiple frames.  
> > 
> > what I mean here is reallocate the skb if the true size is small (e.g. below
> > 256B) in order to avoid consuming the high order page from the page_pool. Maybe
> > we can avoid it if reducing the page order to 2 for LRO queues provide
> > comparable results.
> 
> Hm, truesize is the buffer size, right? If the driver allocated n bytes
> of memory for packets it sent up the stack, the truesizes of the skbs
> it generated must add up to approximately n bytes.

With 'truesize' I am referring to the real data size contained in the x-order
page returned by the hw. If this size is small, I was thinking to just allocate
a skb for it, copy the data from the x-order page into it and re-insert the
x-order page into the page_pool running page_pool_put_full_page().
Let me do some tests with order-2 page to see if the GRO can compensate the
reduced page size.

Regards,
Lorenzo

> 
> So if the HW places one aggregation session per buffer, and the buffer
> is 32kB -- to avoid mem use ratio < 25% you'd need to copy all sessions
> smaller than 8kB?
> 
> If I'm not making sense - just ignore, I haven't looked at the rest of
> the driver :)
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-06-16 12:51           ` Lorenzo Bianconi
@ 2025-10-31  8:42             ` Lorenzo Bianconi
  2025-10-31 18:16               ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: Lorenzo Bianconi @ 2025-10-31  8:42 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

[-- Attachment #1: Type: text/plain, Size: 2089 bytes --]

> > On Thu, 12 Jun 2025 23:02:30 +0200 Lorenzo Bianconi wrote:
> > > > I'm not Eric but FWIW 256B is not going to help much. It's best to keep
> > > > the len / truesize ratio above 50%, so with 32k buffers we're talking
> > > > about copying multiple frames.  
> > > 
> > > what I mean here is reallocate the skb if the true size is small (e.g. below
> > > 256B) in order to avoid consuming the high order page from the page_pool. Maybe
> > > we can avoid it if reducing the page order to 2 for LRO queues provide
> > > comparable results.
> > 
> > Hm, truesize is the buffer size, right? If the driver allocated n bytes
> > of memory for packets it sent up the stack, the truesizes of the skbs
> > it generated must add up to approximately n bytes.
> 
> With 'truesize' I am referring to the real data size contained in the x-order
> page returned by the hw. If this size is small, I was thinking to just allocate
> a skb for it, copy the data from the x-order page into it and re-insert the
> x-order page into the page_pool running page_pool_put_full_page().
> Let me do some tests with order-2 page to see if the GRO can compensate the
> reduced page size.

Sorry for the late reply about this item.
I carried out some comparison tests between GRO-only and GRO+LRO with order-2
pages [0]. The system is using a 2.5Gbps link. The device is receiving a single TCP
stream. MTU is set to 1500B.

- GRO only:			~1.6Gbps
- GRO+LRO (order-2 pages):	~2.1Gbps

In both cases we can't reach the line-rate. Do you think the difference can justify
the hw LRO support? Thanks in advance.

Regards,
Lorenzo

[0] the hw LRO requires contiguous memory pages to work. I reduced the size to
order-2 from order-5 (original implementation).

> 
> Regards,
> Lorenzo
> 
> > 
> > So if the HW places one aggregation session per buffer, and the buffer
> > is 32kB -- to avoid mem use ratio < 25% you'd need to copy all sessions
> > smaller than 8kB?
> > 
> > If I'm not making sense - just ignore, I haven't looked at the rest of
> > the driver :)
> > 



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-10-31  8:42             ` Lorenzo Bianconi
@ 2025-10-31 18:16               ` Jakub Kicinski
  2025-11-07 13:30                 ` Lorenzo Bianconi
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2025-10-31 18:16 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

On Fri, 31 Oct 2025 09:42:15 +0100 Lorenzo Bianconi wrote:
> > > Hm, truesize is the buffer size, right? If the driver allocated n bytes
> > > of memory for packets it sent up the stack, the truesizes of the skbs
> > > it generated must add up to approximately n bytes.  
> > 
> > With 'truesize' I am referring to the real data size contained in the x-order
> > page returned by the hw. If this size is small, I was thinking to just allocate
> > a skb for it, copy the data from the x-order page into it and re-insert the
> > x-order page into the page_pool running page_pool_put_full_page().
> > Let me do some tests with order-2 page to see if the GRO can compensate the
> > reduced page size.  
> 
> Sorry for the late reply about this item.
> I carried out some comparison tests between GRO-only and GRO+LRO with order-2
> pages [0]. The system is using a 2.5Gbps link. The device is receiving a single TCP
> stream. MTU is set to 1500B.
> 
> - GRO only:			~1.6Gbps
> - GRO+LRO (order-2 pages):	~2.1Gbps
> 
> In both cases we can't reach the line-rate. Do you think the difference can justify
> the hw LRO support? Thanks in advance.
>  
> [0] the hw LRO requires contiguous memory pages to work. I reduced the size to
> order-2 from order-5 (original implementation).

I think we're mostly advising about real world implications of 
the approach rather than nacking. I can't say for sure if potentially
terrible skb->len/skb->truesize ratio will matter for a router
application. Maybe not.

BTW is the device doing header-data split or the LRO frame has headers
and payload in a single buffer?


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-10-31 18:16               ` Jakub Kicinski
@ 2025-11-07 13:30                 ` Lorenzo Bianconi
  2025-11-08  1:11                   ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: Lorenzo Bianconi @ 2025-11-07 13:30 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

[-- Attachment #1: Type: text/plain, Size: 2261 bytes --]

> On Fri, 31 Oct 2025 09:42:15 +0100 Lorenzo Bianconi wrote:
> > > > Hm, truesize is the buffer size, right? If the driver allocated n bytes
> > > > of memory for packets it sent up the stack, the truesizes of the skbs
> > > > it generated must add up to approximately n bytes.  
> > > 
> > > With 'truesize' I am referring to the real data size contained in the x-order
> > > page returned by the hw. If this size is small, I was thinking to just allocate
> > > a skb for it, copy the data from the x-order page into it and re-insert the
> > > x-order page into the page_pool running page_pool_put_full_page().
> > > Let me do some tests with order-2 page to see if the GRO can compensate the
> > > reduced page size.  
> > 
> > Sorry for the late reply about this item.
> > I carried out some comparison tests between GRO-only and GRO+LRO with order-2
> > pages [0]. The system is using a 2.5Gbps link. The device is receiving a single TCP
> > stream. MTU is set to 1500B.
> > 
> > - GRO only:			~1.6Gbps
> > - GRO+LRO (order-2 pages):	~2.1Gbps
> > 
> > In both cases we can't reach the line-rate. Do you think the difference can justify
> > the hw LRO support? Thanks in advance.
> >  
> > [0] the hw LRO requires contiguous memory pages to work. I reduced the size to
> > order-2 from order-5 (original implementation).
> 
> I think we're mostly advising about real world implications of 
> the approach rather than nacking. I can't say for sure if potentially
> terrible skb->len/skb->truesize ratio will matter for a router
> application. Maybe not.
> 
> BTW is the device doing header-data split or the LRO frame has headers
> and payload in a single buffer?

According to my understanding the hw LRO is limited to a single order-x page
containing both the headers and the payload (the hw LRO module is not capable
of splitting the aggregated TCP segment over multiple pages).
What we could do is disable hw LRO by default and feed hw rx queues with
order-0 pages (current implementation). If the user enables hw LRO, we will
free order-0 pages linked to the rx DMA descriptors and allocate order-x pages
(e.g. order-2) for hw LRO queues. Disabling hw LRO will switch back to order-0
pages.

Regards,
Lorenzo

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next] net: airoha: Add TCP LRO support
  2025-11-07 13:30                 ` Lorenzo Bianconi
@ 2025-11-08  1:11                   ` Jakub Kicinski
  0 siblings, 0 replies; 11+ messages in thread
From: Jakub Kicinski @ 2025-11-08  1:11 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Eric Dumazet, Andrew Lunn, David S. Miller, Paolo Abeni,
	linux-arm-kernel, linux-mediatek, netdev

On Fri, 7 Nov 2025 14:30:02 +0100 Lorenzo Bianconi wrote:
> > On Fri, 31 Oct 2025 09:42:15 +0100 Lorenzo Bianconi wrote:  
> > > Sorry for the late reply about this item.
> > > I carried out some comparison tests between GRO-only and GRO+LRO with order-2
> > > pages [0]. The system is using a 2.5Gbps link. The device is receiving a single TCP
> > > stream. MTU is set to 1500B.
> > > 
> > > - GRO only:			~1.6Gbps
> > > - GRO+LRO (order-2 pages):	~2.1Gbps
> > > 
> > > In both cases we can't reach the line-rate. Do you think the difference can justify
> > > the hw LRO support? Thanks in advance.
> > >  
> > > [0] the hw LRO requires contiguous memory pages to work. I reduced the size to
> > > order-2 from order-5 (original implementation).  
> > 
> > I think we're mostly advising about real world implications of 
> > the approach rather than nacking. I can't say for sure if potentially
> > terrible skb->len/skb->truesize ratio will matter for a router
> > application. Maybe not.
> > 
> > BTW is the device doing header-data split or the LRO frame has headers
> > and payload in a single buffer?  
> 
> According to my understanding the hw LRO is limited to a single order-x page
> containing both the headers and the payload (the hw LRO module is not capable
> of splitting the aggregated TCP segment over multiple pages).
> What we could do is disable hw LRO by default and feed hw rx queues with
> order-0 pages (current implementation). If the user enables hw LRO, we will
> free order-0 pages linked to the rx DMA descriptors and allocate order-x pages
> (e.g. order-2) for hw LRO queues. Disabling hw LRO will switch back to order-0
> pages.

Are all packets LRO-sized when it's enabled? What you describe is
definitely good, bur I was wondering if we can also use rx-buf-len
to let user select the size / order of the LRO buffers.

But the definition of rx-buf-len is that it's for _all_ rx buffers
on given queue. We'd probably need a new param if the pages are
just for lro


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-11-08  1:11 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-06-10  9:12 [PATCH net-next] net: airoha: Add TCP LRO support Lorenzo Bianconi
2025-06-10  9:34 ` Eric Dumazet
2025-06-10 13:39   ` Lorenzo Bianconi
2025-06-12  0:36     ` Jakub Kicinski
2025-06-12 21:02       ` Lorenzo Bianconi
2025-06-12 22:57         ` Jakub Kicinski
2025-06-16 12:51           ` Lorenzo Bianconi
2025-10-31  8:42             ` Lorenzo Bianconi
2025-10-31 18:16               ` Jakub Kicinski
2025-11-07 13:30                 ` Lorenzo Bianconi
2025-11-08  1:11                   ` Jakub Kicinski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).