netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* cxgb3: Replace LRO with GRO
  2009-01-13  9:26 [1/2] e1000e: Invoke VLAN GRO handler Herbert Xu
@ 2009-01-15  6:59 ` Herbert Xu
  0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-15  6:59 UTC (permalink / raw)
  To: David S. Miller, Divy Le Ray, netdev

Hi Dave:

This one is slightly different because the driver has chosen to
expose to user-space per-queue switches to control LRO.  I've
kept those for now.

cxgb3: Replace LRO with GRO

This patch makes cxgb3 invoke the GRO hooks instead of LRO.  As
GRO has a compatible external interface to LRO this is a very
straightforward replacement.

I've kept the ioctl controls for per-queue LRO switches.  However,
we should not encourage anyone to use these.

Because of that, I've also kept the skb construction code in
cxgb3.  Hopefully we can phase out those per-queue switches
and then kill this too.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index ee37540..d6bf215 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2407,7 +2407,6 @@ config CHELSIO_T3
 	tristate "Chelsio Communications T3 10Gb Ethernet support"
 	depends on CHELSIO_T3_DEPENDS
 	select FW_LOADER
-	select INET_LRO
 	help
 	  This driver supports Chelsio T3-based gigabit and 10Gb Ethernet
 	  adapters.
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 5b346f9..41b87a1 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -42,7 +42,6 @@
 #include <linux/cache.h>
 #include <linux/mutex.h>
 #include <linux/bitops.h>
-#include <linux/inet_lro.h>
 #include "t3cdev.h"
 #include <asm/io.h>
 
@@ -173,15 +172,11 @@ enum {				/* per port SGE statistics */
 	SGE_PSTAT_TX_CSUM,	/* # of TX checksum offloads */
 	SGE_PSTAT_VLANEX,	/* # of VLAN tag extractions */
 	SGE_PSTAT_VLANINS,	/* # of VLAN tag insertions */
-	SGE_PSTAT_LRO_AGGR,	/* # of page chunks added to LRO sessions */
-	SGE_PSTAT_LRO_FLUSHED,	/* # of flushed LRO sessions */
-	SGE_PSTAT_LRO_NO_DESC,	/* # of overflown LRO sessions */
 
 	SGE_PSTAT_MAX		/* must be last */
 };
 
-#define T3_MAX_LRO_SES 8
-#define T3_MAX_LRO_MAX_PKTS 64
+struct napi_gro_fraginfo;
 
 struct sge_qset {		/* an SGE queue set */
 	struct adapter *adap;
@@ -189,12 +184,8 @@ struct sge_qset {		/* an SGE queue set */
 	struct sge_rspq rspq;
 	struct sge_fl fl[SGE_RXQ_PER_SET];
 	struct sge_txq txq[SGE_TXQ_PER_SET];
-	struct net_lro_mgr lro_mgr;
-	struct net_lro_desc lro_desc[T3_MAX_LRO_SES];
-	struct skb_frag_struct *lro_frag_tbl;
-	int lro_nfrags;
+	struct napi_gro_fraginfo lro_frag_tbl;
 	int lro_enabled;
-	int lro_frag_len;
 	void *lro_va;
 	struct net_device *netdev;
 	struct netdev_queue *tx_q;	/* associated netdev TX queue */
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 2847f94..e94430d 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -508,19 +508,9 @@ static void set_qset_lro(struct net_device *dev, int qset_idx, int val)
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
-	int i, lro_on = 1;
 
 	adapter->params.sge.qset[qset_idx].lro = !!val;
 	adapter->sge.qs[qset_idx].lro_enabled = !!val;
-
-	/* let ethtool report LRO on only if all queues are LRO enabled */
-	for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; ++i)
-		lro_on &= adapter->params.sge.qset[i].lro;
-
-	if (lro_on)
-		dev->features |= NETIF_F_LRO;
-	else
-		dev->features &= ~NETIF_F_LRO;
 }
 
 /**
@@ -1433,9 +1423,9 @@ static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
 	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS);
 	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM);
 	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD);
-	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_AGGR);
-	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_FLUSHED);
-	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_NO_DESC);
+	*data++ = 0;
+	*data++ = 0;
+	*data++ = 0;
 	*data++ = s->rx_cong_drops;
 
 	*data++ = s->num_toggled;
@@ -1824,25 +1814,6 @@ static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 	memset(&wol->sopass, 0, sizeof(wol->sopass));
 }
 
-static int cxgb3_set_flags(struct net_device *dev, u32 data)
-{
-	struct port_info *pi = netdev_priv(dev);
-	int i;
-
-	if (data & ETH_FLAG_LRO) {
-		if (!pi->rx_csum_offload)
-			return -EINVAL;
-
-		for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
-			set_qset_lro(dev, i, 1);
-
-	} else
-		for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
-			set_qset_lro(dev, i, 0);
-
-	return 0;
-}
-
 static const struct ethtool_ops cxgb_ethtool_ops = {
 	.get_settings = get_settings,
 	.set_settings = set_settings,
@@ -1872,8 +1843,6 @@ static const struct ethtool_ops cxgb_ethtool_ops = {
 	.get_regs = get_regs,
 	.get_wol = get_wol,
 	.set_tso = ethtool_op_set_tso,
-	.get_flags = ethtool_op_get_flags,
-	.set_flags = cxgb3_set_flags,
 };
 
 static int in_range(int val, int lo, int hi)
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 6c641a8..893783b 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -585,8 +585,7 @@ static void t3_reset_qset(struct sge_qset *q)
 	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
 	q->txq_stopped = 0;
 	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
-	kfree(q->lro_frag_tbl);
-	q->lro_nfrags = q->lro_frag_len = 0;
+	q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0;
 }
 
 
@@ -1945,10 +1944,8 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
 		qs->port_stats[SGE_PSTAT_VLANEX]++;
 		if (likely(grp))
 			if (lro)
-				lro_vlan_hwaccel_receive_skb(&qs->lro_mgr, skb,
-							     grp,
-							     ntohs(p->vlan),
-							     p);
+				vlan_gro_receive(&qs->napi, grp,
+						 ntohs(p->vlan), skb);
 			else {
 				if (unlikely(pi->iscsi_ipv4addr &&
 				    is_arp(skb))) {
@@ -1965,7 +1962,7 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
 			dev_kfree_skb_any(skb);
 	} else if (rq->polling) {
 		if (lro)
-			lro_receive_skb(&qs->lro_mgr, skb, p);
+			napi_gro_receive(&qs->napi, skb);
 		else {
 			if (unlikely(pi->iscsi_ipv4addr && is_arp(skb)))
 				cxgb3_arp_process(adap, skb);
@@ -1981,59 +1978,6 @@ static inline int is_eth_tcp(u32 rss)
 }
 
 /**
- *	lro_frame_ok - check if an ingress packet is eligible for LRO
- *	@p: the CPL header of the packet
- *
- *	Returns true if a received packet is eligible for LRO.
- *	The following conditions must be true:
- *	- packet is TCP/IP Ethernet II (checked elsewhere)
- *	- not an IP fragment
- *	- no IP options
- *	- TCP/IP checksums are correct
- *	- the packet is for this host
- */
-static inline int lro_frame_ok(const struct cpl_rx_pkt *p)
-{
-	const struct ethhdr *eh = (struct ethhdr *)(p + 1);
-	const struct iphdr *ih = (struct iphdr *)(eh + 1);
-
-	return (*((u8 *)p + 1) & 0x90) == 0x10 && p->csum == htons(0xffff) &&
-		eh->h_proto == htons(ETH_P_IP) && ih->ihl == (sizeof(*ih) >> 2);
-}
-
-static int t3_get_lro_header(void **eh,  void **iph, void **tcph,
-			     u64 *hdr_flags, void *priv)
-{
-	const struct cpl_rx_pkt *cpl = priv;
-
-	if (!lro_frame_ok(cpl))
-		return -1;
-
-	*eh = (struct ethhdr *)(cpl + 1);
-	*iph = (struct iphdr *)((struct ethhdr *)*eh + 1);
-	*tcph = (struct tcphdr *)((struct iphdr *)*iph + 1);
-
-	*hdr_flags = LRO_IPV4 | LRO_TCP;
-	return 0;
-}
-
-static int t3_get_skb_header(struct sk_buff *skb,
-			      void **iph, void **tcph, u64 *hdr_flags,
-			      void *priv)
-{
-	void *eh;
-
-	return t3_get_lro_header(&eh, iph, tcph, hdr_flags, priv);
-}
-
-static int t3_get_frag_header(struct skb_frag_struct *frag, void **eh,
-			      void **iph, void **tcph, u64 *hdr_flags,
-			      void *priv)
-{
-	return t3_get_lro_header(eh, iph, tcph, hdr_flags, priv);
-}
-
-/**
  *	lro_add_page - add a page chunk to an LRO session
  *	@adap: the adapter
  *	@qs: the associated queue set
@@ -2049,8 +1993,9 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 {
 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
 	struct cpl_rx_pkt *cpl;
-	struct skb_frag_struct *rx_frag = qs->lro_frag_tbl;
-	int nr_frags = qs->lro_nfrags, frag_len = qs->lro_frag_len;
+	struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags;
+	int nr_frags = qs->lro_frag_tbl.nr_frags;
+	int frag_len = qs->lro_frag_tbl.len;
 	int offset = 0;
 
 	if (!nr_frags) {
@@ -2069,13 +2014,13 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 	rx_frag->page_offset = sd->pg_chunk.offset + offset;
 	rx_frag->size = len;
 	frag_len += len;
-	qs->lro_nfrags++;
-	qs->lro_frag_len = frag_len;
+	qs->lro_frag_tbl.nr_frags++;
+	qs->lro_frag_tbl.len = frag_len;
 
 	if (!complete)
 		return;
 
-	qs->lro_nfrags = qs->lro_frag_len = 0;
+	qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY;
 	cpl = qs->lro_va;
 
 	if (unlikely(cpl->vlan_valid)) {
@@ -2084,35 +2029,15 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 		struct vlan_group *grp = pi->vlan_grp;
 
 		if (likely(grp != NULL)) {
-			lro_vlan_hwaccel_receive_frags(&qs->lro_mgr,
-						       qs->lro_frag_tbl,
-						       frag_len, frag_len,
-						       grp, ntohs(cpl->vlan),
-						       cpl, 0);
-			return;
+			vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan),
+				       &qs->lro_frag_tbl);
+			goto out;
 		}
 	}
-	lro_receive_frags(&qs->lro_mgr, qs->lro_frag_tbl,
-			  frag_len, frag_len, cpl, 0);
-}
+	napi_gro_frags(&qs->napi, &qs->lro_frag_tbl);
 
-/**
- *	init_lro_mgr - initialize a LRO manager object
- *	@lro_mgr: the LRO manager object
- */
-static void init_lro_mgr(struct sge_qset *qs, struct net_lro_mgr *lro_mgr)
-{
-	lro_mgr->dev = qs->netdev;
-	lro_mgr->features = LRO_F_NAPI;
-	lro_mgr->ip_summed = CHECKSUM_UNNECESSARY;
-	lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
-	lro_mgr->max_desc = T3_MAX_LRO_SES;
-	lro_mgr->lro_arr = qs->lro_desc;
-	lro_mgr->get_frag_header = t3_get_frag_header;
-	lro_mgr->get_skb_header = t3_get_skb_header;
-	lro_mgr->max_aggr = T3_MAX_LRO_MAX_PKTS;
-	if (lro_mgr->max_aggr > MAX_SKB_FRAGS)
-		lro_mgr->max_aggr = MAX_SKB_FRAGS;
+out:
+	qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0;
 }
 
 /**
@@ -2356,10 +2281,6 @@ next_fl:
 	}
 
 	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
-	lro_flush_all(&qs->lro_mgr);
-	qs->port_stats[SGE_PSTAT_LRO_AGGR] = qs->lro_mgr.stats.aggregated;
-	qs->port_stats[SGE_PSTAT_LRO_FLUSHED] = qs->lro_mgr.stats.flushed;
-	qs->port_stats[SGE_PSTAT_LRO_NO_DESC] = qs->lro_mgr.stats.no_desc;
 
 	if (sleeping)
 		check_ring_db(adap, qs, sleeping);
@@ -2906,7 +2827,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 {
 	int i, avail, ret = -ENOMEM;
 	struct sge_qset *q = &adapter->sge.qs[id];
-	struct net_lro_mgr *lro_mgr = &q->lro_mgr;
 
 	init_qset_cntxt(q, id);
 	setup_timer(&q->tx_reclaim_timer, sge_timer_cb, (unsigned long)q);
@@ -2986,10 +2906,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 	q->fl[0].order = FL0_PG_ORDER;
 	q->fl[1].order = FL1_PG_ORDER;
 
-	q->lro_frag_tbl = kcalloc(MAX_FRAME_SIZE / FL1_PG_CHUNK_SIZE + 1,
-				  sizeof(struct skb_frag_struct),
-				  GFP_KERNEL);
-	q->lro_nfrags = q->lro_frag_len = 0;
 	spin_lock_irq(&adapter->sge.reg_lock);
 
 	/* FL threshold comparison uses < */
@@ -3041,8 +2957,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 	q->tx_q = netdevq;
 	t3_update_qset_coalesce(q, p);
 
-	init_lro_mgr(q, lro_mgr);
-
 	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
 			  GFP_KERNEL | __GFP_COMP);
 	if (!avail) {

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
@ 2009-01-15 21:14 Divy Le Ray
  2009-01-15 23:58 ` Herbert Xu
  2009-01-20  1:03 ` David Miller
  0 siblings, 2 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-01-15 21:14 UTC (permalink / raw)
  To: herbert, davem; +Cc: netdev, swise


Hi Herbert,

Your patch looks fine. Early testing shows the same performance
as seen with LRO.

I am resubmitting the original patch with two changes:
- cxgb3_set_flags() removal did not apply.
  commit 47fd23fe8efeea3af4593a8424419df48724eb25 had updated the routine.
- replace the NETIF_F_LRO features flag with NETIF_F_GRO
 
cxgb3: Replace LRO with GRO

This patch makes cxgb3 invoke the GRO hooks instead of LRO.  As
GRO has a compatible external interface to LRO this is a very
straightforward replacement.

I've kept the ioctl controls for per-queue LRO switches.  However,
we should not encourage anyone to use these.

Because of that, I've also kept the skb construction code in
cxgb3.  Hopefully we can phase out those per-queue switches
and then kill this too.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Divy Le Ray <divy@chelsio.com>

---

 drivers/net/Kconfig            |    1 
 drivers/net/cxgb3/adapter.h    |   13 +---
 drivers/net/cxgb3/cxgb3_main.c |   42 +-------------
 drivers/net/cxgb3/sge.c        |  118 +++++-----------------------------------
 4 files changed, 22 insertions(+), 152 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9fe8cb7..9212289 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2408,7 +2408,6 @@ config CHELSIO_T3
 	tristate "Chelsio Communications T3 10Gb Ethernet support"
 	depends on CHELSIO_T3_DEPENDS
 	select FW_LOADER
-	select INET_LRO
 	help
 	  This driver supports Chelsio T3-based gigabit and 10Gb Ethernet
 	  adapters.
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index a89d8cc..f9d39c6 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -42,7 +42,6 @@
 #include <linux/cache.h>
 #include <linux/mutex.h>
 #include <linux/bitops.h>
-#include <linux/inet_lro.h>
 #include "t3cdev.h"
 #include <asm/io.h>
 
@@ -178,15 +177,11 @@ enum {				/* per port SGE statistics */
 	SGE_PSTAT_TX_CSUM,	/* # of TX checksum offloads */
 	SGE_PSTAT_VLANEX,	/* # of VLAN tag extractions */
 	SGE_PSTAT_VLANINS,	/* # of VLAN tag insertions */
-	SGE_PSTAT_LRO_AGGR,	/* # of page chunks added to LRO sessions */
-	SGE_PSTAT_LRO_FLUSHED,	/* # of flushed LRO sessions */
-	SGE_PSTAT_LRO_NO_DESC,	/* # of overflown LRO sessions */
 
 	SGE_PSTAT_MAX		/* must be last */
 };
 
-#define T3_MAX_LRO_SES 8
-#define T3_MAX_LRO_MAX_PKTS 64
+struct napi_gro_fraginfo;
 
 struct sge_qset {		/* an SGE queue set */
 	struct adapter *adap;
@@ -194,12 +189,8 @@ struct sge_qset {		/* an SGE queue set */
 	struct sge_rspq rspq;
 	struct sge_fl fl[SGE_RXQ_PER_SET];
 	struct sge_txq txq[SGE_TXQ_PER_SET];
-	struct net_lro_mgr lro_mgr;
-	struct net_lro_desc lro_desc[T3_MAX_LRO_SES];
-	struct skb_frag_struct *lro_frag_tbl;
-	int lro_nfrags;
+	struct napi_gro_fraginfo lro_frag_tbl;
 	int lro_enabled;
-	int lro_frag_len;
 	void *lro_va;
 	struct net_device *netdev;
 	struct netdev_queue *tx_q;	/* associated netdev TX queue */
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 0089746..f59b9e3 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -508,19 +508,9 @@ static void set_qset_lro(struct net_device *dev, int qset_idx, int val)
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
-	int i, lro_on = 1;
 
 	adapter->params.sge.qset[qset_idx].lro = !!val;
 	adapter->sge.qs[qset_idx].lro_enabled = !!val;
-
-	/* let ethtool report LRO on only if all queues are LRO enabled */
-	for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; ++i)
-		lro_on &= adapter->params.sge.qset[i].lro;
-
-	if (lro_on)
-		dev->features |= NETIF_F_LRO;
-	else
-		dev->features &= ~NETIF_F_LRO;
 }
 
 /**
@@ -1433,9 +1423,9 @@ static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
 	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS);
 	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM);
 	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD);
-	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_AGGR);
-	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_FLUSHED);
-	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_NO_DESC);
+	*data++ = 0;
+	*data++ = 0;
+	*data++ = 0;
 	*data++ = s->rx_cong_drops;
 
 	*data++ = s->num_toggled;
@@ -1826,28 +1816,6 @@ static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 	memset(&wol->sopass, 0, sizeof(wol->sopass));
 }
 
-static int cxgb3_set_flags(struct net_device *dev, u32 data)
-{
-	struct port_info *pi = netdev_priv(dev);
-	int i;
-
-	if (data & ETH_FLAG_LRO) {
-		if (!(pi->rx_offload & T3_RX_CSUM))
-			return -EINVAL;
-
-		pi->rx_offload |= T3_LRO;
-		for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
-			set_qset_lro(dev, i, 1);
-
-	} else {
-		pi->rx_offload &= ~T3_LRO;
-		for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
-			set_qset_lro(dev, i, 0);
-	}
-
-	return 0;
-}
-
 static const struct ethtool_ops cxgb_ethtool_ops = {
 	.get_settings = get_settings,
 	.set_settings = set_settings,
@@ -1877,8 +1845,6 @@ static const struct ethtool_ops cxgb_ethtool_ops = {
 	.get_regs = get_regs,
 	.get_wol = get_wol,
 	.set_tso = ethtool_op_set_tso,
-	.get_flags = ethtool_op_get_flags,
-	.set_flags = cxgb3_set_flags,
 };
 
 static int in_range(int val, int lo, int hi)
@@ -2960,7 +2926,7 @@ static int __devinit init_one(struct pci_dev *pdev,
 		netdev->mem_end = mmio_start + mmio_len - 1;
 		netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 		netdev->features |= NETIF_F_LLTX;
-		netdev->features |= NETIF_F_LRO;
+		netdev->features |= NETIF_F_GRO;
 		if (pci_using_dac)
 			netdev->features |= NETIF_F_HIGHDMA;
 
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 14f9fb3..8299fb5 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -585,8 +585,7 @@ static void t3_reset_qset(struct sge_qset *q)
 	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
 	q->txq_stopped = 0;
 	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
-	kfree(q->lro_frag_tbl);
-	q->lro_nfrags = q->lro_frag_len = 0;
+	q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0;
 }
 
 
@@ -1945,10 +1944,8 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
 		qs->port_stats[SGE_PSTAT_VLANEX]++;
 		if (likely(grp))
 			if (lro)
-				lro_vlan_hwaccel_receive_skb(&qs->lro_mgr, skb,
-							     grp,
-							     ntohs(p->vlan),
-							     p);
+				vlan_gro_receive(&qs->napi, grp,
+						 ntohs(p->vlan), skb);
 			else {
 				if (unlikely(pi->iscsi_ipv4addr &&
 				    is_arp(skb))) {
@@ -1965,7 +1962,7 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
 			dev_kfree_skb_any(skb);
 	} else if (rq->polling) {
 		if (lro)
-			lro_receive_skb(&qs->lro_mgr, skb, p);
+			napi_gro_receive(&qs->napi, skb);
 		else {
 			if (unlikely(pi->iscsi_ipv4addr && is_arp(skb)))
 				cxgb3_arp_process(adap, skb);
@@ -1981,59 +1978,6 @@ static inline int is_eth_tcp(u32 rss)
 }
 
 /**
- *	lro_frame_ok - check if an ingress packet is eligible for LRO
- *	@p: the CPL header of the packet
- *
- *	Returns true if a received packet is eligible for LRO.
- *	The following conditions must be true:
- *	- packet is TCP/IP Ethernet II (checked elsewhere)
- *	- not an IP fragment
- *	- no IP options
- *	- TCP/IP checksums are correct
- *	- the packet is for this host
- */
-static inline int lro_frame_ok(const struct cpl_rx_pkt *p)
-{
-	const struct ethhdr *eh = (struct ethhdr *)(p + 1);
-	const struct iphdr *ih = (struct iphdr *)(eh + 1);
-
-	return (*((u8 *)p + 1) & 0x90) == 0x10 && p->csum == htons(0xffff) &&
-		eh->h_proto == htons(ETH_P_IP) && ih->ihl == (sizeof(*ih) >> 2);
-}
-
-static int t3_get_lro_header(void **eh,  void **iph, void **tcph,
-			     u64 *hdr_flags, void *priv)
-{
-	const struct cpl_rx_pkt *cpl = priv;
-
-	if (!lro_frame_ok(cpl))
-		return -1;
-
-	*eh = (struct ethhdr *)(cpl + 1);
-	*iph = (struct iphdr *)((struct ethhdr *)*eh + 1);
-	*tcph = (struct tcphdr *)((struct iphdr *)*iph + 1);
-
-	*hdr_flags = LRO_IPV4 | LRO_TCP;
-	return 0;
-}
-
-static int t3_get_skb_header(struct sk_buff *skb,
-			      void **iph, void **tcph, u64 *hdr_flags,
-			      void *priv)
-{
-	void *eh;
-
-	return t3_get_lro_header(&eh, iph, tcph, hdr_flags, priv);
-}
-
-static int t3_get_frag_header(struct skb_frag_struct *frag, void **eh,
-			      void **iph, void **tcph, u64 *hdr_flags,
-			      void *priv)
-{
-	return t3_get_lro_header(eh, iph, tcph, hdr_flags, priv);
-}
-
-/**
  *	lro_add_page - add a page chunk to an LRO session
  *	@adap: the adapter
  *	@qs: the associated queue set
@@ -2049,8 +1993,9 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 {
 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
 	struct cpl_rx_pkt *cpl;
-	struct skb_frag_struct *rx_frag = qs->lro_frag_tbl;
-	int nr_frags = qs->lro_nfrags, frag_len = qs->lro_frag_len;
+	struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags;
+	int nr_frags = qs->lro_frag_tbl.nr_frags;
+	int frag_len = qs->lro_frag_tbl.len;
 	int offset = 0;
 
 	if (!nr_frags) {
@@ -2069,13 +2014,13 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 	rx_frag->page_offset = sd->pg_chunk.offset + offset;
 	rx_frag->size = len;
 	frag_len += len;
-	qs->lro_nfrags++;
-	qs->lro_frag_len = frag_len;
+	qs->lro_frag_tbl.nr_frags++;
+	qs->lro_frag_tbl.len = frag_len;
 
 	if (!complete)
 		return;
 
-	qs->lro_nfrags = qs->lro_frag_len = 0;
+	qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY;
 	cpl = qs->lro_va;
 
 	if (unlikely(cpl->vlan_valid)) {
@@ -2084,35 +2029,15 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 		struct vlan_group *grp = pi->vlan_grp;
 
 		if (likely(grp != NULL)) {
-			lro_vlan_hwaccel_receive_frags(&qs->lro_mgr,
-						       qs->lro_frag_tbl,
-						       frag_len, frag_len,
-						       grp, ntohs(cpl->vlan),
-						       cpl, 0);
-			return;
+			vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan),
+				       &qs->lro_frag_tbl);
+			goto out;
 		}
 	}
-	lro_receive_frags(&qs->lro_mgr, qs->lro_frag_tbl,
-			  frag_len, frag_len, cpl, 0);
-}
+	napi_gro_frags(&qs->napi, &qs->lro_frag_tbl);
 
-/**
- *	init_lro_mgr - initialize a LRO manager object
- *	@lro_mgr: the LRO manager object
- */
-static void init_lro_mgr(struct sge_qset *qs, struct net_lro_mgr *lro_mgr)
-{
-	lro_mgr->dev = qs->netdev;
-	lro_mgr->features = LRO_F_NAPI;
-	lro_mgr->ip_summed = CHECKSUM_UNNECESSARY;
-	lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
-	lro_mgr->max_desc = T3_MAX_LRO_SES;
-	lro_mgr->lro_arr = qs->lro_desc;
-	lro_mgr->get_frag_header = t3_get_frag_header;
-	lro_mgr->get_skb_header = t3_get_skb_header;
-	lro_mgr->max_aggr = T3_MAX_LRO_MAX_PKTS;
-	if (lro_mgr->max_aggr > MAX_SKB_FRAGS)
-		lro_mgr->max_aggr = MAX_SKB_FRAGS;
+out:
+	qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0;
 }
 
 /**
@@ -2356,10 +2281,6 @@ next_fl:
 	}
 
 	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
-	lro_flush_all(&qs->lro_mgr);
-	qs->port_stats[SGE_PSTAT_LRO_AGGR] = qs->lro_mgr.stats.aggregated;
-	qs->port_stats[SGE_PSTAT_LRO_FLUSHED] = qs->lro_mgr.stats.flushed;
-	qs->port_stats[SGE_PSTAT_LRO_NO_DESC] = qs->lro_mgr.stats.no_desc;
 
 	if (sleeping)
 		check_ring_db(adap, qs, sleeping);
@@ -2906,7 +2827,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 {
 	int i, avail, ret = -ENOMEM;
 	struct sge_qset *q = &adapter->sge.qs[id];
-	struct net_lro_mgr *lro_mgr = &q->lro_mgr;
 
 	init_qset_cntxt(q, id);
 	setup_timer(&q->tx_reclaim_timer, sge_timer_cb, (unsigned long)q);
@@ -2986,10 +2906,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 	q->fl[0].order = FL0_PG_ORDER;
 	q->fl[1].order = FL1_PG_ORDER;
 
-	q->lro_frag_tbl = kcalloc(MAX_FRAME_SIZE / FL1_PG_CHUNK_SIZE + 1,
-				  sizeof(struct skb_frag_struct),
-				  GFP_KERNEL);
-	q->lro_nfrags = q->lro_frag_len = 0;
 	spin_lock_irq(&adapter->sge.reg_lock);
 
 	/* FL threshold comparison uses < */
@@ -3041,8 +2957,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
 	q->tx_q = netdevq;
 	t3_update_qset_coalesce(q, p);
 
-	init_lro_mgr(q, lro_mgr);
-
 	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
 			  GFP_KERNEL | __GFP_COMP);
 	if (!avail) {


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-15 21:14 cxgb3: Replace LRO with GRO Divy Le Ray
@ 2009-01-15 23:58 ` Herbert Xu
  2009-01-16  8:06   ` Divy Le Ray
  2009-01-20  1:03 ` David Miller
  1 sibling, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-15 23:58 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: davem, netdev, swise

Divy Le Ray <divy@chelsio.com> wrote:
> 
> Your patch looks fine. Early testing provides the same perfs
> seen with LRO.
> 
> I resubmit the original patch with 2 changes:
> - cxgb3_set_flags() removal did not apply.
>  commit 47fd23fe8efeea3af4593a8424419df48724eb25 had updated the routine.
> - replace the NETIF_F_LRO features flag with NETIF_F_GRO

Thanks for the update Divy! It looks good to me.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-15 23:58 ` Herbert Xu
@ 2009-01-16  8:06   ` Divy Le Ray
  2009-01-16  8:56     ` Herbert Xu
  0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-16  8:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev, Steve Wise

Herbert Xu wrote:
> Divy Le Ray <divy@chelsio.com> wrote:
>> Your patch looks fine. Early testing provides the same perfs
>> seen with LRO.
>>
>> I resubmit the original patch with 2 changes:
>> - cxgb3_set_flags() removal did not apply.
>>  commit 47fd23fe8efeea3af4593a8424419df48724eb25 had updated the routine.
>> - replace the NETIF_F_LRO features flag with NETIF_F_GRO
> 
> Thanks for the update Divy! It looks good to me.

Hi Herbert,

The perfs are not actually as good as with LRO.
With a slow server, 1 netperf stream gets on average:
without the patch, LRO off: 3.8 Gb/s
without the patch, LRO on: 6.1 Gb/s
with the patch, GRO on: 4.8 Gb/s.

I'll look into it.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-16  8:06   ` Divy Le Ray
@ 2009-01-16  8:56     ` Herbert Xu
  2009-01-16 11:12       ` Divy Le Ray
  0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-16  8:56 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: davem, netdev, Steve Wise

On Fri, Jan 16, 2009 at 12:06:45AM -0800, Divy Le Ray wrote:
> 
> The perfs are not actually as good as with LRO.

Thanks for testing Divy!

> With a slow server, 1 netperf stream gets on average:

I presume the server is receiving?

> without the patch, lro off: 3.8Gbs
> without the patch, lro on: 6.1Gbs
> with the patch, GRO on: 4.8Gbs.

What about the case of GRO off with the patch? Just checking to
make sure that nothing else has changed.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-16  8:56     ` Herbert Xu
@ 2009-01-16 11:12       ` Divy Le Ray
  2009-01-16 23:58         ` Herbert Xu
  2009-01-17  5:08         ` Herbert Xu
  0 siblings, 2 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-01-16 11:12 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev, Steve Wise


> I presume the server is receiving?
Yes, it is receiving. The adapter is set up with one queue, its irq
pinned to cpu1, irq balance disabled. CPU1 is pegged in all tests.
> 
>> without the patch, lro off: 3.8Gbs
>> without the patch, lro on: 6.1Gbs
>> with the patch, GRO on: 4.8Gbs.
> 
> What about the case of GRO off with the patch? Just checking to
> make sure that nothing else has changed.

I'm getting about 3.4 Gb/s with the patch and GRO off.

I did some profiling with GRO on and LRO on. Here are typical outputs:
GRO on:
CPU 1 :
      26.445100  copy_user_generic_unrolled          vmlinux
      10.050300  skb_copy_datagram_iovec             vmlinux
       5.076900  memcpy                              vmlinux
       4.636400  process_responses                  cxgb3.ko
       4.625100  __pskb_pull_tail                    vmlinux
       4.463300  eth_type_trans                      vmlinux
       4.420600  inet_gro_receive                    vmlinux
       4.310500  refill_fl                          cxgb3.ko
       3.312700  skb_copy_bits                       vmlinux
       3.155300  put_page                            vmlinux
       2.818200  kfree                               vmlinux
       2.537300  tcp_gro_receive                     vmlinux
       2.422700  dev_gro_receive                     vmlinux
       2.229400  kmem_cache_alloc_node               vmlinux
       1.526000  skb_gro_receive                     vmlinux
       1.319200  free_hot_cold_page                  vmlinux
       1.224800  get_page_from_freelist              vmlinux
       1.112500  __alloc_skb                         vmlinux
       0.910200  kmem_cache_free                     vmlinux
       0.872000  napi_fraginfo_skb                   vmlinux

LRO on:
CPU 1 :
      48.511600  copy_user_generic_unrolled          vmlinux
       5.859600  put_page                            vmlinux
       4.405500  process_responses                  cxgb3.ko
       4.006400  refill_fl                          cxgb3.ko
       2.547200  irq_entries_start                   vmlinux
       2.315900  free_hot_cold_page                  vmlinux
       2.224400  skb_copy_datagram_iovec             vmlinux
       1.985400  tcp_recvmsg                         vmlinux
       1.311700  csum_partial                        vmlinux
       1.276200  _raw_spin_lock                      vmlinux
       1.088000  get_page_from_freelist              vmlinux
       1.016900  get_pageblock_flags_group           vmlinux
       0.920300  memcpy_toiovec                      vmlinux
       0.859200  kfree                               vmlinux
       0.688900  dst_release                         vmlinux
       0.630400  t3_sge_intr_msix_napi              cxgb3.ko
       0.559300  tcp_rcv_established                 vmlinux
       0.472800  kmem_cache_alloc_node               vmlinux
       0.404200  __inet_lookup_established           vmlinux
       0.399100  memset                              vmlinux

I have not looked at the code yet. It's getting late :)

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-16 11:12       ` Divy Le Ray
@ 2009-01-16 23:58         ` Herbert Xu
  2009-01-17  5:08         ` Herbert Xu
  1 sibling, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-16 23:58 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: davem, netdev, Steve Wise

On Fri, Jan 16, 2009 at 03:12:07AM -0800, Divy Le Ray wrote:
>
> >> without the patch, lro off: 3.8Gbs
> >> without the patch, lro on: 6.1Gbs
> >> with the patch, GRO on: 4.8Gbs.
> > 
> > What about the case of GRO off with the patch? Just checking to
> > make sure that nothing else has changed.
> 
> I'm getting about 3.4Gbs with the patch and GRO off.

I wonder where the 3.8 => 3.4 slow-down is coming from.

> I did some profiling with GRO on and LRO on. Here are typical outputs:
> GRO on:
> CPU 1 :
>       26.445100  copy_user_generic_unrolled          vmlinux
>       10.050300  skb_copy_datagram_iovec             vmlinux
>        5.076900  memcpy                              vmlinux
>        4.636400  process_responses                  cxgb3.ko
>        4.625100  __pskb_pull_tail                    vmlinux
>        4.463300  eth_type_trans                      vmlinux
>        4.420600  inet_gro_receive                    vmlinux
>        4.310500  refill_fl                          cxgb3.ko
>        3.312700  skb_copy_bits                       vmlinux

Thanks, this is very helpful.  The first obvious deficiency
is that napi_fraginfo_skb is causing multiple pskb_may_pull
calls on every fragment which is bad.

I'll get a patch out for you to test.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-16 11:12       ` Divy Le Ray
  2009-01-16 23:58         ` Herbert Xu
@ 2009-01-17  5:08         ` Herbert Xu
  2009-01-17 11:11           ` Divy Le Ray
  1 sibling, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-17  5:08 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: davem, netdev, Steve Wise

Hi Divy:

On Fri, Jan 16, 2009 at 03:12:07AM -0800, Divy Le Ray wrote:
>
>        1.112500  __alloc_skb                         vmlinux

Can you confirm that you were testing 1500 byte packets?  If so
that would tell us why this function got up there.  As it stands
GRO will not stop aggregating even when the frags array is full,
until the skb size hits 64K.

That means half the packet will be in frag_list which is very
inefficient.

If this is indeed the problem, then for now I'll simply make
it stop aggregating after the array is full, just like LRO.

While in future we can extend it such that even when it starts
using frag_list, it'll still continue to fill in the frags array.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-17  5:08         ` Herbert Xu
@ 2009-01-17 11:11           ` Divy Le Ray
  2009-01-17 13:08             ` Herbert Xu
  0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-17 11:11 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev, Steve Wise

Herbert Xu wrote:
> Hi Divy:
> 
> On Fri, Jan 16, 2009 at 03:12:07AM -0800, Divy Le Ray wrote:
>>        1.112500  __alloc_skb                         vmlinux
> 
> Can you confirm that you were testing 1500 byte packets?

Yes.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-17 11:11           ` Divy Le Ray
@ 2009-01-17 13:08             ` Herbert Xu
  2009-01-18 20:33               ` Divy Le Ray
  0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-17 13:08 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: davem, netdev, Steve Wise

On Sat, Jan 17, 2009 at 03:11:26AM -0800, Divy Le Ray wrote:
>
> Yes.

Great.  Please give this patch a whirl.

commit a9156545da4f6ed729b1871a92e3ead04f37a034
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sat Jan 17 21:49:42 2009 +1100

    gro: Do not merge paged packets into frag_list
    
    Bigger is not always better :)
    
    It was easy to continue merging packets into frag_list after the
    page array is full.  However, this turns out to be worse than LRO
    because frag_list is a much less efficient form of storage than the
    page array.  So we're better off stopping the merge and starting
    a new entry with an empty page array.
    
    In future we can optimise this further by doing frag_list merging
    but making sure that we continue to fill in the page array.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

commit 2e54c6210c41f7fb2c503a4b51a8288e05a1c9d5
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sat Jan 17 21:45:11 2009 +1100

    gro: Avoid copying headers of unmerged packets
    
    Unfortunately simplicity isn't always the best.  The fraginfo
    interface turned out to be suboptimal.  The problem was quite
    obvious.  For every packet, we have to copy the headers from
    the frags structure into skb->head, even though for 99% of the
    packets this part is immediately thrown away after the merge.
    
    LRO didn't have this problem because it directly read the headers
    from the frags structure.
    
    This patch attempts to address this by creating an interface
    that allows GRO to access the headers in the first frag without
    having to copy it.  Because all drivers that use frags place the
    headers in the first frag this optimisation should be enough.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

commit db14bd4bff3d85d779a83fbdc554f4147ee9b841
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sun Jan 18 00:00:13 2009 +1100

    gro: Fix merging of paged packets
    
    The previous fix to paged packets broke the merging because it
    reset the skb->len before we added it to the merged packet.  This
    wasn't detected because it simply resulted in the truncation of
    the packet while the missing bit is subsequently retransmitted.
    
    The fix is to store skb->len before we clobber it.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

commit 6ba4147cb5854d334965d89acb821e6f624534eb
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sat Jan 17 21:05:58 2009 +1100

    gro: Move common completion code into helpers
    
    Currently VLAN still has a bit of common code handling the aftermath
    of GRO that's shared with the common path.  This patch moves them
    into shared helpers to reduce code duplication.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

commit 86c3c2fc1f99394990b3ef1c2881b6036e4f5141
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sat Jan 17 15:06:40 2009 +1100

    gro: Fix error handling on extremely short frags
    
    When a frag is shorter than an Ethernet header, we'd return a
    zeroed packet instead of aborting.  This patch fixes that.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

commit 099f08d2279eff7f74226bc8f691b358b1eac7b7
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sat Jan 17 20:12:07 2009 +1100

    gro: Fix handling of complete checksums in IPv6
    
    We need to perform skb_postpull_rcsum after pulling the IPv6
    header in order to maintain the correctness of the complete
    checksum.
    
    This patch also adds a missing iph reload after pulling.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f245568..5472328 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -983,6 +983,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 void netif_napi_del(struct napi_struct *napi);
 
 struct napi_gro_cb {
+	/* This indicates where we are processing relative to skb->data. */
+	int data_offset;
+
 	/* This is non-zero if the packet may be of the same flow. */
 	int same_flow;
 
@@ -1084,6 +1087,29 @@ extern int		dev_restart(struct net_device *dev);
 #ifdef CONFIG_NETPOLL_TRAP
 extern int		netpoll_trap(void);
 #endif
+extern void	      *skb_gro_header(struct sk_buff *skb, unsigned int hlen);
+extern int	       skb_gro_receive(struct sk_buff **head,
+				       struct sk_buff *skb);
+
+static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
+{
+	return NAPI_GRO_CB(skb)->data_offset;
+}
+
+static inline unsigned int skb_gro_len(const struct sk_buff *skb)
+{
+	return skb->len - NAPI_GRO_CB(skb)->data_offset;
+}
+
+static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
+{
+	NAPI_GRO_CB(skb)->data_offset += len;
+}
+
+static inline void skb_gro_reset_offset(struct sk_buff *skb)
+{
+	NAPI_GRO_CB(skb)->data_offset = 0;
+}
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
@@ -1372,12 +1398,15 @@ extern int		netif_receive_skb(struct sk_buff *skb);
 extern void		napi_gro_flush(struct napi_struct *napi);
 extern int		dev_gro_receive(struct napi_struct *napi,
 					struct sk_buff *skb);
+extern int		napi_skb_finish(int ret, struct sk_buff *skb);
 extern int		napi_gro_receive(struct napi_struct *napi,
 					 struct sk_buff *skb);
 extern void		napi_reuse_skb(struct napi_struct *napi,
 				       struct sk_buff *skb);
 extern struct sk_buff *	napi_fraginfo_skb(struct napi_struct *napi,
 					  struct napi_gro_fraginfo *info);
+extern int		napi_frags_finish(struct napi_struct *napi,
+					  struct sk_buff *skb, int ret);
 extern int		napi_gro_frags(struct napi_struct *napi,
 				       struct napi_gro_fraginfo *info);
 extern void		netif_nit_deliver(struct sk_buff *skb);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf2cb50..acf17af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1687,8 +1687,6 @@ extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
 				 int shiftlen);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
-extern int	       skb_gro_receive(struct sk_buff **head,
-				       struct sk_buff *skb);
 
 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 				       int len, void *buffer)
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 6c13239..fb0d16a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -98,22 +98,9 @@ drop:
 int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
 		     unsigned int vlan_tci, struct sk_buff *skb)
 {
-	int err = NET_RX_SUCCESS;
+	skb_gro_reset_offset(skb);
 
-	switch (vlan_gro_common(napi, grp, vlan_tci, skb)) {
-	case -1:
-		return netif_receive_skb(skb);
-
-	case 2:
-		err = NET_RX_DROP;
-		/* fall through */
-
-	case 1:
-		kfree_skb(skb);
-		break;
-	}
-
-	return err;
+	return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
 }
 EXPORT_SYMBOL(vlan_gro_receive);
 
@@ -121,27 +108,11 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
 		   unsigned int vlan_tci, struct napi_gro_fraginfo *info)
 {
 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
-	int err = NET_RX_DROP;
 
 	if (!skb)
-		goto out;
-
-	err = NET_RX_SUCCESS;
-
-	switch (vlan_gro_common(napi, grp, vlan_tci, skb)) {
-	case -1:
-		return netif_receive_skb(skb);
-
-	case 2:
-		err = NET_RX_DROP;
-		/* fall through */
-
-	case 1:
-		napi_reuse_skb(napi, skb);
-		break;
-	}
+		return NET_RX_DROP;
 
-out:
-	return err;
+	return napi_frags_finish(napi, skb,
+				 vlan_gro_common(napi, grp, vlan_tci, skb));
 }
 EXPORT_SYMBOL(vlan_gro_frags);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4f69a2d..3742397 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,14 @@
 /* This should be increased if a protocol with a bigger head is added. */
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 
+enum {
+	GRO_MERGED,
+	GRO_MERGED_FREE,
+	GRO_HELD,
+	GRO_NORMAL,
+	GRO_DROP,
+};
+
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -207,6 +215,13 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 }
 
+static inline void *skb_gro_mac_header(struct sk_buff *skb)
+{
+	return skb_headlen(skb) ? skb_mac_header(skb) :
+	       page_address(skb_shinfo(skb)->frags[0].page) +
+	       skb_shinfo(skb)->frags[0].page_offset;
+}
+
 /* Device list insertion */
 static int list_netdevice(struct net_device *dev)
 {
@@ -2350,7 +2365,6 @@ static int napi_gro_complete(struct sk_buff *skb)
 
 out:
 	skb_shinfo(skb)->gso_size = 0;
-	__skb_push(skb, -skb_network_offset(skb));
 	return netif_receive_skb(skb);
 }
 
@@ -2368,6 +2382,25 @@ void napi_gro_flush(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
+void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
+{
+	unsigned int offset = skb_gro_offset(skb);
+
+	hlen += offset;
+	if (hlen <= skb_headlen(skb))
+		return skb->data + offset;
+
+	if (unlikely(!skb_shinfo(skb)->nr_frags ||
+		     skb_shinfo(skb)->frags[0].size <=
+		     hlen - skb_headlen(skb) ||
+		     PageHighMem(skb_shinfo(skb)->frags[0].page)))
+		return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
+
+	return page_address(skb_shinfo(skb)->frags[0].page) +
+	       skb_shinfo(skb)->frags[0].page_offset + offset;
+}
+EXPORT_SYMBOL(skb_gro_header);
+
 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
@@ -2377,7 +2410,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	int count = 0;
 	int same_flow;
 	int mac_len;
-	int free;
+	int ret;
 
 	if (!(skb->dev->features & NETIF_F_GRO))
 		goto normal;
@@ -2388,11 +2421,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
 		struct sk_buff *p;
+		void *mac;
 
 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
 			continue;
 
-		skb_reset_network_header(skb);
+		skb_set_network_header(skb, skb_gro_offset(skb));
+		mac = skb_gro_mac_header(skb);
 		mac_len = skb->network_header - skb->mac_header;
 		skb->mac_len = mac_len;
 		NAPI_GRO_CB(skb)->same_flow = 0;
@@ -2406,8 +2441,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 				continue;
 
 			if (p->mac_len != mac_len ||
-			    memcmp(skb_mac_header(p), skb_mac_header(skb),
-				   mac_len))
+			    memcmp(skb_mac_header(p), mac, mac_len))
 				NAPI_GRO_CB(p)->same_flow = 0;
 		}
 
@@ -2420,7 +2454,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 		goto normal;
 
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
-	free = NAPI_GRO_CB(skb)->free;
+	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
 	if (pp) {
 		struct sk_buff *nskb = *pp;
@@ -2434,21 +2468,20 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (same_flow)
 		goto ok;
 
-	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
-		__skb_push(skb, -skb_network_offset(skb));
+	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS)
 		goto normal;
-	}
 
 	NAPI_GRO_CB(skb)->count = 1;
-	skb_shinfo(skb)->gso_size = skb->len;
+	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	skb->next = napi->gro_list;
 	napi->gro_list = skb;
+	ret = GRO_HELD;
 
 ok:
-	return free;
+	return ret;
 
 normal:
-	return -1;
+	return GRO_NORMAL;
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
@@ -2464,18 +2497,32 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+int napi_skb_finish(int ret, struct sk_buff *skb)
 {
-	switch (__napi_gro_receive(napi, skb)) {
-	case -1:
+	int err = NET_RX_SUCCESS;
+
+	switch (ret) {
+	case GRO_NORMAL:
 		return netif_receive_skb(skb);
 
-	case 1:
+	case GRO_DROP:
+		err = NET_RX_DROP;
+		/* fall through */
+
+	case GRO_MERGED_FREE:
 		kfree_skb(skb);
 		break;
 	}
 
-	return NET_RX_SUCCESS;
+	return err;
+}
+EXPORT_SYMBOL(napi_skb_finish);
+
+int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+	skb_gro_reset_offset(skb);
+
+	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
@@ -2493,6 +2540,7 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 {
 	struct net_device *dev = napi->dev;
 	struct sk_buff *skb = napi->skb;
+	struct ethhdr *eth;
 
 	napi->skb = NULL;
 
@@ -2512,12 +2560,23 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 	skb->len += info->len;
 	skb->truesize += info->len;
 
-	if (!pskb_may_pull(skb, ETH_HLEN)) {
+	skb_reset_mac_header(skb);
+	skb_gro_reset_offset(skb);
+
+	eth = skb_gro_header(skb, sizeof(*eth));
+	if (!eth) {
 		napi_reuse_skb(napi, skb);
+		skb = NULL;
 		goto out;
 	}
 
-	skb->protocol = eth_type_trans(skb, dev);
+	skb_gro_pull(skb, sizeof(*eth));
+
+	/*
+	 * This works because the only protocols we care about don't require
+	 * special handling.  We'll fix it up properly at the end.
+	 */
+	skb->protocol = eth->h_proto;
 
 	skb->ip_summed = info->ip_summed;
 	skb->csum = info->csum;
@@ -2527,29 +2586,47 @@ out:
 }
 EXPORT_SYMBOL(napi_fraginfo_skb);
 
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
 {
-	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
-	int err = NET_RX_DROP;
+	int err = NET_RX_SUCCESS;
+	int may;
 
-	if (!skb)
-		goto out;
+	switch (ret) {
+	case GRO_NORMAL:
+	case GRO_HELD:
+		may = pskb_may_pull(skb, skb_gro_offset(skb));
+		BUG_ON(!may);
 
-	err = NET_RX_SUCCESS;
+		skb->protocol = eth_type_trans(skb, napi->dev);
 
-	switch (__napi_gro_receive(napi, skb)) {
-	case -1:
-		return netif_receive_skb(skb);
+		if (ret == GRO_NORMAL)
+			return netif_receive_skb(skb);
 
-	case 0:
-		goto out;
-	}
+		skb_gro_pull(skb, -ETH_HLEN);
+		break;
 
-	napi_reuse_skb(napi, skb);
+	case GRO_DROP:
+		err = NET_RX_DROP;
+		/* fall through */
+
+	case GRO_MERGED_FREE:
+		napi_reuse_skb(napi, skb);
+		break;
+	}
 
-out:
 	return err;
 }
+EXPORT_SYMBOL(napi_frags_finish);
+
+int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+{
+	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+
+	if (!skb)
+		return NET_RX_DROP;
+
+	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
+}
 EXPORT_SYMBOL(napi_gro_frags);
 
 static int process_backlog(struct napi_struct *napi, int quota)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 65eac77..a11d9a4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2587,16 +2587,23 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	struct sk_buff *p = *head;
 	struct sk_buff *nskb;
 	unsigned int headroom;
-	unsigned int hlen = p->data - skb_mac_header(p);
+	unsigned int len = skb_gro_len(skb);
 
-	if (hlen + p->len + skb->len >= 65536)
+	if (p->len + len >= 65536)
 		return -E2BIG;
 
 	if (skb_shinfo(p)->frag_list)
 		goto merge;
-	else if (!skb_headlen(p) && !skb_headlen(skb) &&
-		 skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <
-		 MAX_SKB_FRAGS) {
+	else if (skb_headlen(skb) <= skb_gro_offset(skb)) {
+		if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags >
+		    MAX_SKB_FRAGS)
+			return -E2BIG;
+
+		skb_shinfo(skb)->frags[0].page_offset +=
+			skb_gro_offset(skb) - skb_headlen(skb);
+		skb_shinfo(skb)->frags[0].size -=
+			skb_gro_offset(skb) - skb_headlen(skb);
+
 		memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
 		       skb_shinfo(skb)->frags,
 		       skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
@@ -2621,12 +2628,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	nskb->mac_len = p->mac_len;
 
 	skb_reserve(nskb, headroom);
+	__skb_put(nskb, skb_headlen(p));
 
-	skb_set_mac_header(nskb, -hlen);
+	skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
 	skb_set_network_header(nskb, skb_network_offset(p));
 	skb_set_transport_header(nskb, skb_transport_offset(p));
 
-	memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen);
+	__skb_pull(p, skb_gro_offset(p));
+	memcpy(skb_mac_header(nskb), skb_mac_header(p),
+	       p->data - skb_mac_header(p));
 
 	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
 	skb_shinfo(nskb)->frag_list = p;
@@ -2651,9 +2661,9 @@ merge:
 
 done:
 	NAPI_GRO_CB(p)->count++;
-	p->data_len += skb->len;
-	p->truesize += skb->len;
-	p->len += skb->len;
+	p->data_len += len;
+	p->truesize += len;
+	p->len += len;
 
 	NAPI_GRO_CB(skb)->same_flow = 1;
 	return 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f554..d6770f2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1253,10 +1253,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	int proto;
 	int id;
 
-	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+	iph = skb_gro_header(skb, sizeof(*iph));
+	if (unlikely(!iph))
 		goto out;
 
-	iph = ip_hdr(skb);
 	proto = iph->protocol & (MAX_INET_PROTOS - 1);
 
 	rcu_read_lock();
@@ -1270,7 +1270,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 		goto out_unlock;
 
-	flush = ntohs(iph->tot_len) != skb->len ||
+	flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
 		iph->frag_off != htons(IP_DF);
 	id = ntohs(iph->id);
 
@@ -1298,8 +1298,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	}
 
 	NAPI_GRO_CB(skb)->flush |= flush;
-	__skb_pull(skb, sizeof(*iph));
-	skb_reset_transport_header(skb);
+	skb_gro_pull(skb, sizeof(*iph));
+	skb_set_transport_header(skb, skb_gro_offset(skb));
 
 	pp = ops->gro_receive(head, skb);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ce572f9..0191d39 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2473,19 +2473,19 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	unsigned int mss = 1;
 	int flush = 1;
 
-	if (!pskb_may_pull(skb, sizeof(*th)))
+	th = skb_gro_header(skb, sizeof(*th));
+	if (unlikely(!th))
 		goto out;
 
-	th = tcp_hdr(skb);
 	thlen = th->doff * 4;
 	if (thlen < sizeof(*th))
 		goto out;
 
-	if (!pskb_may_pull(skb, thlen))
+	th = skb_gro_header(skb, thlen);
+	if (unlikely(!th))
 		goto out;
 
-	th = tcp_hdr(skb);
-	__skb_pull(skb, thlen);
+	skb_gro_pull(skb, thlen);
 
 	flags = tcp_flag_word(th);
 
@@ -2513,10 +2513,10 @@ found:
 	flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
 	flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
 
-	total = p->len;
+	total = skb_gro_len(p);
 	mss = skb_shinfo(p)->gso_size;
 
-	flush |= skb->len > mss || skb->len <= 0;
+	flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb);
 	flush |= ntohl(th2->seq) + total != ntohl(th->seq);
 
 	if (flush || skb_gro_receive(head, skb)) {
@@ -2529,7 +2529,7 @@ found:
 	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
 
 out_check_final:
-	flush = skb->len < mss;
+	flush = skb_gro_len(skb) < mss;
 	flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
 			  TCP_FLAG_SYN | TCP_FLAG_FIN);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 19d7b42..f6b962f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2355,7 +2355,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
-		if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr,
+		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
 				  skb->csum)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			break;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 94f74f5..bd91ead 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -797,24 +797,36 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 	unsigned int nlen;
 	int flush = 1;
 	int proto;
+	__wsum csum;
 
-	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+	iph = skb_gro_header(skb, sizeof(*iph));
+	if (unlikely(!iph))
 		goto out;
 
-	iph = ipv6_hdr(skb);
-	__skb_pull(skb, sizeof(*iph));
+	skb_gro_pull(skb, sizeof(*iph));
+	skb_set_transport_header(skb, skb_gro_offset(skb));
 
-	flush += ntohs(iph->payload_len) != skb->len;
+	flush += ntohs(iph->payload_len) != skb_gro_len(skb);
 
 	rcu_read_lock();
-	proto = ipv6_gso_pull_exthdrs(skb, iph->nexthdr);
-	IPV6_GRO_CB(skb)->proto = proto;
+	proto = iph->nexthdr;
 	ops = rcu_dereference(inet6_protos[proto]);
-	if (!ops || !ops->gro_receive)
-		goto out_unlock;
+	if (!ops || !ops->gro_receive) {
+		__pskb_pull(skb, skb_gro_offset(skb));
+		proto = ipv6_gso_pull_exthdrs(skb, proto);
+		skb_gro_pull(skb, -skb_transport_offset(skb));
+		skb_reset_transport_header(skb);
+		__skb_push(skb, skb_gro_offset(skb));
+
+		if (!ops || !ops->gro_receive)
+			goto out_unlock;
+
+		iph = ipv6_hdr(skb);
+	}
+
+	IPV6_GRO_CB(skb)->proto = proto;
 
 	flush--;
-	skb_reset_transport_header(skb);
 	nlen = skb_network_header_len(skb);
 
 	for (p = *head; p; p = p->next) {
@@ -839,8 +851,13 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 
 	NAPI_GRO_CB(skb)->flush |= flush;
 
+	csum = skb->csum;
+	skb_postpull_rcsum(skb, iph, skb_network_header_len(skb));
+
 	pp = ops->gro_receive(head, skb);
 
+	skb->csum = csum;
+
 out_unlock:
 	rcu_read_unlock();
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e5b85d4..00f1269 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -948,7 +948,7 @@ struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
-		if (!tcp_v6_check(skb->len, &iph->saddr, &iph->daddr,
+		if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr,
 				  skb->csum)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			break;

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-17 13:08             ` Herbert Xu
@ 2009-01-18 20:33               ` Divy Le Ray
  2009-01-18 22:50                 ` Herbert Xu
  0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-18 20:33 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev, Steve Wise

Herbert Xu wrote:
> On Sat, Jan 17, 2009 at 03:11:26AM -0800, Divy Le Ray wrote:
>> Yes.
> 
> Great.  Please give this patch a whirl.
> 

Hi Herbert,

Thanks for the patch. It does improve the performance.
I'm now getting about 5.3Gbs.

Here is a oprofile output:
      33.929900  copy_user_generic_unrolled          vmlinux
       7.150300  process_responses                  cxgb3.ko
       6.261100  refill_fl                          cxgb3.ko
       6.181700  memcpy                              vmlinux
       6.168100  inet_gro_receive                    vmlinux
       4.593700  put_page                            vmlinux
       4.085600  dev_gro_receive                     vmlinux
       3.833800  tcp_gro_receive                     vmlinux
       1.817100  free_hot_cold_page                  vmlinux
       1.715000  skb_copy_datagram_iovec             vmlinux
       1.705900  skb_gro_header                      vmlinux
       1.676400  get_page_from_freelist              vmlinux
       1.256700  tcp_recvmsg                         vmlinux
       0.893800  get_pageblock_flags_group           vmlinux
       0.850700  irq_entries_start                   vmlinux
       0.803000  _raw_spin_lock                      vmlinux
       0.782600  skb_gro_receive                     vmlinux
       0.739500  memcpy_toiovec                      vmlinux
       0.725900  napi_fraginfo_skb                   vmlinux
       0.601200  kfree                               vmlinux

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-18 20:33               ` Divy Le Ray
@ 2009-01-18 22:50                 ` Herbert Xu
  0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-18 22:50 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: davem, netdev, Steve Wise

On Sun, Jan 18, 2009 at 12:33:42PM -0800, Divy Le Ray wrote:
> 
> Thanks for the patch. It does improve the performance.
> I'm now getting about 5.3Gbs.

Thanks for testing Divy!

> Here is a oprofile output:
>       33.929900  copy_user_generic_unrolled          vmlinux
>        7.150300  process_responses                  cxgb3.ko
>        6.261100  refill_fl                          cxgb3.ko
>        6.181700  memcpy                              vmlinux

I wonder why this is still up there.

Maybe this patch will help?

diff --git a/net/core/dev.c b/net/core/dev.c
index 3742397..3cc19fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2554,7 +2554,8 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 
 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
 	skb_shinfo(skb)->nr_frags = info->nr_frags;
-	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
+	memcpy(skb_shinfo(skb)->frags, info->frags,
+	       info->nr_frags * sizeof(*info->frags));
 
 	skb->data_len = info->len;
 	skb->len += info->len;

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-15 21:14 cxgb3: Replace LRO with GRO Divy Le Ray
  2009-01-15 23:58 ` Herbert Xu
@ 2009-01-20  1:03 ` David Miller
  2009-01-20  2:03   ` David Miller
  1 sibling, 1 reply; 23+ messages in thread
From: David Miller @ 2009-01-20  1:03 UTC (permalink / raw)
  To: divy; +Cc: herbert, netdev, swise

From: Divy Le Ray <divy@chelsio.com>
Date: Thu, 15 Jan 2009 13:14:03 -0800

> cxgb3: Replace LRO with GRO
> 
> This patch makes cxgb3 invoke the GRO hooks instead of LRO.  As
> GRO has a compatible external interface to LRO this is a very
> straightforward replacement.
> 
> I've kept the ioctl controls for per-queue LRO switches.  However,
> we should not encourage anyone to use these.
> 
> Because of that, I've also kept the skb construction code in
> cxgb3.  Hopefully we can phase out those per-queue switches
> and then kill this too.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> Acked-by: Divy Le Ray <divy@chelsio.com>

I've added this to my net-next-2.6 tree.

I'm confident that you will work out the performance issues
with Herbert, else we'll simply revert this.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-20  1:03 ` David Miller
@ 2009-01-20  2:03   ` David Miller
  2009-01-20  5:24     ` Herbert Xu
  2009-01-20 10:04     ` Divy Le Ray
  0 siblings, 2 replies; 23+ messages in thread
From: David Miller @ 2009-01-20  2:03 UTC (permalink / raw)
  To: divy; +Cc: herbert, netdev, swise

From: David Miller <davem@davemloft.net>
Date: Mon, 19 Jan 2009 17:03:08 -0800 (PST)

> From: Divy Le Ray <divy@chelsio.com>
> Date: Thu, 15 Jan 2009 13:14:03 -0800
> 
> > cxgb3: Replace LRO with GRO
> > 
> > This patch makes cxgb3 invoke the GRO hooks instead of LRO.  As
> > GRO has a compatible external interface to LRO this is a very
> > straightforward replacement.
> > 
> > I've kept the ioctl controls for per-queue LRO switches.  However,
> > we should not encourage anyone to use these.
> > 
> > Because of that, I've also kept the skb construction code in
> > cxgb3.  Hopefully we can phase out those per-queue switches
> > and then kill this too.
> > 
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> > Acked-by: Divy Le Ray <divy@chelsio.com>
> 
> I've added this to my net-next-2.6 tree.
> 
> I'm confident that you will work out the performance issues
> with Herbert, else we'll simply revert this.

I had to add a fix due to some build bustage resulting from
this change.

drivers/scsi/cxgb3i/cxgb3i_ddp.h was depending upon the implicit
inclusion of linux/vmalloc.h coming from the inet_lro.h include
which is removed by this change.

In file included from drivers/scsi/cxgb3i/cxgb3i_ddp.c:24:
drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_alloc_big_mem':
drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: error: implicit declaration of function 'vmalloc'
drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: warning: assignment makes pointer from integer without a cast
drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_free_big_mem':
drivers/scsi/cxgb3i/cxgb3i_ddp.h:157: error: implicit declaration of function 'vfree'

I'll fix this up.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-20  2:03   ` David Miller
@ 2009-01-20  5:24     ` Herbert Xu
  2009-01-20 10:04     ` Divy Le Ray
  1 sibling, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-20  5:24 UTC (permalink / raw)
  To: David Miller; +Cc: divy, netdev, swise

On Mon, Jan 19, 2009 at 06:03:30PM -0800, David Miller wrote:
>
> drivers/scsi/cxgb3i/cxgb3i_ddp.h was depending upon the implicit
> inclusion of linux/vmalloc.h coming from the inet_lro.h include
> which is removed by this change.
> 
> In file included from drivers/scsi/cxgb3i/cxgb3i_ddp.c:24:
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_alloc_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: error: implicit declaration of function 'vmalloc'
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: warning: assignment makes pointer from integer without a cast
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_free_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:157: error: implicit declaration of function 'vfree'

Thanks Dave!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-20  2:03   ` David Miller
  2009-01-20  5:24     ` Herbert Xu
@ 2009-01-20 10:04     ` Divy Le Ray
  1 sibling, 0 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-01-20 10:04 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, Steve Wise

David Miller wrote:

> I had to add a fix due to some build bustage resulting from
> this change.
> 
> drivers/scsi/cxgb3i/cxgb3i_ddp.h was depending upon the implicit
> inclusion of linux/vmalloc.h coming from the inet_lro.h include
> which is removed by this change.
> 
> In file included from drivers/scsi/cxgb3i/cxgb3i_ddp.c:24:
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_alloc_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: error: implicit declaration of function 'vmalloc'
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: warning: assignment makes pointer from integer without a cast
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_free_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:157: error: implicit declaration of function 'vfree'
> 
> I'll fix this up.

Oops. Sorry about that. Thanks for fixing it.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
@ 2009-01-20 10:14 Divy Le Ray
  2009-01-21  8:29 ` Herbert Xu
  0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-20 10:14 UTC (permalink / raw)
  To: herbert; +Cc: netdev


Hi Herbert,

I have tried the following patch as an attempt to eliminate the memcpy
seen on the previous oprofile. I'm now getting about 5.5 Gbs.
After that, I went through the output of opreport -d to figure out
the most expensive ops witnessed in my profiling.

Here is the patch:

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2554,6 +2554,8 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 	struct net_device *dev = napi->dev;
 	struct sk_buff *skb = napi->skb;
 	struct ethhdr *eth;
+	skb_frag_t *frag;
+	int i;

 	napi->skb = NULL;

@@ -2566,9 +2568,15 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 	}

 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
-	skb_shinfo(skb)->nr_frags = info->nr_frags;
-	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
+	frag = &info->frags[info->nr_frags - 1];

+	for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
+		skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
+				   frag->size);
+		frag++;
+	}
+	skb_shinfo(skb)->nr_frags = info->nr_frags;
+
 	skb->data_len = info->len;
 	skb->len += info->len;
 	skb->truesize += info->len;

Here is the non detailed opreport output for the CPU managing the reception
of netperf traffic:

      38.815300  copy_user_generic_unrolled          vmlinux
       6.373900  process_responses                  cxgb3.ko
       4.957800  inet_gro_receive                    vmlinux
       4.908800  put_page                            vmlinux
       4.862100  refill_fl                          cxgb3.ko
       3.774900  dev_gro_receive                     vmlinux
       3.096000  tcp_gro_receive                     vmlinux
       2.764700  napi_fraginfo_skb                   vmlinux
       2.174400  free_hot_cold_page                  vmlinux
       2.006400  skb_copy_datagram_iovec             vmlinux
       1.511800  tcp_recvmsg                         vmlinux
       1.488500  get_page_from_freelist              vmlinux
       1.455800  irq_entries_start                   vmlinux
       1.453500  skb_gro_header                      vmlinux
       0.877200  get_pageblock_flags_group           vmlinux
       0.863200  memcpy_toiovec                      vmlinux
       0.856200  _raw_spin_lock                      vmlinux
       0.720900  memcpy                              vmlinux
       0.711600  skb_gro_receive                     vmlinux
       0.683600  kfree                               vmlinux

Here is a list of more detailed info sorted per GRO function as seen above:
- Relative % for the most expensive instructions
- gdb disassemble output for these instructions
- gdb list output.

inet_gro_receive 4.9578 ffffffff805468c0
  ffffffff80546a49 11.1059%
    0xffffffff80546a49 <inet_gro_receive+393>:      jne    0xffffffff805469e5 <inet_gro_receive+293>
    0xffffffff80546a49 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1285).
      1280                    if (!NAPI_GRO_CB(p)->same_flow)
      1281                            continue;
      1282
      1283                    iph2 = ip_hdr(p);
      1284
      1285                    if (iph->protocol != iph2->protocol ||
      1286                        iph->tos != iph2->tos ||
      1287                        memcmp(&iph->saddr, &iph2->saddr, 8)) {
      1288                            NAPI_GRO_CB(p)->same_flow = 0;
      1289                            continue;

  ffffffff80546a61 10.4000%
    0xffffffff80546a61 <inet_gro_receive+417>:      je     0xffffffff80546abb <inet_gro_receive+507>
    0xffffffff80546a61 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1293).
      1288                            NAPI_GRO_CB(p)->same_flow = 0;
      1289                            continue;
      1290                    }
      1291
      1292                    /* All fields must match except length and checksum. */
      1293                    NAPI_GRO_CB(p)->flush |=
      1294                            memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
      1295                            (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;

  ffffffff80546a58 8.2353%
    0xffffffff80546a58 <inet_gro_receive+408>:      mov    %rdx,%rcx
    0xffffffff80546a58 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1293).
      1288                            NAPI_GRO_CB(p)->same_flow = 0;
      1289                            continue;
      1290                    }
      1291
      1292                    /* All fields must match except length and checksum. */
      1293                    NAPI_GRO_CB(p)->flush |=
      1294                            memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
      1295                            (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
      1296
      1297                    NAPI_GRO_CB(p)->flush |= flush;

  ffffffff80546abb 8.2353%
    0xffffffff80546abb <inet_gro_receive+507>:      movzwl 0x4(%r10),%eax
      (gdb) list *(0xffffffff80546abb)
      0xffffffff80546abb is in inet_gro_receive (/mnt/net-2.6/include/linux/swab.h:51).
      46      static inline __attribute_const__ __u16 __fswab16(__u16 val)
      47      {
      48      #ifdef __arch_swab16
      49              return __arch_swab16(val);
      50      #else
      51              return ___constant_swab16(val);
      52      #endif
      53      }
      54
      55      static inline __attribute_const__ __u32 __fswab32(__u32 val)

  ffffffff80546a4b 8.1882%
    0xffffffff80546a4b is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1293).
      1288                            NAPI_GRO_CB(p)->same_flow = 0;
      1289                            continue;
      1290                    }
      1291
      1292                    /* All fields must match except length and checksum. */
      1293                    NAPI_GRO_CB(p)->flush |=
      1294                            memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
      1295                            (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
      1296
      1297                    NAPI_GRO_CB(p)->flush |= flush;

  ffffffff80546a47 7.5765%
    0xffffffff80546a47 <inet_gro_receive+391>:      repz cmpsb %es:(%rdi),%ds:(%rsi)
    0xffffffff80546a47 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1285).
      1280                    if (!NAPI_GRO_CB(p)->same_flow)
      1281                            continue;
      1282
      1283                    iph2 = ip_hdr(p);
      1284
      1285                    if (iph->protocol != iph2->protocol ||
      1286                        iph->tos != iph2->tos ||
      1287                        memcmp(&iph->saddr, &iph2->saddr, 8)) {
      1288                            NAPI_GRO_CB(p)->same_flow = 0;
      1289                            continue;

  ffffffff80546a44 7.1529%
    0xffffffff80546a44 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1285).
      1280                    if (!NAPI_GRO_CB(p)->same_flow)
      1281                            continue;
      1282
      1283                    iph2 = ip_hdr(p);
      1284
      1285                    if (iph->protocol != iph2->protocol ||
      1286                        iph->tos != iph2->tos ||
      1287                        memcmp(&iph->saddr, &iph2->saddr, 8)) {
      1288                            NAPI_GRO_CB(p)->same_flow = 0;
      1289                            continue;


dev_gro_receive 3.7749 ffffffff805024b0
  ffffffff805026a2 18.7268%
    0xffffffff805026a2 <dev_gro_receive+498>:       repz cmpsb %es:(%rdi),%ds:(%rsi)
    0xffffffff805026a2 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2450).
      2445                            count++;
      2446
      2447                            if (!NAPI_GRO_CB(p)->same_flow)
      2448                                    continue;
      2449
      2450                            if (p->mac_len != mac_len ||
      2451                                memcmp(skb_mac_header(p), mac, mac_len))
      2452                                    NAPI_GRO_CB(p)->same_flow = 0;
      2453                    }
      2454

  ffffffff805026a4 13.4734%
    0xffffffff805026a4 <dev_gro_receive+500>:       je     0xffffffff805025c8 <dev_gro_receive+280>
      (gdb) list *(0xffffffff805026a4)
      0xffffffff805026a4 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2450).
      2445                            count++;
      2446
      2447                            if (!NAPI_GRO_CB(p)->same_flow)
      2448                                    continue;
      2449
      2450                            if (p->mac_len != mac_len ||
      2451                                memcmp(skb_mac_header(p), mac, mac_len))
      2452                                    NAPI_GRO_CB(p)->same_flow = 0;

  ffffffff805025c8 9.3943%
    0xffffffff805025c8 <dev_gro_receive+280>:       mov    (%r9),%r9
    0xffffffff805025c8 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2444).
      2439                    skb->mac_len = mac_len;
      2440                    NAPI_GRO_CB(skb)->same_flow = 0;
      2441                    NAPI_GRO_CB(skb)->flush = 0;
      2442                    NAPI_GRO_CB(skb)->free = 0;
      2443
      2444                    for (p = napi->gro_list; p; p = p->next) {
      2445                            count++;
      2446
      2447                            if (!NAPI_GRO_CB(p)->same_flow)
      2448                                    continue;

  ffffffff805025f9 7.3548%
    0xffffffff805025f9 <dev_gro_receive+329>:       je     0xffffffff80502614 <dev_gro_receive+356>
    0xffffffff805025f9 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2466).
      2461                    goto normal;
      2462
      2463            same_flow = NAPI_GRO_CB(skb)->same_flow;
      2464            ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
      2465
      2466            if (pp) {
      2467                    struct sk_buff *nskb = *pp;
      2468
      2469                    *pp = nskb->next;
      2470                    nskb->next = NULL;


tcp_gro_receive 3.0960 ffffffff80528df0
  ffffffff80528f2b 16.3527%
    0xffffffff80528f2b <tcp_gro_receive+315>:       repz cmpsb %es:(%rdi),%ds:(%rsi)
    0xffffffff80528f2b is in tcp_gro_receive (/mnt/net-2.6/net/ipv4/tcp.c:2521).
      2516            flush = NAPI_GRO_CB(p)->flush;
      2517            flush |= flags & TCP_FLAG_CWR;
      2518            flush |= (flags ^ tcp_flag_word(th2)) &
      2519                      ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
      2520            flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
      2521            flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
      2522
      2523            total = skb_gro_len(p);
      2524            mss = skb_shinfo(p)->gso_size;

  ffffffff80528f2d 15.9759%
    0xffffffff80528f2d <tcp_gro_receive+317>:       mov    0x60(%r8),%edi
      0xffffffff80528f2d is in tcp_gro_receive (/mnt/net-2.6/include/linux/netdevice.h:1101).
      1096            return NAPI_GRO_CB(skb)->data_offset;
      1097    }
      1098
      1099    static inline unsigned int skb_gro_len(const struct sk_buff *skb)
      1100    {
      1101            return skb->len - NAPI_GRO_CB(skb)->data_offset;
      1102    }
      1103
      1104    static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
      1105    {

  ffffffff80528f31 13.7905%
    0xffffffff80528f31 <tcp_gro_receive+321>:       setb   %al
    0xffffffff80528f31 is in tcp_gro_receive (/mnt/net-2.6/net/ipv4/tcp.c:2521).
      2516            flush = NAPI_GRO_CB(p)->flush;
      2517            flush |= flags & TCP_FLAG_CWR;
      2518            flush |= (flags ^ tcp_flag_word(th2)) &
      2519                      ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
      2520            flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
      2521            flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
      2522
      2523            total = skb_gro_len(p);
      2524            mss = skb_shinfo(p)->gso_size;

napi_fraginfo_skb 2.7647 ffffffff80501dd0
  ffffffff80501f16 65.2321%
    0xffffffff80501f16 <napi_fraginfo_skb+326>:     mov    %eax,0x6c(%rbx)
    0xffffffff80501f16 is in napi_fraginfo_skb (/mnt/net-2.6/net/core/dev.c:2606).
      2601             * special handling.  We'll fix it up properly at the end.
      2602             */
      2603            skb->protocol = eth->h_proto;
      2604
      2605            skb->ip_summed = info->ip_summed;
      2606            skb->csum = info->csum;
      2607
      2608    out:
      2609            return skb;
      2610    }

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-20 10:14 Divy Le Ray
@ 2009-01-21  8:29 ` Herbert Xu
  2009-01-22  9:42   ` Divy Le Ray
  0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-21  8:29 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: netdev

On Tue, Jan 20, 2009 at 02:14:19AM -0800, Divy Le Ray wrote:
> 
> Here is the patch:

Thanks, I'll add it to my series and push it later.

> Here is the non detailed opreport output for the CPU managing the reception
> of netperf traffic:

This is very helpful! Now some of the extra cost of GRO is due
to extra checks which are unavoidable.  However, there is a lot
of microoptimisations that we can perform to minimise the cost
and also reduce the cost of some existing checks.

Please try the following patch (compile tested only since I'm
at LCA) and let me know if it improves things.

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 1cb0f0b..a1f17ab 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -184,4 +184,25 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 }
 #endif	/* __KERNEL__ */
 
+/**
+ * compare_ether_header - Compare two Ethernet headers
+ * @a: Pointer to Ethernet header
+ * @b: Pointer to Ethernet header
+ *
+ * Compare two ethernet headers, returns 0 if equal.
+ * This assumes that the network header (i.e., IP header) is 4-byte
+ * aligned OR the platform can handle unaligned access.  This is the
+ * case for all packets coming into netif_receive_skb or similar
+ * entry points.
+ */
+
+static inline int compare_ether_header(const void *a, const void *b)
+{
+	u32 *a32 = (u32 *)((u8 *)a + 2);
+	u32 *b32 = (u32 *)((u8 *)b + 2);
+
+	return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
+	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
+}
+
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5472328..89e5753 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -314,6 +314,9 @@ struct napi_struct {
 	spinlock_t		poll_lock;
 	int			poll_owner;
 #endif
+
+	unsigned int		gro_count;
+
 	struct net_device	*dev;
 	struct list_head	dev_list;
 	struct sk_buff		*gro_list;
@@ -1111,6 +1114,13 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb)
 	NAPI_GRO_CB(skb)->data_offset = 0;
 }
 
+static inline void *skb_gro_mac_header(struct sk_buff *skb)
+{
+	return skb_headlen(skb) ? skb_mac_header(skb) :
+	       page_address(skb_shinfo(skb)->frags[0].page) +
+	       skb_shinfo(skb)->frags[0].page_offset;
+}
+
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index fb0d16a..63f103b 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -85,7 +85,9 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
 		goto drop;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow = p->dev == skb->dev;
+		NAPI_GRO_CB(p)->same_flow =
+			p->dev == skb->dev && !compare_ether_header(
+				skb_mac_header(p), skb_gro_mac_header(skb));
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d55f725..fd5998a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -215,13 +215,6 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 }
 
-static inline void *skb_gro_mac_header(struct sk_buff *skb)
-{
-	return skb_headlen(skb) ? skb_mac_header(skb) :
-	       page_address(skb_shinfo(skb)->frags[0].page) +
-	       skb_shinfo(skb)->frags[0].page_offset;
-}
-
 /* Device list insertion */
 static int list_netdevice(struct net_device *dev)
 {
@@ -2378,6 +2371,7 @@ void napi_gro_flush(struct napi_struct *napi)
 		napi_gro_complete(skb);
 	}
 
+	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
 EXPORT_SYMBOL(napi_gro_flush);
@@ -2407,7 +2401,6 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	struct packet_type *ptype;
 	__be16 type = skb->protocol;
 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
-	int count = 0;
 	int same_flow;
 	int mac_len;
 	int ret;
@@ -2421,30 +2414,17 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
 		struct sk_buff *p;
-		void *mac;
 
 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
 			continue;
 
 		skb_set_network_header(skb, skb_gro_offset(skb));
-		mac = skb_gro_mac_header(skb);
 		mac_len = skb->network_header - skb->mac_header;
 		skb->mac_len = mac_len;
 		NAPI_GRO_CB(skb)->same_flow = 0;
 		NAPI_GRO_CB(skb)->flush = 0;
 		NAPI_GRO_CB(skb)->free = 0;
 
-		for (p = napi->gro_list; p; p = p->next) {
-			count++;
-
-			if (!NAPI_GRO_CB(p)->same_flow)
-				continue;
-
-			if (p->mac_len != mac_len ||
-			    memcmp(skb_mac_header(p), mac, mac_len))
-				NAPI_GRO_CB(p)->same_flow = 0;
-		}
-
 		pp = ptype->gro_receive(&napi->gro_list, skb);
 		break;
 	}
@@ -2462,15 +2442,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 		*pp = nskb->next;
 		nskb->next = NULL;
 		napi_gro_complete(nskb);
-		count--;
+		napi->gro_count--;
 	}
 
 	if (same_flow)
 		goto ok;
 
-	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS)
+	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
 		goto normal;
 
+	napi->gro_count++;
 	NAPI_GRO_CB(skb)->count = 1;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	skb->next = napi->gro_list;
@@ -2490,7 +2471,8 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	struct sk_buff *p;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow = 1;
+		NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
+			skb_mac_header(p), skb_gro_mac_header(skb));
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
@@ -2714,6 +2696,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
 	INIT_LIST_HEAD(&napi->poll_list);
+	napi->gro_count = 0;
 	napi->gro_list = NULL;
 	napi->skb = NULL;
 	napi->poll = poll;
@@ -2742,6 +2725,7 @@ void netif_napi_del(struct napi_struct *napi)
 	}
 
 	napi->gro_list = NULL;
+	napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
 
@@ -5208,6 +5192,7 @@ static int __init net_dev_init(void)
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
 		queue->backlog.gro_list = NULL;
+		queue->backlog.gro_count = 0;
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d6770f2..f011ce6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1264,7 +1264,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (!ops || !ops->gro_receive)
 		goto out_unlock;
 
-	if (iph->version != 4 || iph->ihl != 5)
+	if (*(u8 *)iph != 0x45)
 		goto out_unlock;
 
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
@@ -1282,17 +1282,18 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 
 		iph2 = ip_hdr(p);
 
-		if (iph->protocol != iph2->protocol ||
-		    iph->tos != iph2->tos ||
-		    memcmp(&iph->saddr, &iph2->saddr, 8)) {
+		if ((iph->protocol ^ iph2->protocol) |
+		    (iph->tos ^ iph2->tos) |
+		    (iph->saddr ^ iph2->saddr) |
+		    (iph->daddr ^ iph2->daddr)) {
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
 
 		/* All fields must match except length and checksum. */
 		NAPI_GRO_CB(p)->flush |=
-			memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
-			(u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
+			(iph->ttl ^ iph2->ttl) |
+			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 
 		NAPI_GRO_CB(p)->flush |= flush;
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0191d39..a1d1f3b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2469,9 +2469,9 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	struct tcphdr *th2;
 	unsigned int thlen;
 	unsigned int flags;
-	unsigned int total;
 	unsigned int mss = 1;
 	int flush = 1;
+	int i;
 
 	th = skb_gro_header(skb, sizeof(*th));
 	if (unlikely(!th))
@@ -2495,7 +2495,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 		th2 = tcp_hdr(p);
 
-		if (th->source != th2->source || th->dest != th2->dest) {
+		if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) {
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
@@ -2510,14 +2510,15 @@ found:
 	flush |= flags & TCP_FLAG_CWR;
 	flush |= (flags ^ tcp_flag_word(th2)) &
 		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
-	flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
-	flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
+	flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window);
+	for (i = sizeof(*th); !flush && i < thlen; i += 4)
+		flush |= *(u32 *)((u8 *)th + i) ^
+			 *(u32 *)((u8 *)th2 + i);
 
-	total = skb_gro_len(p);
 	mss = skb_shinfo(p)->gso_size;
 
-	flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb);
-	flush |= ntohl(th2->seq) + total != ntohl(th->seq);
+	flush |= (skb_gro_len(skb) > mss) | !skb_gro_len(skb);
+	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
 
 	if (flush || skb_gro_receive(head, skb)) {
 		mss = 1;

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-21  8:29 ` Herbert Xu
@ 2009-01-22  9:42   ` Divy Le Ray
  2009-02-16  3:36     ` Herbert Xu
  0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-22  9:42 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:


> there is a lot
> of microoptimisations that we can perform to minimise the cost
> and also reduce the cost of some existing checks.
> 
> Please try the following patch (compile tested only since I'm
> at LCA) and let me know if it improves things.
> 

I'm now getting about an average 5.7Gbs, oprofile typically showing:

      37.294500  copy_user_generic_unrolled          vmlinux
       8.733800  process_responses                  cxgb3.ko
       6.123700  refill_fl                          cxgb3.ko
       5.006900  put_page                            vmlinux
       3.400700  napi_fraginfo_skb                   vmlinux
       2.484600  tcp_gro_receive                     vmlinux
       2.308900  inet_gro_receive                    vmlinux
       2.196000  free_hot_cold_page                  vmlinux
       2.032900  skb_gro_header                      vmlinux
       1.970100  get_page_from_freelist              vmlinux
       1.869700  skb_copy_datagram_iovec             vmlinux
       1.380300  dev_gro_receive                     vmlinux
       1.242300  tcp_recvmsg                         vmlinux
       1.079200  irq_entries_start                   vmlinux
       1.003900  get_pageblock_flags_group           vmlinux
       0.991300  skb_gro_receive                     vmlinux
       0.878400  _raw_spin_lock                      vmlinux
       0.878400  memcpy                              vmlinux
       0.840800  kfree                               vmlinux
       0.828200  list_del                            vmlinux

Cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-01-22  9:42   ` Divy Le Ray
@ 2009-02-16  3:36     ` Herbert Xu
  2009-02-16  3:47       ` Divy Le Ray
  2009-03-13  7:28       ` Divy Le Ray
  0 siblings, 2 replies; 23+ messages in thread
From: Herbert Xu @ 2009-02-16  3:36 UTC (permalink / raw)
  To: Divy Le Ray; +Cc: netdev

On Thu, Jan 22, 2009 at 01:42:22AM -0800, Divy Le Ray wrote:
>
> I'm now getting about an average 5.7Gbs, oprofile typically showing:
> 
>       37.294500  copy_user_generic_unrolled          vmlinux
>        8.733800  process_responses                  cxgb3.ko
>        6.123700  refill_fl                          cxgb3.ko
>        5.006900  put_page                            vmlinux
>        3.400700  napi_fraginfo_skb                   vmlinux
>        2.484600  tcp_gro_receive                     vmlinux
>        2.308900  inet_gro_receive                    vmlinux
>        2.196000  free_hot_cold_page                  vmlinux
>        2.032900  skb_gro_header                      vmlinux
>        1.970100  get_page_from_freelist              vmlinux
>        1.869700  skb_copy_datagram_iovec             vmlinux
>        1.380300  dev_gro_receive                     vmlinux
>        1.242300  tcp_recvmsg                         vmlinux
>        1.079200  irq_entries_start                   vmlinux
>        1.003900  get_pageblock_flags_group           vmlinux
>        0.991300  skb_gro_receive                     vmlinux
>        0.878400  _raw_spin_lock                      vmlinux
>        0.878400  memcpy                              vmlinux

When you can get a chance can you see if this patch makes any
difference at all?

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d7efaf9..6a542fa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2586,8 +2586,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
 	struct sk_buff *p = *head;
 	struct sk_buff *nskb;
+	skb_frag_t *frag;
 	unsigned int headroom;
 	unsigned int len = skb_gro_len(skb);
+	int i;
 
 	if (p->len + len >= 65536)
 		return -E2BIG;
@@ -2604,9 +2606,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		skb_shinfo(skb)->frags[0].size -=
 			skb_gro_offset(skb) - skb_headlen(skb);
 
-		memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
-		       skb_shinfo(skb)->frags,
-		       skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
+		frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags;
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			*frag++ = skb_shinfo(skb)->frags[i];
 
 		skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
 		skb_shinfo(skb)->nr_frags = 0;

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-02-16  3:36     ` Herbert Xu
@ 2009-02-16  3:47       ` Divy Le Ray
  2009-03-13  7:28       ` Divy Le Ray
  1 sibling, 0 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-02-16  3:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:

> When you can get a chance can you see if this patch makes any
> difference at all?

Hi Herbert,

Thanks for the patch.
I should be able to test within the next few days.

cheers,
Divy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-02-16  3:36     ` Herbert Xu
  2009-02-16  3:47       ` Divy Le Ray
@ 2009-03-13  7:28       ` Divy Le Ray
  2009-04-13 15:24         ` Herbert Xu
  1 sibling, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-03-13  7:28 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev


> When you can get a chance can you see if this patch makes any
> difference at all?

Hi Herbert,

Sorry for the delay.
I had to switch to different development platforms.
I've not seen much perf change with this patch, it looks good though.

Cheers,
Divy

> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index d7efaf9..6a542fa 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2586,8 +2586,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
>  {
>  	struct sk_buff *p = *head;
>  	struct sk_buff *nskb;
> +	skb_frag_t *frag;
>  	unsigned int headroom;
>  	unsigned int len = skb_gro_len(skb);
> +	int i;
>  
>  	if (p->len + len >= 65536)
>  		return -E2BIG;
> @@ -2604,9 +2606,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
>  		skb_shinfo(skb)->frags[0].size -=
>  			skb_gro_offset(skb) - skb_headlen(skb);
>  
> -		memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
> -		       skb_shinfo(skb)->frags,
> -		       skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
> +		frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags;
> +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +			*frag++ = skb_shinfo(skb)->frags[i];
>  
>  		skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
>  		skb_shinfo(skb)->nr_frags = 0;
> 
> Thanks,


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: cxgb3: Replace LRO with GRO
  2009-03-13  7:28       ` Divy Le Ray
@ 2009-04-13 15:24         ` Herbert Xu
  0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-04-13 15:24 UTC (permalink / raw)
  To: Divy Le Ray, David S. Miller; +Cc: netdev

On Fri, Mar 13, 2009 at 12:28:52AM -0700, Divy Le Ray wrote:
>
> I've not seen much perf change with this patch, it looks good though.

Thanks.  I decided to get rid of the copy altogether after I got
my hands on a cxgb3 machine (well, remotely anyway).  It seems to
do the trick for me as both LRO and GRO now produce about the
same performance with the CPU 100% maxed out.

Here's the patch.  Let me know how it fares on your machine.

gro: New frags interface to avoid copying shinfo

It turns out that copying a 16-byte area at ~800k times a second
can be really expensive :) This patch redesigns the frags GRO
interface to avoid copying that area twice.

The two disciples of the frags interface have been converted.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 714df2b..322434a 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -195,7 +195,7 @@ struct sge_qset {		/* an SGE queue set */
 	struct sge_rspq rspq;
 	struct sge_fl fl[SGE_RXQ_PER_SET];
 	struct sge_txq txq[SGE_TXQ_PER_SET];
-	struct napi_gro_fraginfo lro_frag_tbl;
+	int nomem;
 	int lro_enabled;
 	void *lro_va;
 	struct net_device *netdev;
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 26d3587..73d569e 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -654,7 +654,8 @@ static void t3_reset_qset(struct sge_qset *q)
 	q->txq_stopped = 0;
 	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
 	q->rx_reclaim_timer.function = NULL;
-	q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0;
+	q->nomem = 0;
+	napi_free_frags(&q->napi);
 }
 
 
@@ -2074,20 +2075,19 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 			 struct sge_fl *fl, int len, int complete)
 {
 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
+	struct sk_buff *skb = NULL;
 	struct cpl_rx_pkt *cpl;
-	struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags;
-	int nr_frags = qs->lro_frag_tbl.nr_frags;
-	int frag_len = qs->lro_frag_tbl.len;
+	struct skb_frag_struct *rx_frag;
+	int nr_frags;
 	int offset = 0;
 
-	if (!nr_frags) {
-		offset = 2 + sizeof(struct cpl_rx_pkt);
-		qs->lro_va = cpl = sd->pg_chunk.va + 2;
+	if (!qs->nomem) {
+		skb = napi_get_frags(&qs->napi);
+		qs->nomem = !skb;
 	}
 
 	fl->credits--;
 
-	len -= offset;
 	pci_dma_sync_single_for_cpu(adap->pdev,
 				    pci_unmap_addr(sd, dma_addr),
 				    fl->buf_size - SGE_PG_RSVD,
@@ -2100,21 +2100,38 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 			       fl->alloc_size,
 			       PCI_DMA_FROMDEVICE);
 
+	if (!skb) {
+		put_page(sd->pg_chunk.page);
+		if (complete)
+			qs->nomem = 0;
+		return;
+	}
+
+	rx_frag = skb_shinfo(skb)->frags;
+	nr_frags = skb_shinfo(skb)->nr_frags;
+
+	if (!nr_frags) {
+		offset = 2 + sizeof(struct cpl_rx_pkt);
+		qs->lro_va = sd->pg_chunk.va + 2;
+	}
+	len -= offset;
+
 	prefetch(qs->lro_va);
 
 	rx_frag += nr_frags;
 	rx_frag->page = sd->pg_chunk.page;
 	rx_frag->page_offset = sd->pg_chunk.offset + offset;
 	rx_frag->size = len;
-	frag_len += len;
-	qs->lro_frag_tbl.nr_frags++;
-	qs->lro_frag_tbl.len = frag_len;
 
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += len;
+	skb_shinfo(skb)->nr_frags++;
 
 	if (!complete)
 		return;
 
-	qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	cpl = qs->lro_va;
 
 	if (unlikely(cpl->vlan_valid)) {
@@ -2123,15 +2140,11 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 		struct vlan_group *grp = pi->vlan_grp;
 
 		if (likely(grp != NULL)) {
-			vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan),
-				       &qs->lro_frag_tbl);
-			goto out;
+			vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan));
+			return;
 		}
 	}
-	napi_gro_frags(&qs->napi, &qs->lro_frag_tbl);
-
-out:
-	qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0;
+	napi_gro_frags(&qs->napi);
 }
 
 /**
@@ -2300,8 +2313,6 @@ no_mem:
 			if (fl->use_pages) {
 				void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
 
-				prefetch(&qs->lro_frag_tbl);
-
 				prefetch(addr);
 #if L1_CACHE_BYTES < 128
 				prefetch(addr + L1_CACHE_BYTES);
diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c
index 66d7fe3..01f9432 100644
--- a/drivers/net/sfc/rx.c
+++ b/drivers/net/sfc/rx.c
@@ -450,17 +450,27 @@ static void efx_rx_packet_lro(struct efx_channel *channel,
 
 	/* Pass the skb/page into the LRO engine */
 	if (rx_buf->page) {
-		struct napi_gro_fraginfo info;
+		struct sk_buff *skb = napi_get_frags(napi);
 
-		info.frags[0].page = rx_buf->page;
-		info.frags[0].page_offset = efx_rx_buf_offset(rx_buf);
-		info.frags[0].size = rx_buf->len;
-		info.nr_frags = 1;
-		info.ip_summed = CHECKSUM_UNNECESSARY;
-		info.len = rx_buf->len;
+		if (!skb) {
+			put_page(rx_buf->page);
+			goto out;
+		}
+
+		skb_shinfo(skb)->frags[0].page = rx_buf->page;
+		skb_shinfo(skb)->frags[0].page_offset =
+			efx_rx_buf_offset(rx_buf);
+		skb_shinfo(skb)->frags[0].size = rx_buf->len;
+		skb_shinfo(skb)->nr_frags = 1;
+
+		skb->len = rx_buf->len;
+		skb->data_len = rx_buf->len;
+		skb->truesize += rx_buf->len;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-		napi_gro_frags(napi, &info);
+		napi_gro_frags(napi);
 
+out:
 		EFX_BUG_ON_PARANOID(rx_buf->skb);
 		rx_buf->page = NULL;
 	} else {
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index e1ff5b1..7ff9af1 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -118,8 +118,7 @@ extern int vlan_hwaccel_do_receive(struct sk_buff *skb);
 extern int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
 			    unsigned int vlan_tci, struct sk_buff *skb);
 extern int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-			  unsigned int vlan_tci,
-			  struct napi_gro_fraginfo *info);
+			  unsigned int vlan_tci);
 
 #else
 static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -154,8 +153,7 @@ static inline int vlan_gro_receive(struct napi_struct *napi,
 }
 
 static inline int vlan_gro_frags(struct napi_struct *napi,
-				 struct vlan_group *grp, unsigned int vlan_tci,
-				 struct napi_gro_fraginfo *info)
+				 struct vlan_group *grp, unsigned int vlan_tci)
 {
 	return NET_RX_DROP;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..54db3eb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1047,14 +1047,6 @@ struct packet_type {
 	struct list_head	list;
 };
 
-struct napi_gro_fraginfo {
-	skb_frag_t frags[MAX_SKB_FRAGS];
-	unsigned int nr_frags;
-	unsigned int ip_summed;
-	unsigned int len;
-	__wsum csum;
-};
-
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 
@@ -1442,12 +1434,18 @@ extern int		napi_gro_receive(struct napi_struct *napi,
 					 struct sk_buff *skb);
 extern void		napi_reuse_skb(struct napi_struct *napi,
 				       struct sk_buff *skb);
-extern struct sk_buff *	napi_fraginfo_skb(struct napi_struct *napi,
-					  struct napi_gro_fraginfo *info);
+extern struct sk_buff *	napi_get_frags(struct napi_struct *napi);
 extern int		napi_frags_finish(struct napi_struct *napi,
 					  struct sk_buff *skb, int ret);
-extern int		napi_gro_frags(struct napi_struct *napi,
-				       struct napi_gro_fraginfo *info);
+extern struct sk_buff *	napi_frags_skb(struct napi_struct *napi);
+extern int		napi_gro_frags(struct napi_struct *napi);
+
+static inline void napi_free_frags(struct napi_struct *napi)
+{
+	kfree_skb(napi->skb);
+	napi->skb = NULL;
+}
+
 extern void		netif_nit_deliver(struct sk_buff *skb);
 extern int		dev_valid_name(const char *name);
 extern int		dev_ioctl(struct net *net, unsigned int cmd, void __user *);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index c67fe6f..7f7de1a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -114,9 +114,9 @@ int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
 EXPORT_SYMBOL(vlan_gro_receive);
 
 int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-		   unsigned int vlan_tci, struct napi_gro_fraginfo *info)
+		   unsigned int vlan_tci)
 {
-	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+	struct sk_buff *skb = napi_frags_skb(napi);
 
 	if (!skb)
 		return NET_RX_DROP;
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..619fa14 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2519,16 +2519,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_reuse_skb);
 
-struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
-				  struct napi_gro_fraginfo *info)
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
 {
 	struct net_device *dev = napi->dev;
 	struct sk_buff *skb = napi->skb;
-	struct ethhdr *eth;
-	skb_frag_t *frag;
-	int i;
-
-	napi->skb = NULL;
 
 	if (!skb) {
 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
@@ -2536,47 +2530,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 			goto out;
 
 		skb_reserve(skb, NET_IP_ALIGN);
-	}
-
-	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
-	frag = &info->frags[info->nr_frags - 1];
 
-	for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
-		skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
-				   frag->size);
-		frag++;
+		napi->skb = skb;
 	}
-	skb_shinfo(skb)->nr_frags = info->nr_frags;
-
-	skb->data_len = info->len;
-	skb->len += info->len;
-	skb->truesize += info->len;
-
-	skb_reset_mac_header(skb);
-	skb_gro_reset_offset(skb);
-
-	eth = skb_gro_header(skb, sizeof(*eth));
-	if (!eth) {
-		napi_reuse_skb(napi, skb);
-		skb = NULL;
-		goto out;
-	}
-
-	skb_gro_pull(skb, sizeof(*eth));
-
-	/*
-	 * This works because the only protocols we care about don't require
-	 * special handling.  We'll fix it up properly at the end.
-	 */
-	skb->protocol = eth->h_proto;
-
-	skb->ip_summed = info->ip_summed;
-	skb->csum = info->csum;
 
 out:
 	return skb;
 }
-EXPORT_SYMBOL(napi_fraginfo_skb);
+EXPORT_SYMBOL(napi_get_frags);
 
 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
 {
@@ -2606,9 +2567,39 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
 }
 EXPORT_SYMBOL(napi_frags_finish);
 
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+	struct sk_buff *skb = napi->skb;
+	struct ethhdr *eth;
+
+	napi->skb = NULL;
+
+	skb_reset_mac_header(skb);
+	skb_gro_reset_offset(skb);
+
+	eth = skb_gro_header(skb, sizeof(*eth));
+	if (!eth) {
+		napi_reuse_skb(napi, skb);
+		skb = NULL;
+		goto out;
+	}
+
+	skb_gro_pull(skb, sizeof(*eth));
+
+	/*
+	 * This works because the only protocols we care about don't require
+	 * special handling.  We'll fix it up properly at the end.
+	 */
+	skb->protocol = eth->h_proto;
+
+out:
+	return skb;
+}
+EXPORT_SYMBOL(napi_frags_skb);
+
+int napi_gro_frags(struct napi_struct *napi)
 {
-	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+	struct sk_buff *skb = napi_frags_skb(napi);
 
 	if (!skb)
 		return NET_RX_DROP;
@@ -2712,7 +2703,7 @@ void netif_napi_del(struct napi_struct *napi)
 	struct sk_buff *skb, *next;
 
 	list_del_init(&napi->dev_list);
-	kfree_skb(napi->skb);
+	napi_free_frags(napi);
 
 	for (skb = napi->gro_list; skb; skb = next) {
 		next = skb->next;

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2009-04-13 15:24 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-15 21:14 cxgb3: Replace LRO with GRO Divy Le Ray
2009-01-15 23:58 ` Herbert Xu
2009-01-16  8:06   ` Divy Le Ray
2009-01-16  8:56     ` Herbert Xu
2009-01-16 11:12       ` Divy Le Ray
2009-01-16 23:58         ` Herbert Xu
2009-01-17  5:08         ` Herbert Xu
2009-01-17 11:11           ` Divy Le Ray
2009-01-17 13:08             ` Herbert Xu
2009-01-18 20:33               ` Divy Le Ray
2009-01-18 22:50                 ` Herbert Xu
2009-01-20  1:03 ` David Miller
2009-01-20  2:03   ` David Miller
2009-01-20  5:24     ` Herbert Xu
2009-01-20 10:04     ` Divy Le Ray
  -- strict thread matches above, loose matches on Subject: below --
2009-01-20 10:14 Divy Le Ray
2009-01-21  8:29 ` Herbert Xu
2009-01-22  9:42   ` Divy Le Ray
2009-02-16  3:36     ` Herbert Xu
2009-02-16  3:47       ` Divy Le Ray
2009-03-13  7:28       ` Divy Le Ray
2009-04-13 15:24         ` Herbert Xu
2009-01-13  9:26 [1/2] e1000e: Invoke VLAN GRO handler Herbert Xu
2009-01-15  6:59 ` cxgb3: Replace LRO with GRO Herbert Xu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).