From: divy@chelsio.com
To: jeff@garzik.org
Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
swise@opengridcomputing.com
Subject: [PATCH 7/7] cxgb3 - Add SW LRO support
Date: Thu, 22 Feb 2007 03:59:52 -0800 [thread overview]
Message-ID: <20070222115952.10091.80725.stgit@localhost.localdomain> (raw)
From: Divy Le Ray <divy@chelsio.com>
Add all-in-software LRO (large receive offload) support.
Signed-off-by: Divy Le Ray <divy@chelsio.com>
---
drivers/net/cxgb3/adapter.h | 21 ++
drivers/net/cxgb3/common.h | 1
drivers/net/cxgb3/cxgb3_ioctl.h | 1
drivers/net/cxgb3/cxgb3_main.c | 16 ++
drivers/net/cxgb3/sge.c | 341 ++++++++++++++++++++++++++++++++++++++-
drivers/net/cxgb3/t3_cpl.h | 10 +
6 files changed, 384 insertions(+), 6 deletions(-)
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 80c3d8f..576db4a 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -95,6 +95,23 @@ struct sge_fl { /* SGE per free-buffer
unsigned long alloc_failed; /* # of times buffer allocation failed */
};
+/*
+ * Max active LRO sessions per queue set.
+ * Must be a power of two: the session-table index helpers in sge.c wrap
+ * indices by masking with (MAX_LRO_PER_QSET - 1).
+ */
+#define MAX_LRO_PER_QSET 8
+/*
+ * State of one TCP flow currently being aggregated by software LRO.
+ */
+struct sge_lro_session {
+ struct sk_buff *skb; /* head skb of the aggregate; NULL when the slot is free */
+ struct sk_buff *skb_last_frag; /* last skb chained on the head's frag_list */
+ u32 seq; /* TCP sequence number expected from the next segment */
+ u16 iplen; /* IP total length of the aggregate built so far */
+};
+/*
+ * LRO state of one queue set.
+ */
+struct sge_lro {
+ unsigned int enabled; /* non-zero when LRO is attempted on this queue set */
+ unsigned int num_active; /* number of sessions whose skb is non-NULL */
+ struct sge_lro_session *last_s; /* session most recently returned by a lookup */
+ struct sge_lro_session s[MAX_LRO_PER_QSET]; /* session table, probed from a hash hint */
+};
+
/*
* Bundle size for grouping offload RX packets for delivery to the stack.
* Don't make this too big as we do prefetch on each packet in a bundle.
@@ -164,6 +181,9 @@ enum { /* per port SGE statistics */
SGE_PSTAT_TX_CSUM, /* # of TX checksum offloads */
SGE_PSTAT_VLANEX, /* # of VLAN tag extractions */
SGE_PSTAT_VLANINS, /* # of VLAN tag insertions */
+ SGE_PSTATS_LRO_QUEUED, /* # of LRO appended packets */
+ SGE_PSTATS_LRO_FLUSHED, /* # of LRO flushed packets */
+ SGE_PSTATS_LRO_X_STREAMS, /* # of exceeded LRO contexts */
SGE_PSTAT_MAX /* must be last */
};
@@ -171,6 +191,7 @@ enum { /* per port SGE statistics */
struct sge_qset { /* an SGE queue set */
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
+ struct sge_lro lro;
struct sge_txq txq[SGE_TXQ_PER_SET];
struct net_device *netdev; /* associated net device */
unsigned long txq_stopped; /* which Tx queues are stopped */
diff --git a/drivers/net/cxgb3/common.h b/drivers/net/cxgb3/common.h
index e23deeb..1031ad0 100644
--- a/drivers/net/cxgb3/common.h
+++ b/drivers/net/cxgb3/common.h
@@ -322,6 +322,7 @@ struct tp_params {
struct qset_params { /* SGE queue set parameters */
unsigned int polling; /* polling/interrupt service for rspq */
+ unsigned int lro; /* large receive offload */
unsigned int coalesce_usecs; /* irq coalescing timer */
unsigned int rspq_size; /* # of entries in response queue */
unsigned int fl_size; /* # of entries in regular free list */
diff --git a/drivers/net/cxgb3/cxgb3_ioctl.h b/drivers/net/cxgb3/cxgb3_ioctl.h
index 42d4bf6..52bb2be 100644
--- a/drivers/net/cxgb3/cxgb3_ioctl.h
+++ b/drivers/net/cxgb3/cxgb3_ioctl.h
@@ -90,6 +90,7 @@ struct ch_qset_params {
int32_t fl_size[2];
int32_t intr_lat;
int32_t polling;
+ int32_t lro;
int32_t cong_thres;
};
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 7ff834e..b78eefb 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -1031,7 +1031,11 @@ static char stats_strings[][ETH_GSTRING_
"VLANinsertions ",
"TxCsumOffload ",
"RxCsumGood ",
- "RxDrops "
+ "RxDrops ",
+
+ "LroQueued ",
+ "LroFlushed ",
+ "LroExceededSessions"
};
static int get_stats_count(struct net_device *dev)
@@ -1145,6 +1149,9 @@ static void get_stats(struct net_device
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM);
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD);
*data++ = s->rx_cong_drops;
+ *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTATS_LRO_QUEUED);
+ *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTATS_LRO_FLUSHED);
+ *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTATS_LRO_X_STREAMS);
}
static inline void reg_block_dump(struct adapter *ap, void *buf,
@@ -1624,6 +1631,12 @@ static int cxgb_extension_ioctl(struct n
}
}
}
+ if (t.lro >= 0) {
+ struct sge_qset *qs = &adapter->sge.qs[t.qset_idx];
+
+ q->lro = t.lro;
+ qs->lro.enabled = t.lro;
+ }
break;
}
case CHELSIO_GET_QSET_PARAMS:{
@@ -1643,6 +1656,7 @@ static int cxgb_extension_ioctl(struct n
t.fl_size[0] = q->fl_size;
t.fl_size[1] = q->jumbo_size;
t.polling = q->polling;
+ t.lro = q->lro;
t.intr_lat = q->coalesce_usecs;
t.cong_thres = q->cong_thres;
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 850bcc0..c4f8c6a 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -35,6 +35,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tcp.h>
+#include <net/tcp.h>
#include <linux/dma-mapping.h>
#include "common.h"
#include "regs.h"
@@ -1710,6 +1711,324 @@ static void rx_eth(struct adapter *adap,
netif_rx(skb);
}
+/*
+ * Helpers for locating headers and LRO session slots.
+ *
+ * IPH_OFFSET is the offset of the IP header from the start of a received
+ * buffer: a 2-byte pad, the CPL_RX_PKT header and the Ethernet header.
+ * The session table is indexed with the low bits of the RSS hash value
+ * (carried in skb->priority once an skb exists); the masking requires
+ * MAX_LRO_PER_QSET to be a power of two.
+ *
+ * All macro arguments are parenthesized so that expressions can be passed
+ * safely, and LRO_IDX_INC() expands to a single parenthesized expression.
+ */
+#define IPH_OFFSET (2 + sizeof(struct cpl_rx_pkt) + ETH_HLEN)
+#define SKB_HASHVAL(skb) ((skb)->priority)
+#define LRO_SESSION_IDX_HINT(skb) (SKB_HASHVAL(skb) & (MAX_LRO_PER_QSET - 1))
+#define LRO_SESSION_IDX_HINT_HASH(hash) ((hash) & (MAX_LRO_PER_QSET - 1))
+#define LRO_IDX_INC(idx) ((idx) = ((idx) + 1) & (MAX_LRO_PER_QSET - 1))
+/*
+ * Return the session stored in slot @idx of LRO table @l.
+ * No bounds check is made here; the callers in this file derive @idx by
+ * masking with (MAX_LRO_PER_QSET - 1).
+ */
+static inline struct sge_lro_session *lro_session(struct sge_lro *l, int idx)
+{
+ return l->s + idx;
+}
+/*
+ * Tell whether a packet belongs to the flow tracked by session @s.
+ * @iph and @tcph are the IP and TCP headers of the incoming packet; they are
+ * compared with the headers of the session's head skb, which are located at
+ * IPH_OFFSET assuming an option-less IP header (sizeof(struct iphdr)).
+ * The 32-bit load starting at ->source compares the source and destination
+ * ports (adjacent 16-bit fields of struct tcphdr) in one go.
+ * Returns non-zero when ports and both IP addresses match.
+ * @s must be an active session (s->skb is dereferenced).
+ */
+static inline int lro_match_session(struct sge_lro_session *s,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ struct iphdr *s_iph = (struct iphdr *)(s->skb->data + IPH_OFFSET);
+ struct tcphdr *s_tcph = (struct tcphdr *)(s_iph + 1);
+
+ return *(u32 *) & tcph->source == *(u32 *) & s_tcph->source &&
+ iph->saddr == s_iph->saddr && iph->daddr == s_iph->daddr;
+}
+/*
+ * Find the active session tracking the flow of @iph/@tcph.
+ * The scan starts at slot @idx (a hint the callers derive from the RSS hash),
+ * walks the table circularly and stops as soon as l->num_active in-use slots
+ * have been examined.  On a hit the session is also recorded in l->last_s.
+ * Returns the matching session, or NULL if no active session matches.
+ */
+static inline struct sge_lro_session *lro_find_session(struct sge_lro *l,
+ int idx,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ struct sge_lro_session *s;
+ int active = 0;
+
+ while (active < l->num_active) {
+ s = lro_session(l, idx);
+ if (s->skb) { /* slot in use */
+ if (lro_match_session(s, iph, tcph)) {
+ l->last_s = s;
+ return s;
+ }
+ active++;
+ }
+ LRO_IDX_INC(idx); /* next slot, wrapping around */
+ }
+
+ return NULL;
+}
+/*
+ * Start tracking a flow in session @s with @skb as its head packet.
+ * Records the packet's IP total length and the TCP sequence number expected
+ * from the next in-order segment, i.e. this segment's sequence number plus
+ * its payload length (the IP header is taken to be sizeof(struct iphdr)
+ * bytes long).
+ * s->skb_last_frag and the table's num_active count are not touched here.
+ */
+static inline void lro_new_session_init(struct sge_lro_session *s,
+ struct sk_buff *skb)
+{
+ struct iphdr *ih = (struct iphdr *)(skb->data + IPH_OFFSET);
+ struct tcphdr *th = (struct tcphdr *)(ih + 1);
+ int iplen = ntohs(ih->tot_len);
+
+ s->skb = skb;
+ s->iplen = iplen;
+ s->seq = ntohl(th->seq) + iplen - sizeof(*ih) - (th->doff << 2);
+}
+/*
+ * Hand the packet aggregated in session @s to rx_eth() and recycle the slot.
+ * Before delivery the IP header of the head skb is patched with the
+ * accumulated total length and a recomputed header checksum.
+ * If @skb is non-NULL the slot is immediately reused for a new session headed
+ * by @skb and num_active is left unchanged; if @skb is NULL the slot becomes
+ * free and num_active is decremented.
+ * @s must be active on entry (s->skb is dereferenced).
+ */
+static void lro_flush_session(struct adapter *adap, struct sge_qset *qs,
+ struct sge_lro_session *s, struct sk_buff *skb)
+{
+ struct sge_lro *l = &qs->lro;
+ struct iphdr *ih = (struct iphdr *)(s->skb->data + IPH_OFFSET);
+
+ ih->tot_len = htons(s->iplen);
+ ih->check = 0; /* checksum field must be zero while it is recomputed */
+ ih->check = ip_fast_csum((unsigned char *)ih, ih->ihl);
+
+ rx_eth(adap, &qs->rspq, s->skb, 2); /* 2 == pad in front of the CPL header */
+
+ s->skb = skb;
+ if (skb)
+ lro_new_session_init(s, skb);
+ else
+ l->num_active--;
+
+ qs->port_stats[SGE_PSTATS_LRO_FLUSHED]++;
+}
+/*
+ * Open a session for the flow of @skb and return it.
+ * The slot hinted by the skb's hash value is used if it is free.  If it is
+ * busy and all MAX_LRO_PER_QSET slots are in use, the session occupying the
+ * hinted slot is flushed and the slot is reused for @skb (counted in
+ * SGE_PSTATS_LRO_X_STREAMS).  If it is busy but a free slot remains, the
+ * table is probed circularly until that free slot is found.
+ */
+static inline struct sge_lro_session *lro_new_session(struct adapter *adap,
+ struct sge_qset *qs,
+ struct sk_buff *skb)
+{
+ struct sge_lro *l = &qs->lro;
+ int idx = LRO_SESSION_IDX_HINT(skb);
+ struct sge_lro_session *s = lro_session(l, idx);
+
+ if (likely(!s->skb))
+ goto done;
+
+ BUG_ON(l->num_active > MAX_LRO_PER_QSET);
+ if (l->num_active == MAX_LRO_PER_QSET) {
+ lro_flush_session(adap, qs, s, skb); /* evict and restart with @skb */
+ qs->port_stats[SGE_PSTATS_LRO_X_STREAMS]++;
+ return s;
+ }
+
+ while (1) { /* terminates: num_active < MAX_LRO_PER_QSET here */
+ LRO_IDX_INC(idx);
+ s = lro_session(l, idx);
+ if (!s->skb)
+ break;
+ }
+
+done:
+ lro_new_session_init(s, skb);
+
+ l->num_active++;
+ return s;
+}
+/*
+ * Flush every active LRO session of queue set @qs.
+ * The first slot examined is l->last_s (or slot 0 if no session has been
+ * matched yet); the walk then proceeds circularly from slot 1, wrapping to
+ * slot 0, until as many sessions as were active on entry have been flushed.
+ */
+static inline void sge_lro_flush_all(struct adapter *adap, struct sge_qset *qs)
+{
+ struct sge_lro *l = &qs->lro;
+ struct sge_lro_session *s = l->last_s;
+ int active = 0, idx = 0, num_active = l->num_active;
+
+ if (unlikely(!s))
+ s = lro_session(l, idx);
+
+ while (active < num_active) { /* num_active snapshot: flushing decrements l->num_active */
+ if (s->skb) {
+ lro_flush_session(adap, qs, s, NULL);
+ active++;
+ }
+ LRO_IDX_INC(idx);
+ s = lro_session(l, idx);
+ }
+}
+
+/*
+ * Check whether a received packet qualifies for LRO below the TCP layer.
+ * @cpl: CPL_RX_PKT header; the Ethernet frame follows it directly
+ * @rss_hi: first word of the response descriptor as read from the queue
+ *          (converted with ntohl() here); it carries the RSS hash type
+ *
+ * The packet qualifies when the hash type is RSS_HASH_4_TUPLE, the CPL
+ * checksum field is 0xffff, the frame is IPv4 and the IP header has no
+ * options.
+ * NOTE(review): byte 1 of the CPL header is additionally tested against the
+ * magic masks 0x90/0x10; presumably these are flag bits of struct
+ * cpl_rx_pkt - confirm against t3_cpl.h.
+ *
+ * Returns 1 if the packet may be aggregated, 0 otherwise.
+ */
+static inline int can_lro_packet(struct cpl_rx_pkt *cpl, unsigned int rss_hi)
+{
+	struct ethhdr *eh = (struct ethhdr *)(cpl + 1);
+	struct iphdr *ih = (struct iphdr *)(eh + 1);
+
+	/* h_proto is in network byte order: convert the constant with htons() */
+	if (unlikely(G_HASHTYPE(ntohl(rss_hi)) != RSS_HASH_4_TUPLE ||
+		     (*((u8 *) cpl + 1) & 0x90) != 0x10 ||
+		     cpl->csum != 0xffff || eh->h_proto != htons(ETH_P_IP) ||
+		     ih->ihl != (sizeof(*ih) >> 2))) {
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Check whether a TCP segment qualifies for LRO.
+ * @th: TCP header of the segment
+ *
+ * The flags byte (offset 13 of the TCP header) must have ACK set and
+ * FIN, SYN, RST, URG and CWR clear; PSH and ECE are ignored by the mask.
+ * The header must carry either no options at all or exactly one aligned
+ * timestamp option (NOP, NOP, TIMESTAMP).
+ *
+ * Returns 1 if the segment may be aggregated, 0 otherwise.
+ */
+static inline int can_lro_tcpsegment(struct tcphdr *th)
+{
+	int olen = (th->doff << 2) - sizeof(*th);
+	u8 control_bits = *((u8 *) th + 13);
+
+	if (unlikely((control_bits & 0xB7) != 0x10))
+		return 0;
+
+	if (olen) {
+		u32 *ptr = (u32 *) (th + 1);
+
+		/*
+		 * The option word is in network byte order, so the host-order
+		 * constant is converted with htonl() before the comparison.
+		 */
+		if (unlikely(olen != TCPOLEN_TSTAMP_ALIGNED ||
+			     *ptr != htonl((TCPOPT_NOP << 24) |
+					   (TCPOPT_NOP << 16) |
+					   (TCPOPT_TIMESTAMP << 8) |
+					   TCPOLEN_TIMESTAMP)))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Merge one in-order TCP segment into session @s.
+ * @s: active session to extend
+ * @va: start of the received buffer (2-byte pad, CPL header, Ethernet frame)
+ * @frag: page fragment describing the buffer; used only when @skb is NULL
+ * @skb: skb holding the packet, or NULL to append the payload as a page
+ *       fragment of the session's head skb
+ *
+ * The segment is rejected if the head packet is VLAN tagged and the tags
+ * differ, if its sequence number is not the one expected, or - when it
+ * carries a timestamp option - if its TSval is older than the session's or
+ * its TSecr is zero.  Otherwise the head TCP header takes over the segment's
+ * timestamps, ACK number and window, the length bookkeeping is advanced by
+ * the payload size, and the payload is attached either on the head skb's
+ * frag_list (@skb) or as a new page fragment (@frag).
+ *
+ * Callers validate the segment with can_lro_tcpsegment() first, so a
+ * non-zero option length means the aligned timestamp layout.  The head
+ * segment is assumed to use the same option layout.
+ *
+ * Returns 0 on success, -1 - leaving the session untouched - if the segment
+ * cannot be merged.
+ */
+static inline int lro_update_session(struct sge_lro_session *s,
+				     unsigned char *va,
+				     struct skb_frag_struct *frag,
+				     struct sk_buff *skb)
+{
+	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(s->skb->data + 2);
+	struct cpl_rx_pkt *ncpl = (struct cpl_rx_pkt *)(va + 2);
+	struct iphdr *nih = (struct iphdr *)(va + IPH_OFFSET);
+	struct tcphdr *th, *nth = (struct tcphdr *)(nih + 1);
+	u32 seq = ntohl(nth->seq);
+	int plen, tcpiphlen, olen = (nth->doff << 2) - sizeof(*nth);
+
+	if (cpl->vlan_valid && cpl->vlan != ncpl->vlan)
+		return -1;
+
+	if (unlikely(seq != s->seq))
+		return -1;
+
+	th = (struct tcphdr *)(s->skb->data + IPH_OFFSET +
+			       sizeof(struct iphdr));
+
+	if (olen) {
+		u32 *ptr = (u32 *) (th + 1), *nptr = (u32 *) (nth + 1);
+
+		/*
+		 * Word 1 of the option block is TSval, word 2 is TSecr.
+		 * TSval is a wrapping 32-bit counter, so "went backwards" is
+		 * tested with serial-number arithmetic (after()) instead of
+		 * a plain '>' that misfires when the counter wraps.
+		 */
+		if (unlikely(after(ntohl(*(ptr + 1)), ntohl(*(nptr + 1))) ||
+			     !*(nptr + 2)))
+			return -1;
+
+		*(ptr + 1) = *(nptr + 1);
+		*(ptr + 2) = *(nptr + 2);
+	}
+	th->ack_seq = nth->ack_seq;
+	th->window = nth->window;
+
+	tcpiphlen = (nth->doff << 2) + sizeof(*nih);
+	plen = ntohs(nih->tot_len) - tcpiphlen;
+	s->seq += plen;
+	s->iplen += plen;
+	s->skb->data_len += plen;
+	s->skb->len += plen;
+	s->skb->truesize += plen;
+
+	/* gso_size tracks the largest payload merged so far */
+	if (plen > skb_shinfo(s->skb)->gso_size)
+		skb_shinfo(s->skb)->gso_size = plen;
+
+	if (unlikely(skb)) {
+		/* strip everything but the TCP payload, then chain the skb */
+		skb_pull(skb, skb->len - plen);
+		if (unlikely(!skb_shinfo(s->skb)->frag_list))
+			skb_shinfo(s->skb)->frag_list = skb;
+		else
+			s->skb_last_frag->next = skb;
+		s->skb_last_frag = skb;
+	} else {
+		int nr = skb_shinfo(s->skb)->nr_frags;
+
+		/* reference only the payload part of the page */
+		skb_shinfo(s->skb)->frags[nr].page = frag->page;
+		skb_shinfo(s->skb)->frags[nr].page_offset =
+		    frag->page_offset + IPH_OFFSET + tcpiphlen;
+		skb_shinfo(s->skb)->frags[nr].size = plen;
+		skb_shinfo(s->skb)->nr_frags = ++nr;
+	}
+
+	return 0;
+}
+/*
+ * Try to append a packet that still lives in free-list page @p to an
+ * existing LRO session, as a page fragment of the session's head skb.
+ * @hash is the RSS hash value used as session-table hint; @csum is the word
+ * handed to can_lro_packet() for the hash-type check (the caller passes
+ * rss_hi).  *@lro is cleared when can_lro_packet() rejects the packet, so
+ * that the caller does not try the skb-based LRO path either.
+ * This function never opens a session.
+ * Returns 0 if the packet was consumed, -1 if the caller has to build an skb
+ * for it.  On a failure detected after a session was found the session is
+ * flushed, except when it already chains skbs on its frag_list.
+ */
+static inline int rx_eth_lro_page(struct adapter *adap, struct sge_qset *qs,
+ struct sge_fl_page *p, u32 hash, u32 csum,
+ int *lro)
+{
+ struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(p->va + 2);
+ struct iphdr *ih;
+ struct tcphdr *th;
+ struct sge_lro_session *s = NULL;
+
+ if (!can_lro_packet(cpl, csum)) {
+ *lro = 0;
+ goto no_lro;
+ }
+
+ ih = (struct iphdr *)(p->va + IPH_OFFSET);
+ th = (struct tcphdr *)(ih + 1);
+ s = lro_find_session(&qs->lro, LRO_SESSION_IDX_HINT_HASH(hash), ih, th);
+ if (unlikely(!s))
+ goto no_lro;
+
+ /* If we already started LRO via chaining skbs, keep doing it that way.
+ */
+ if (unlikely(skb_shinfo(s->skb)->frag_list))
+ return -1;
+
+ if (unlikely(!can_lro_tcpsegment(th)))
+ goto no_lro;
+
+ if (lro_update_session(s, p->va, &p->frag, NULL))
+ goto no_lro;
+
+ if (unlikely(skb_shinfo(s->skb)->nr_frags == MAX_SKB_FRAGS ||
+ s->skb->len + qs->netdev->mtu > 65535)) /* no room for another segment */
+ lro_flush_session(adap, qs, s, NULL);
+
+ qs->port_stats[SGE_PSTATS_LRO_QUEUED]++;
+
+ return 0;
+
+no_lro:
+ if (s)
+ lro_flush_session(adap, qs, s, NULL);
+
+ return -1;
+}
+/*
+ * LRO entry point for a packet that is already wrapped in @skb.
+ * A packet rejected by can_lro_packet() or can_lro_tcpsegment() is passed to
+ * rx_eth() as is, after flushing the session of its flow if one was found.
+ * Otherwise the skb either opens a new session, is chained to the session of
+ * its flow, or - if lro_update_session() refuses it - causes that session to
+ * be flushed and restarted with @skb as head.
+ * @ethpad is the offset of the CPL header within skb->data; note that
+ * IPH_OFFSET, used for the IP header, hard-codes a pad of 2.
+ */
+static void rx_eth_lro_skb(struct adapter *adap, struct sge_rspq *rq,
+ struct sk_buff *skb, int ethpad)
+{
+ struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(skb->data + ethpad);
+ struct sge_qset *qs = rspq_to_qset(rq);
+ struct iphdr *ih;
+ struct tcphdr *th;
+ struct sge_lro_session *s = NULL;
+
+ if (!can_lro_packet(cpl, skb->csum)) /* skb->csum carries rss_hi, set by the caller */
+ goto no_lro;
+
+ ih = (struct iphdr *)(skb->data + IPH_OFFSET);
+ th = (struct tcphdr *)(ih + 1);
+ s = lro_find_session(&qs->lro,
+ LRO_SESSION_IDX_HINT_HASH(skb->priority), ih, th);
+
+ if (unlikely(!can_lro_tcpsegment(th)))
+ goto no_lro;
+ else if (unlikely(!s))
+ s = lro_new_session(adap, qs, skb);
+ else {
+ if (lro_update_session(s, skb->data, NULL, skb)) {
+ lro_flush_session(adap, qs, s, skb); /* deliver old aggregate, restart with @skb */
+ return;
+ }
+
+ if (unlikely(s->skb->len + qs->netdev->mtu > 65535)) /* another segment might not fit */
+ lro_flush_session(adap, qs, s, NULL);
+ }
+
+ qs->port_stats[SGE_PSTATS_LRO_QUEUED]++;
+ return;
+
+no_lro:
+ if (s)
+ lro_flush_session(adap, qs, s, NULL);
+
+ rx_eth(adap, rq, skb, ethpad);
+}
+
#define SKB_DATA_SIZE 128
static void skb_data_init(struct sk_buff *skb, struct sge_fl_page *p,
@@ -1911,7 +2230,7 @@ static int process_responses(struct adap
q->next_holdoff = q->holdoff_tmr;
while (likely(budget_left && is_new_response(r, q))) {
- int eth, ethpad = 2;
+ int eth, ethpad = 2, lro = qs->lro.enabled;
struct sk_buff *skb = NULL;
u32 len, flags = ntohl(r->flags);
u32 rss_hi = *(const u32 *)r, rss_lo = r->rss_hdr.rss_hash_val;
@@ -1961,6 +2280,13 @@ static int process_responses(struct adap
if (unlikely(fl->credits <
SGE_RX_DROP_THRES))
goto eth_recycle;
+
+ if (likely(lro &&
+ !rx_eth_lro_page(adap, qs,
+ p, rss_lo,
+ rss_hi,
+ &lro)))
+ goto eth_done;
skb = alloc_skb(SKB_DATA_SIZE,
GFP_ATOMIC);
@@ -2016,9 +2342,12 @@ eth_done:
skb->csum = rss_hi;
skb->priority = rss_lo;
- if (eth)
- rx_eth(adap, q, skb, ethpad);
- else {
+ if (eth) {
+ if (likely(lro))
+ rx_eth_lro_skb(adap, q, skb, ethpad);
+ else
+ rx_eth(adap, q, skb, ethpad);
+ } else {
if (unlikely(r->rss_hdr.opcode ==
CPL_TRACE_PKT))
__skb_pull(skb, ethpad);
@@ -2030,7 +2359,7 @@ eth_done:
}
--budget_left;
}
-
+ sge_lro_flush_all(adap, qs);
deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
if (sleeping)
check_ring_db(adap, qs, sleeping);
@@ -2698,6 +3027,7 @@ int t3_sge_alloc_qset(struct adapter *ad
spin_unlock(&adapter->sge.reg_lock);
q->netdev = netdev;
t3_update_qset_coalesce(q, p);
+ q->lro.enabled = p->lro;
/*
* We use atalk_ptr as a backpointer to a qset. In case a device is
@@ -2839,6 +3169,7 @@ void __devinit t3_sge_prep(struct adapte
q->polling = adap->params.rev > 0;
q->coalesce_usecs = 5;
+ q->lro = 1;
q->rspq_size = 1024;
q->fl_size = 1024;
q->jumbo_size = 512;
diff --git a/drivers/net/cxgb3/t3_cpl.h b/drivers/net/cxgb3/t3_cpl.h
index b7a1a31..0f9f67d 100644
--- a/drivers/net/cxgb3/t3_cpl.h
+++ b/drivers/net/cxgb3/t3_cpl.h
@@ -174,6 +174,12 @@ enum { /* TCP congestion control algo
CONG_ALG_HIGHSPEED
};
+enum { /* RSS hash type, as extracted with G_HASHTYPE() */
+ RSS_HASH_NONE = 0,
+ RSS_HASH_2_TUPLE = 1 << 0,
+ RSS_HASH_4_TUPLE = 1 << 1 /* required by can_lro_packet() in sge.c */
+};
+
union opcode_tid {
__be32 opcode_tid;
__u8 opcode;
@@ -184,6 +190,10 @@ union opcode_tid {
#define G_OPCODE(x) (((x) >> S_OPCODE) & 0xFF)
#define G_TID(x) ((x) & 0xFFFFFF)
+#define S_HASHTYPE 22 /* bit position of the RSS hash-type field */
+#define M_HASHTYPE 0x3 /* mask of the field once shifted down (2 bits) */
+#define G_HASHTYPE(x) (((x) >> S_HASHTYPE) & M_HASHTYPE) /* extract the field; sge.c passes an ntohl()'d word */
+
/* tid is assumed to be 24-bits */
#define MK_OPCODE_TID(opcode, tid) (V_OPCODE(opcode) | (tid))
next reply other threads:[~2007-02-22 12:00 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-02-22 11:59 divy [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-02-25 0:44 [PATCH 7/7] cxgb3 - Add SW LRO support divy
2007-02-26 5:13 ` Christoph Hellwig
2007-02-27 0:55 ` Divy Le Ray
2007-02-27 14:17 ` Steve Wise
2007-02-27 14:53 ` Jeff Garzik
2007-02-27 14:57 ` Steve Wise
2007-02-27 16:29 ` Brice Goglin
2007-06-01 20:37 ` Jeff Garzik
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070222115952.10091.80725.stgit@localhost.localdomain \
--to=divy@chelsio.com \
--cc=jeff@garzik.org \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=swise@opengridcomputing.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.