* Re: cxgb3: Replace LRO with GRO
@ 2009-01-20 10:14 Divy Le Ray
2009-01-21 8:29 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-20 10:14 UTC (permalink / raw)
To: herbert; +Cc: netdev
Hi Herbert,
I have tried the following patch as an attempt to eliminate the memcpy
seen in the previous oprofile run. I'm now getting about 5.5 Gb/s.
After that, I went through the output of opreport -d to figure out
the most expensive instructions seen in my profiling.
Here is the patch:
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2554,6 +2554,8 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
struct net_device *dev = napi->dev;
struct sk_buff *skb = napi->skb;
struct ethhdr *eth;
+ skb_frag_t *frag;
+ int i;
napi->skb = NULL;
@@ -2566,9 +2568,15 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
}
BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
- skb_shinfo(skb)->nr_frags = info->nr_frags;
- memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
+ frag = &info->frags[info->nr_frags - 1];
+ for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
+ skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
+ frag->size);
+ frag++;
+ }
+ skb_shinfo(skb)->nr_frags = info->nr_frags;
+
skb->data_len = info->len;
skb->len += info->len;
skb->truesize += info->len;
Here is the non-detailed opreport output for the CPU managing the reception
of netperf traffic:
38.815300 copy_user_generic_unrolled vmlinux
6.373900 process_responses cxgb3.ko
4.957800 inet_gro_receive vmlinux
4.908800 put_page vmlinux
4.862100 refill_fl cxgb3.ko
3.774900 dev_gro_receive vmlinux
3.096000 tcp_gro_receive vmlinux
2.764700 napi_fraginfo_skb vmlinux
2.174400 free_hot_cold_page vmlinux
2.006400 skb_copy_datagram_iovec vmlinux
1.511800 tcp_recvmsg vmlinux
1.488500 get_page_from_freelist vmlinux
1.455800 irq_entries_start vmlinux
1.453500 skb_gro_header vmlinux
0.877200 get_pageblock_flags_group vmlinux
0.863200 memcpy_toiovec vmlinux
0.856200 _raw_spin_lock vmlinux
0.720900 memcpy vmlinux
0.711600 skb_gro_receive vmlinux
0.683600 kfree vmlinux
Here is more detailed info for each GRO function seen above:
- relative % for the most expensive instructions
- gdb disassemble output for these instructions
- gdb list output.
inet_gro_receive 4.9578 ffffffff805468c0
ffffffff80546a49 11.1059%
0xffffffff80546a49 <inet_gro_receive+393>: jne 0xffffffff805469e5 <inet_gro_receive+293>
0xffffffff80546a49 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1285).
1280 if (!NAPI_GRO_CB(p)->same_flow)
1281 continue;
1282
1283 iph2 = ip_hdr(p);
1284
1285 if (iph->protocol != iph2->protocol ||
1286 iph->tos != iph2->tos ||
1287 memcmp(&iph->saddr, &iph2->saddr, 8)) {
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
ffffffff80546a61 10.4000%
0xffffffff80546a61 <inet_gro_receive+417>: je 0xffffffff80546abb <inet_gro_receive+507>
0xffffffff80546a61 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1293).
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
1290 }
1291
1292 /* All fields must match except length and checksum. */
1293 NAPI_GRO_CB(p)->flush |=
1294 memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
1295 (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
ffffffff80546a58 8.2353%
0xffffffff80546a58 <inet_gro_receive+408>: mov %rdx,%rcx
0xffffffff80546a58 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1293).
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
1290 }
1291
1292 /* All fields must match except length and checksum. */
1293 NAPI_GRO_CB(p)->flush |=
1294 memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
1295 (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
1296
1297 NAPI_GRO_CB(p)->flush |= flush;
ffffffff80546abb 8.2353%
0xffffffff80546abb <inet_gro_receive+507>: movzwl 0x4(%r10),%eax
(gdb) list *(0xffffffff80546abb)
0xffffffff80546abb is in inet_gro_receive (/mnt/net-2.6/include/linux/swab.h:51).
46 static inline __attribute_const__ __u16 __fswab16(__u16 val)
47 {
48 #ifdef __arch_swab16
49 return __arch_swab16(val);
50 #else
51 return ___constant_swab16(val);
52 #endif
53 }
54
55 static inline __attribute_const__ __u32 __fswab32(__u32 val)
ffffffff80546a4b 8.1882%
0xffffffff80546a4b is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1293).
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
1290 }
1291
1292 /* All fields must match except length and checksum. */
1293 NAPI_GRO_CB(p)->flush |=
1294 memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
1295 (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
1296
1297 NAPI_GRO_CB(p)->flush |= flush;
ffffffff80546a47 7.5765%
0xffffffff80546a47 <inet_gro_receive+391>: repz cmpsb %es:(%rdi),%ds:(%rsi)
0xffffffff80546a47 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1285).
1280 if (!NAPI_GRO_CB(p)->same_flow)
1281 continue;
1282
1283 iph2 = ip_hdr(p);
1284
1285 if (iph->protocol != iph2->protocol ||
1286 iph->tos != iph2->tos ||
1287 memcmp(&iph->saddr, &iph2->saddr, 8)) {
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
ffffffff80546a44 7.1529%
0xffffffff80546a44 is in inet_gro_receive (/mnt/net-2.6/net/ipv4/af_inet.c:1285).
1280 if (!NAPI_GRO_CB(p)->same_flow)
1281 continue;
1282
1283 iph2 = ip_hdr(p);
1284
1285 if (iph->protocol != iph2->protocol ||
1286 iph->tos != iph2->tos ||
1287 memcmp(&iph->saddr, &iph2->saddr, 8)) {
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
dev_gro_receive 3.7749 ffffffff805024b0
ffffffff805026a2 18.7268%
0xffffffff805026a2 <dev_gro_receive+498>: repz cmpsb %es:(%rdi),%ds:(%rsi)
0xffffffff805026a2 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2450).
2445 count++;
2446
2447 if (!NAPI_GRO_CB(p)->same_flow)
2448 continue;
2449
2450 if (p->mac_len != mac_len ||
2451 memcmp(skb_mac_header(p), mac, mac_len))
2452 NAPI_GRO_CB(p)->same_flow = 0;
2453 }
2454
ffffffff805026a4 13.4734%
0xffffffff805026a4 <dev_gro_receive+500>: je 0xffffffff805025c8 <dev_gro_receive+280>
(gdb) list *(0xffffffff805026a4)
0xffffffff805026a4 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2450).
2445 count++;
2446
2447 if (!NAPI_GRO_CB(p)->same_flow)
2448 continue;
2449
2450 if (p->mac_len != mac_len ||
2451 memcmp(skb_mac_header(p), mac, mac_len))
2452 NAPI_GRO_CB(p)->same_flow = 0;
ffffffff805025c8 9.3943%
0xffffffff805025c8 <dev_gro_receive+280>: mov (%r9),%r9
0xffffffff805025c8 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2444).
2439 skb->mac_len = mac_len;
2440 NAPI_GRO_CB(skb)->same_flow = 0;
2441 NAPI_GRO_CB(skb)->flush = 0;
2442 NAPI_GRO_CB(skb)->free = 0;
2443
2444 for (p = napi->gro_list; p; p = p->next) {
2445 count++;
2446
2447 if (!NAPI_GRO_CB(p)->same_flow)
2448 continue;
ffffffff805025f9 7.3548%
0xffffffff805025f9 <dev_gro_receive+329>: je 0xffffffff80502614 <dev_gro_receive+356>
0xffffffff805025f9 is in dev_gro_receive (/mnt/net-2.6/net/core/dev.c:2466).
2461 goto normal;
2462
2463 same_flow = NAPI_GRO_CB(skb)->same_flow;
2464 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2465
2466 if (pp) {
2467 struct sk_buff *nskb = *pp;
2468
2469 *pp = nskb->next;
2470 nskb->next = NULL;
tcp_gro_receive 3.0960 ffffffff80528df0
ffffffff80528f2b 16.3527%
0xffffffff80528f2b <tcp_gro_receive+315>: repz cmpsb %es:(%rdi),%ds:(%rsi)
0xffffffff80528f2b is in tcp_gro_receive (/mnt/net-2.6/net/ipv4/tcp.c:2521).
2516 flush = NAPI_GRO_CB(p)->flush;
2517 flush |= flags & TCP_FLAG_CWR;
2518 flush |= (flags ^ tcp_flag_word(th2)) &
2519 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
2520 flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
2521 flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
2522
2523 total = skb_gro_len(p);
2524 mss = skb_shinfo(p)->gso_size;
ffffffff80528f2d 15.9759%
0xffffffff80528f2d <tcp_gro_receive+317>: mov 0x60(%r8),%edi
0xffffffff80528f2d is in tcp_gro_receive (/mnt/net-2.6/include/linux/netdevice.h:1101).
1096 return NAPI_GRO_CB(skb)->data_offset;
1097 }
1098
1099 static inline unsigned int skb_gro_len(const struct sk_buff *skb)
1100 {
1101 return skb->len - NAPI_GRO_CB(skb)->data_offset;
1102 }
1103
1104 static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
1105 {
ffffffff80528f31 13.7905%
0xffffffff80528f31 <tcp_gro_receive+321>: setb %al
0xffffffff80528f31 is in tcp_gro_receive (/mnt/net-2.6/net/ipv4/tcp.c:2521).
2516 flush = NAPI_GRO_CB(p)->flush;
2517 flush |= flags & TCP_FLAG_CWR;
2518 flush |= (flags ^ tcp_flag_word(th2)) &
2519 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
2520 flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
2521 flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
2522
2523 total = skb_gro_len(p);
2524 mss = skb_shinfo(p)->gso_size;
napi_fraginfo_skb 2.7647 ffffffff80501dd0
ffffffff80501f16 65.2321%
0xffffffff80501f16 <napi_fraginfo_skb+326>: mov %eax,0x6c(%rbx)
0xffffffff80501f16 is in napi_fraginfo_skb (/mnt/net-2.6/net/core/dev.c:2606).
2601 * special handling. We'll fix it up properly at the end.
2602 */
2603 skb->protocol = eth->h_proto;
2604
2605 skb->ip_summed = info->ip_summed;
2606 skb->csum = info->csum;
2607
2608 out:
2609 return skb;
2610 }
Cheers,
Divy
* Re: cxgb3: Replace LRO with GRO
2009-01-20 10:14 cxgb3: Replace LRO with GRO Divy Le Ray
@ 2009-01-21 8:29 ` Herbert Xu
2009-01-22 9:42 ` Divy Le Ray
0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-21 8:29 UTC (permalink / raw)
To: Divy Le Ray; +Cc: netdev
On Tue, Jan 20, 2009 at 02:14:19AM -0800, Divy Le Ray wrote:
>
> Here is the patch:
Thanks, I'll add it to my series and push it later.
> Here is the non-detailed opreport output for the CPU managing the reception
> of netperf traffic:
This is very helpful! Now some of the extra cost of GRO is due
to extra checks which are unavoidable. However, there are a lot
of micro-optimisations that we can perform to minimise the cost,
and we can also reduce the cost of some existing checks.
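To give a flavour of what I mean: most of the flow-matching checks below
are rewritten so that the fields are XORed and the results ORed together,
giving a single value to test instead of a chain of conditional branches.
A rough sketch of the pattern (the helper name is mine, for illustration
only, and is not part of the patch):

/* Illustrative only: returns non-zero if any of the compared IPv4 flow
 * fields differ.  The XOR/OR combination keeps the comparison branch-free
 * until the single final test. */
static inline u32 iph_flow_mismatch(const struct iphdr *iph,
                                    const struct iphdr *iph2)
{
        return (iph->protocol ^ iph2->protocol) |
               (iph->tos ^ iph2->tos) |
               (iph->saddr ^ iph2->saddr) |
               (iph->daddr ^ iph2->daddr);
}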
Please try the following patch (compile tested only since I'm
at LCA) and let me know if it improves things.
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 1cb0f0b..a1f17ab 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -184,4 +184,25 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
}
#endif /* __KERNEL__ */
+/**
+ * compare_ether_header - Compare two Ethernet headers
+ * @a: Pointer to Ethernet header
+ * @b: Pointer to Ethernet header
+ *
+ * Compare two ethernet headers, returns 0 if equal.
+ * This assumes that the network header (i.e., IP header) is 4-byte
+ * aligned OR the platform can handle unaligned access. This is the
+ * case for all packets coming into netif_receive_skb or similar
+ * entry points.
+ */
+
+static inline int compare_ether_header(const void *a, const void *b)
+{
+ u32 *a32 = (u32 *)((u8 *)a + 2);
+ u32 *b32 = (u32 *)((u8 *)b + 2);
+
+ return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
+ (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
+}
+
#endif /* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5472328..89e5753 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -314,6 +314,9 @@ struct napi_struct {
spinlock_t poll_lock;
int poll_owner;
#endif
+
+ unsigned int gro_count;
+
struct net_device *dev;
struct list_head dev_list;
struct sk_buff *gro_list;
@@ -1111,6 +1114,13 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb)
NAPI_GRO_CB(skb)->data_offset = 0;
}
+static inline void *skb_gro_mac_header(struct sk_buff *skb)
+{
+ return skb_headlen(skb) ? skb_mac_header(skb) :
+ page_address(skb_shinfo(skb)->frags[0].page) +
+ skb_shinfo(skb)->frags[0].page_offset;
+}
+
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index fb0d16a..63f103b 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -85,7 +85,9 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
goto drop;
for (p = napi->gro_list; p; p = p->next) {
- NAPI_GRO_CB(p)->same_flow = p->dev == skb->dev;
+ NAPI_GRO_CB(p)->same_flow =
+ p->dev == skb->dev && !compare_ether_header(
+ skb_mac_header(p), skb_gro_mac_header(skb));
NAPI_GRO_CB(p)->flush = 0;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index d55f725..fd5998a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -215,13 +215,6 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
}
-static inline void *skb_gro_mac_header(struct sk_buff *skb)
-{
- return skb_headlen(skb) ? skb_mac_header(skb) :
- page_address(skb_shinfo(skb)->frags[0].page) +
- skb_shinfo(skb)->frags[0].page_offset;
-}
-
/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
@@ -2378,6 +2371,7 @@ void napi_gro_flush(struct napi_struct *napi)
napi_gro_complete(skb);
}
+ napi->gro_count = 0;
napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
@@ -2407,7 +2401,6 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
struct packet_type *ptype;
__be16 type = skb->protocol;
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
- int count = 0;
int same_flow;
int mac_len;
int ret;
@@ -2421,30 +2414,17 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
struct sk_buff *p;
- void *mac;
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
continue;
skb_set_network_header(skb, skb_gro_offset(skb));
- mac = skb_gro_mac_header(skb);
mac_len = skb->network_header - skb->mac_header;
skb->mac_len = mac_len;
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
- for (p = napi->gro_list; p; p = p->next) {
- count++;
-
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- if (p->mac_len != mac_len ||
- memcmp(skb_mac_header(p), mac, mac_len))
- NAPI_GRO_CB(p)->same_flow = 0;
- }
-
pp = ptype->gro_receive(&napi->gro_list, skb);
break;
}
@@ -2462,15 +2442,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
*pp = nskb->next;
nskb->next = NULL;
napi_gro_complete(nskb);
- count--;
+ napi->gro_count--;
}
if (same_flow)
goto ok;
- if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS)
+ if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
goto normal;
+ napi->gro_count++;
NAPI_GRO_CB(skb)->count = 1;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
@@ -2490,7 +2471,8 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
struct sk_buff *p;
for (p = napi->gro_list; p; p = p->next) {
- NAPI_GRO_CB(p)->same_flow = 1;
+ NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
+ skb_mac_header(p), skb_gro_mac_header(skb));
NAPI_GRO_CB(p)->flush = 0;
}
@@ -2714,6 +2696,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
+ napi->gro_count = 0;
napi->gro_list = NULL;
napi->skb = NULL;
napi->poll = poll;
@@ -2742,6 +2725,7 @@ void netif_napi_del(struct napi_struct *napi)
}
napi->gro_list = NULL;
+ napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
@@ -5208,6 +5192,7 @@ static int __init net_dev_init(void)
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
+ queue->backlog.gro_count = 0;
}
dev_boot_phase = 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d6770f2..f011ce6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1264,7 +1264,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (!ops || !ops->gro_receive)
goto out_unlock;
- if (iph->version != 4 || iph->ihl != 5)
+ if (*(u8 *)iph != 0x45)
goto out_unlock;
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
@@ -1282,17 +1282,18 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
iph2 = ip_hdr(p);
- if (iph->protocol != iph2->protocol ||
- iph->tos != iph2->tos ||
- memcmp(&iph->saddr, &iph2->saddr, 8)) {
+ if ((iph->protocol ^ iph2->protocol) |
+ (iph->tos ^ iph2->tos) |
+ (iph->saddr ^ iph2->saddr) |
+ (iph->daddr ^ iph2->daddr)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
- memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
- (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
+ (iph->ttl ^ iph2->ttl) |
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0191d39..a1d1f3b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2469,9 +2469,9 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
struct tcphdr *th2;
unsigned int thlen;
unsigned int flags;
- unsigned int total;
unsigned int mss = 1;
int flush = 1;
+ int i;
th = skb_gro_header(skb, sizeof(*th));
if (unlikely(!th))
@@ -2495,7 +2495,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
th2 = tcp_hdr(p);
- if (th->source != th2->source || th->dest != th2->dest) {
+ if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -2510,14 +2510,15 @@ found:
flush |= flags & TCP_FLAG_CWR;
flush |= (flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
- flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
- flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
+ flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window);
+ for (i = sizeof(*th); !flush && i < thlen; i += 4)
+ flush |= *(u32 *)((u8 *)th + i) ^
+ *(u32 *)((u8 *)th2 + i);
- total = skb_gro_len(p);
mss = skb_shinfo(p)->gso_size;
- flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb);
- flush |= ntohl(th2->seq) + total != ntohl(th->seq);
+ flush |= (skb_gro_len(skb) > mss) | !skb_gro_len(skb);
+ flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
if (flush || skb_gro_receive(head, skb)) {
mss = 1;
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
2009-01-21 8:29 ` Herbert Xu
@ 2009-01-22 9:42 ` Divy Le Ray
2009-02-16 3:36 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-22 9:42 UTC (permalink / raw)
To: Herbert Xu; +Cc: netdev
Herbert Xu wrote:
> there are a lot
> of micro-optimisations that we can perform to minimise the cost,
> and we can also reduce the cost of some existing checks.
>
> Please try the following patch (compile tested only since I'm
> at LCA) and let me know if it improves things.
>
I'm now getting about 5.7 Gb/s on average, with oprofile typically showing:
37.294500 copy_user_generic_unrolled vmlinux
8.733800 process_responses cxgb3.ko
6.123700 refill_fl cxgb3.ko
5.006900 put_page vmlinux
3.400700 napi_fraginfo_skb vmlinux
2.484600 tcp_gro_receive vmlinux
2.308900 inet_gro_receive vmlinux
2.196000 free_hot_cold_page vmlinux
2.032900 skb_gro_header vmlinux
1.970100 get_page_from_freelist vmlinux
1.869700 skb_copy_datagram_iovec vmlinux
1.380300 dev_gro_receive vmlinux
1.242300 tcp_recvmsg vmlinux
1.079200 irq_entries_start vmlinux
1.003900 get_pageblock_flags_group vmlinux
0.991300 skb_gro_receive vmlinux
0.878400 _raw_spin_lock vmlinux
0.878400 memcpy vmlinux
0.840800 kfree vmlinux
0.828200 list_del vmlinux
Cheers,
Divy
* Re: cxgb3: Replace LRO with GRO
2009-01-22 9:42 ` Divy Le Ray
@ 2009-02-16 3:36 ` Herbert Xu
2009-02-16 3:47 ` Divy Le Ray
2009-03-13 7:28 ` Divy Le Ray
0 siblings, 2 replies; 23+ messages in thread
From: Herbert Xu @ 2009-02-16 3:36 UTC (permalink / raw)
To: Divy Le Ray; +Cc: netdev
On Thu, Jan 22, 2009 at 01:42:22AM -0800, Divy Le Ray wrote:
>
> I'm now getting about 5.7 Gb/s on average, with oprofile typically showing:
>
> 37.294500 copy_user_generic_unrolled vmlinux
> 8.733800 process_responses cxgb3.ko
> 6.123700 refill_fl cxgb3.ko
> 5.006900 put_page vmlinux
> 3.400700 napi_fraginfo_skb vmlinux
> 2.484600 tcp_gro_receive vmlinux
> 2.308900 inet_gro_receive vmlinux
> 2.196000 free_hot_cold_page vmlinux
> 2.032900 skb_gro_header vmlinux
> 1.970100 get_page_from_freelist vmlinux
> 1.869700 skb_copy_datagram_iovec vmlinux
> 1.380300 dev_gro_receive vmlinux
> 1.242300 tcp_recvmsg vmlinux
> 1.079200 irq_entries_start vmlinux
> 1.003900 get_pageblock_flags_group vmlinux
> 0.991300 skb_gro_receive vmlinux
> 0.878400 _raw_spin_lock vmlinux
> 0.878400 memcpy vmlinux
When you get a chance, can you see if this patch makes any
difference at all?
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d7efaf9..6a542fa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2586,8 +2586,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
struct sk_buff *p = *head;
struct sk_buff *nskb;
+ skb_frag_t *frag;
unsigned int headroom;
unsigned int len = skb_gro_len(skb);
+ int i;
if (p->len + len >= 65536)
return -E2BIG;
@@ -2604,9 +2606,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
skb_shinfo(skb)->frags[0].size -=
skb_gro_offset(skb) - skb_headlen(skb);
- memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
- skb_shinfo(skb)->frags,
- skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
+ frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ *frag++ = skb_shinfo(skb)->frags[i];
skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
skb_shinfo(skb)->nr_frags = 0;
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
2009-02-16 3:36 ` Herbert Xu
@ 2009-02-16 3:47 ` Divy Le Ray
2009-03-13 7:28 ` Divy Le Ray
1 sibling, 0 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-02-16 3:47 UTC (permalink / raw)
To: Herbert Xu; +Cc: netdev
Herbert Xu wrote:
> When you get a chance, can you see if this patch makes any
> difference at all?
Hi Herbert,
Thanks for the patch.
I should be able to test within the next few days.
cheers,
Divy
* Re: cxgb3: Replace LRO with GRO
2009-02-16 3:36 ` Herbert Xu
2009-02-16 3:47 ` Divy Le Ray
@ 2009-03-13 7:28 ` Divy Le Ray
2009-04-13 15:24 ` Herbert Xu
1 sibling, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-03-13 7:28 UTC (permalink / raw)
To: Herbert Xu; +Cc: netdev
> When you get a chance, can you see if this patch makes any
> difference at all?
Hi Herbert,
Sorry for the delay.
I had to switch to different development platforms.
I've not seen much performance change with this patch, but it looks good.
Cheers,
Divy
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index d7efaf9..6a542fa 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2586,8 +2586,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
> {
> struct sk_buff *p = *head;
> struct sk_buff *nskb;
> + skb_frag_t *frag;
> unsigned int headroom;
> unsigned int len = skb_gro_len(skb);
> + int i;
>
> if (p->len + len >= 65536)
> return -E2BIG;
> @@ -2604,9 +2606,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
> skb_shinfo(skb)->frags[0].size -=
> skb_gro_offset(skb) - skb_headlen(skb);
>
> - memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
> - skb_shinfo(skb)->frags,
> - skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
> + frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags;
> + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> + *frag++ = skb_shinfo(skb)->frags[i];
>
> skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags;
> skb_shinfo(skb)->nr_frags = 0;
>
> Thanks,
* Re: cxgb3: Replace LRO with GRO
2009-03-13 7:28 ` Divy Le Ray
@ 2009-04-13 15:24 ` Herbert Xu
0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-04-13 15:24 UTC (permalink / raw)
To: Divy Le Ray, David S. Miller; +Cc: netdev
On Fri, Mar 13, 2009 at 12:28:52AM -0700, Divy Le Ray wrote:
>
> I've not seen much performance change with this patch, but it looks good.
Thanks. I decided to get rid of the copy altogether after I got
my hands on a cxgb3 machine (well, remotely anyway). It seems to
do the trick for me, as both LRO and GRO now produce about the
same performance with the CPU 100% maxed out.
Here's the patch. Let me know how it fares on your machine.
gro: New frags interface to avoid copying shinfo
It turns out that copying a 16-byte area ~800k times a second
can be really expensive :) This patch redesigns the frags GRO
interface to avoid copying that area twice.
The two users of the frags interface (cxgb3 and sfc) have been converted.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
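To show the shape of the new interface, here is a rough sketch of the
receive path a driver follows after conversion, distilled from the sfc
hunk below (the function name and surrounding details are illustrative,
not part of the patch):

/* Sketch: borrow the per-napi skb, attach the received page directly to
 * its shared info, fix up the length accounting and hand the skb to GRO,
 * which then reads the headers straight out of the first fragment. */
static void example_rx_page(struct napi_struct *napi, struct page *page,
                            unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb) {
                put_page(page);         /* no skb available: drop this buffer */
                return;
        }

        skb_shinfo(skb)->frags[0].page = page;
        skb_shinfo(skb)->frags[0].page_offset = offset;
        skb_shinfo(skb)->frags[0].size = len;
        skb_shinfo(skb)->nr_frags = 1;

        skb->len = len;
        skb->data_len = len;
        skb->truesize += len;
        skb->ip_summed = CHECKSUM_UNNECESSARY;

        napi_gro_frags(napi);
}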
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 714df2b..322434a 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -195,7 +195,7 @@ struct sge_qset { /* an SGE queue set */
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
struct sge_txq txq[SGE_TXQ_PER_SET];
- struct napi_gro_fraginfo lro_frag_tbl;
+ int nomem;
int lro_enabled;
void *lro_va;
struct net_device *netdev;
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 26d3587..73d569e 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -654,7 +654,8 @@ static void t3_reset_qset(struct sge_qset *q)
q->txq_stopped = 0;
q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
q->rx_reclaim_timer.function = NULL;
- q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0;
+ q->nomem = 0;
+ napi_free_frags(&q->napi);
}
@@ -2074,20 +2075,19 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
struct sge_fl *fl, int len, int complete)
{
struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
+ struct sk_buff *skb = NULL;
struct cpl_rx_pkt *cpl;
- struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags;
- int nr_frags = qs->lro_frag_tbl.nr_frags;
- int frag_len = qs->lro_frag_tbl.len;
+ struct skb_frag_struct *rx_frag;
+ int nr_frags;
int offset = 0;
- if (!nr_frags) {
- offset = 2 + sizeof(struct cpl_rx_pkt);
- qs->lro_va = cpl = sd->pg_chunk.va + 2;
+ if (!qs->nomem) {
+ skb = napi_get_frags(&qs->napi);
+ qs->nomem = !skb;
}
fl->credits--;
- len -= offset;
pci_dma_sync_single_for_cpu(adap->pdev,
pci_unmap_addr(sd, dma_addr),
fl->buf_size - SGE_PG_RSVD,
@@ -2100,21 +2100,38 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
fl->alloc_size,
PCI_DMA_FROMDEVICE);
+ if (!skb) {
+ put_page(sd->pg_chunk.page);
+ if (complete)
+ qs->nomem = 0;
+ return;
+ }
+
+ rx_frag = skb_shinfo(skb)->frags;
+ nr_frags = skb_shinfo(skb)->nr_frags;
+
+ if (!nr_frags) {
+ offset = 2 + sizeof(struct cpl_rx_pkt);
+ qs->lro_va = sd->pg_chunk.va + 2;
+ }
+ len -= offset;
+
prefetch(qs->lro_va);
rx_frag += nr_frags;
rx_frag->page = sd->pg_chunk.page;
rx_frag->page_offset = sd->pg_chunk.offset + offset;
rx_frag->size = len;
- frag_len += len;
- qs->lro_frag_tbl.nr_frags++;
- qs->lro_frag_tbl.len = frag_len;
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+ skb_shinfo(skb)->nr_frags++;
if (!complete)
return;
- qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
cpl = qs->lro_va;
if (unlikely(cpl->vlan_valid)) {
@@ -2123,15 +2140,11 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
struct vlan_group *grp = pi->vlan_grp;
if (likely(grp != NULL)) {
- vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan),
- &qs->lro_frag_tbl);
- goto out;
+ vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan));
+ return;
}
}
- napi_gro_frags(&qs->napi, &qs->lro_frag_tbl);
-
-out:
- qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0;
+ napi_gro_frags(&qs->napi);
}
/**
@@ -2300,8 +2313,6 @@ no_mem:
if (fl->use_pages) {
void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
- prefetch(&qs->lro_frag_tbl);
-
prefetch(addr);
#if L1_CACHE_BYTES < 128
prefetch(addr + L1_CACHE_BYTES);
diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c
index 66d7fe3..01f9432 100644
--- a/drivers/net/sfc/rx.c
+++ b/drivers/net/sfc/rx.c
@@ -450,17 +450,27 @@ static void efx_rx_packet_lro(struct efx_channel *channel,
/* Pass the skb/page into the LRO engine */
if (rx_buf->page) {
- struct napi_gro_fraginfo info;
+ struct sk_buff *skb = napi_get_frags(napi);
- info.frags[0].page = rx_buf->page;
- info.frags[0].page_offset = efx_rx_buf_offset(rx_buf);
- info.frags[0].size = rx_buf->len;
- info.nr_frags = 1;
- info.ip_summed = CHECKSUM_UNNECESSARY;
- info.len = rx_buf->len;
+ if (!skb) {
+ put_page(rx_buf->page);
+ goto out;
+ }
+
+ skb_shinfo(skb)->frags[0].page = rx_buf->page;
+ skb_shinfo(skb)->frags[0].page_offset =
+ efx_rx_buf_offset(rx_buf);
+ skb_shinfo(skb)->frags[0].size = rx_buf->len;
+ skb_shinfo(skb)->nr_frags = 1;
+
+ skb->len = rx_buf->len;
+ skb->data_len = rx_buf->len;
+ skb->truesize += rx_buf->len;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
- napi_gro_frags(napi, &info);
+ napi_gro_frags(napi);
+out:
EFX_BUG_ON_PARANOID(rx_buf->skb);
rx_buf->page = NULL;
} else {
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index e1ff5b1..7ff9af1 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -118,8 +118,7 @@ extern int vlan_hwaccel_do_receive(struct sk_buff *skb);
extern int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct sk_buff *skb);
extern int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
- unsigned int vlan_tci,
- struct napi_gro_fraginfo *info);
+ unsigned int vlan_tci);
#else
static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -154,8 +153,7 @@ static inline int vlan_gro_receive(struct napi_struct *napi,
}
static inline int vlan_gro_frags(struct napi_struct *napi,
- struct vlan_group *grp, unsigned int vlan_tci,
- struct napi_gro_fraginfo *info)
+ struct vlan_group *grp, unsigned int vlan_tci)
{
return NET_RX_DROP;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..54db3eb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1047,14 +1047,6 @@ struct packet_type {
struct list_head list;
};
-struct napi_gro_fraginfo {
- skb_frag_t frags[MAX_SKB_FRAGS];
- unsigned int nr_frags;
- unsigned int ip_summed;
- unsigned int len;
- __wsum csum;
-};
-
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -1442,12 +1434,18 @@ extern int napi_gro_receive(struct napi_struct *napi,
struct sk_buff *skb);
extern void napi_reuse_skb(struct napi_struct *napi,
struct sk_buff *skb);
-extern struct sk_buff * napi_fraginfo_skb(struct napi_struct *napi,
- struct napi_gro_fraginfo *info);
+extern struct sk_buff * napi_get_frags(struct napi_struct *napi);
extern int napi_frags_finish(struct napi_struct *napi,
struct sk_buff *skb, int ret);
-extern int napi_gro_frags(struct napi_struct *napi,
- struct napi_gro_fraginfo *info);
+extern struct sk_buff * napi_frags_skb(struct napi_struct *napi);
+extern int napi_gro_frags(struct napi_struct *napi);
+
+static inline void napi_free_frags(struct napi_struct *napi)
+{
+ kfree_skb(napi->skb);
+ napi->skb = NULL;
+}
+
extern void netif_nit_deliver(struct sk_buff *skb);
extern int dev_valid_name(const char *name);
extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index c67fe6f..7f7de1a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -114,9 +114,9 @@ int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
EXPORT_SYMBOL(vlan_gro_receive);
int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
- unsigned int vlan_tci, struct napi_gro_fraginfo *info)
+ unsigned int vlan_tci)
{
- struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+ struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
return NET_RX_DROP;
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..619fa14 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2519,16 +2519,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
}
EXPORT_SYMBOL(napi_reuse_skb);
-struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
- struct napi_gro_fraginfo *info)
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct net_device *dev = napi->dev;
struct sk_buff *skb = napi->skb;
- struct ethhdr *eth;
- skb_frag_t *frag;
- int i;
-
- napi->skb = NULL;
if (!skb) {
skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
@@ -2536,47 +2530,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
goto out;
skb_reserve(skb, NET_IP_ALIGN);
- }
-
- BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
- frag = &info->frags[info->nr_frags - 1];
- for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
- skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
- frag->size);
- frag++;
+ napi->skb = skb;
}
- skb_shinfo(skb)->nr_frags = info->nr_frags;
-
- skb->data_len = info->len;
- skb->len += info->len;
- skb->truesize += info->len;
-
- skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
-
- eth = skb_gro_header(skb, sizeof(*eth));
- if (!eth) {
- napi_reuse_skb(napi, skb);
- skb = NULL;
- goto out;
- }
-
- skb_gro_pull(skb, sizeof(*eth));
-
- /*
- * This works because the only protocols we care about don't require
- * special handling. We'll fix it up properly at the end.
- */
- skb->protocol = eth->h_proto;
-
- skb->ip_summed = info->ip_summed;
- skb->csum = info->csum;
out:
return skb;
}
-EXPORT_SYMBOL(napi_fraginfo_skb);
+EXPORT_SYMBOL(napi_get_frags);
int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
{
@@ -2606,9 +2567,39 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
}
EXPORT_SYMBOL(napi_frags_finish);
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ struct ethhdr *eth;
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header(skb, sizeof(*eth));
+ if (!eth) {
+ napi_reuse_skb(napi, skb);
+ skb = NULL;
+ goto out;
+ }
+
+ skb_gro_pull(skb, sizeof(*eth));
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling. We'll fix it up properly at the end.
+ */
+ skb->protocol = eth->h_proto;
+
+out:
+ return skb;
+}
+EXPORT_SYMBOL(napi_frags_skb);
+
+int napi_gro_frags(struct napi_struct *napi)
{
- struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+ struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
return NET_RX_DROP;
@@ -2712,7 +2703,7 @@ void netif_napi_del(struct napi_struct *napi)
struct sk_buff *skb, *next;
list_del_init(&napi->dev_list);
- kfree_skb(napi->skb);
+ napi_free_frags(napi);
for (skb = napi->gro_list; skb; skb = next) {
next = skb->next;
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
@ 2009-01-15 21:14 Divy Le Ray
2009-01-15 23:58 ` Herbert Xu
2009-01-20 1:03 ` David Miller
0 siblings, 2 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-01-15 21:14 UTC (permalink / raw)
To: herbert, davem; +Cc: netdev, swise
Hi Herbert,
Your patch looks fine. Early testing shows the same performance as
seen with LRO.
I resubmit the original patch with 2 changes:
- cxgb3_set_flags() removal did not apply.
commit 47fd23fe8efeea3af4593a8424419df48724eb25 had updated the routine.
- replace the NETIF_F_LRO features flag with NETIF_F_GRO
cxgb3: Replace LRO with GRO
This patch makes cxgb3 invoke the GRO hooks instead of LRO. As
GRO has a compatible external interface to LRO this is a very
straightforward replacement.
I've kept the ioctl controls for per-queue LRO switches. However,
we should not encourage anyone to use these.
Because of that, I've also kept the skb construction code in
cxgb3. Hopefully we can phase out those per-queue switches
and then kill this too.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Divy Le Ray <divy@chelsio.com>
---
drivers/net/Kconfig | 1
drivers/net/cxgb3/adapter.h | 13 +---
drivers/net/cxgb3/cxgb3_main.c | 42 +-------------
drivers/net/cxgb3/sge.c | 118 +++++-----------------------------------
4 files changed, 22 insertions(+), 152 deletions(-)
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9fe8cb7..9212289 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2408,7 +2408,6 @@ config CHELSIO_T3
tristate "Chelsio Communications T3 10Gb Ethernet support"
depends on CHELSIO_T3_DEPENDS
select FW_LOADER
- select INET_LRO
help
This driver supports Chelsio T3-based gigabit and 10Gb Ethernet
adapters.
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index a89d8cc..f9d39c6 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -42,7 +42,6 @@
#include <linux/cache.h>
#include <linux/mutex.h>
#include <linux/bitops.h>
-#include <linux/inet_lro.h>
#include "t3cdev.h"
#include <asm/io.h>
@@ -178,15 +177,11 @@ enum { /* per port SGE statistics */
SGE_PSTAT_TX_CSUM, /* # of TX checksum offloads */
SGE_PSTAT_VLANEX, /* # of VLAN tag extractions */
SGE_PSTAT_VLANINS, /* # of VLAN tag insertions */
- SGE_PSTAT_LRO_AGGR, /* # of page chunks added to LRO sessions */
- SGE_PSTAT_LRO_FLUSHED, /* # of flushed LRO sessions */
- SGE_PSTAT_LRO_NO_DESC, /* # of overflown LRO sessions */
SGE_PSTAT_MAX /* must be last */
};
-#define T3_MAX_LRO_SES 8
-#define T3_MAX_LRO_MAX_PKTS 64
+struct napi_gro_fraginfo;
struct sge_qset { /* an SGE queue set */
struct adapter *adap;
@@ -194,12 +189,8 @@ struct sge_qset { /* an SGE queue set */
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
struct sge_txq txq[SGE_TXQ_PER_SET];
- struct net_lro_mgr lro_mgr;
- struct net_lro_desc lro_desc[T3_MAX_LRO_SES];
- struct skb_frag_struct *lro_frag_tbl;
- int lro_nfrags;
+ struct napi_gro_fraginfo lro_frag_tbl;
int lro_enabled;
- int lro_frag_len;
void *lro_va;
struct net_device *netdev;
struct netdev_queue *tx_q; /* associated netdev TX queue */
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 0089746..f59b9e3 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -508,19 +508,9 @@ static void set_qset_lro(struct net_device *dev, int qset_idx, int val)
{
struct port_info *pi = netdev_priv(dev);
struct adapter *adapter = pi->adapter;
- int i, lro_on = 1;
adapter->params.sge.qset[qset_idx].lro = !!val;
adapter->sge.qs[qset_idx].lro_enabled = !!val;
-
- /* let ethtool report LRO on only if all queues are LRO enabled */
- for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; ++i)
- lro_on &= adapter->params.sge.qset[i].lro;
-
- if (lro_on)
- dev->features |= NETIF_F_LRO;
- else
- dev->features &= ~NETIF_F_LRO;
}
/**
@@ -1433,9 +1423,9 @@ static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS);
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM);
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD);
- *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_AGGR);
- *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_FLUSHED);
- *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_NO_DESC);
+ *data++ = 0;
+ *data++ = 0;
+ *data++ = 0;
*data++ = s->rx_cong_drops;
*data++ = s->num_toggled;
@@ -1826,28 +1816,6 @@ static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
memset(&wol->sopass, 0, sizeof(wol->sopass));
}
-static int cxgb3_set_flags(struct net_device *dev, u32 data)
-{
- struct port_info *pi = netdev_priv(dev);
- int i;
-
- if (data & ETH_FLAG_LRO) {
- if (!(pi->rx_offload & T3_RX_CSUM))
- return -EINVAL;
-
- pi->rx_offload |= T3_LRO;
- for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
- set_qset_lro(dev, i, 1);
-
- } else {
- pi->rx_offload &= ~T3_LRO;
- for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
- set_qset_lro(dev, i, 0);
- }
-
- return 0;
-}
-
static const struct ethtool_ops cxgb_ethtool_ops = {
.get_settings = get_settings,
.set_settings = set_settings,
@@ -1877,8 +1845,6 @@ static const struct ethtool_ops cxgb_ethtool_ops = {
.get_regs = get_regs,
.get_wol = get_wol,
.set_tso = ethtool_op_set_tso,
- .get_flags = ethtool_op_get_flags,
- .set_flags = cxgb3_set_flags,
};
static int in_range(int val, int lo, int hi)
@@ -2960,7 +2926,7 @@ static int __devinit init_one(struct pci_dev *pdev,
netdev->mem_end = mmio_start + mmio_len - 1;
netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
netdev->features |= NETIF_F_LLTX;
- netdev->features |= NETIF_F_LRO;
+ netdev->features |= NETIF_F_GRO;
if (pci_using_dac)
netdev->features |= NETIF_F_HIGHDMA;
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 14f9fb3..8299fb5 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -585,8 +585,7 @@ static void t3_reset_qset(struct sge_qset *q)
memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
q->txq_stopped = 0;
q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
- kfree(q->lro_frag_tbl);
- q->lro_nfrags = q->lro_frag_len = 0;
+ q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0;
}
@@ -1945,10 +1944,8 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
qs->port_stats[SGE_PSTAT_VLANEX]++;
if (likely(grp))
if (lro)
- lro_vlan_hwaccel_receive_skb(&qs->lro_mgr, skb,
- grp,
- ntohs(p->vlan),
- p);
+ vlan_gro_receive(&qs->napi, grp,
+ ntohs(p->vlan), skb);
else {
if (unlikely(pi->iscsi_ipv4addr &&
is_arp(skb))) {
@@ -1965,7 +1962,7 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
dev_kfree_skb_any(skb);
} else if (rq->polling) {
if (lro)
- lro_receive_skb(&qs->lro_mgr, skb, p);
+ napi_gro_receive(&qs->napi, skb);
else {
if (unlikely(pi->iscsi_ipv4addr && is_arp(skb)))
cxgb3_arp_process(adap, skb);
@@ -1981,59 +1978,6 @@ static inline int is_eth_tcp(u32 rss)
}
/**
- * lro_frame_ok - check if an ingress packet is eligible for LRO
- * @p: the CPL header of the packet
- *
- * Returns true if a received packet is eligible for LRO.
- * The following conditions must be true:
- * - packet is TCP/IP Ethernet II (checked elsewhere)
- * - not an IP fragment
- * - no IP options
- * - TCP/IP checksums are correct
- * - the packet is for this host
- */
-static inline int lro_frame_ok(const struct cpl_rx_pkt *p)
-{
- const struct ethhdr *eh = (struct ethhdr *)(p + 1);
- const struct iphdr *ih = (struct iphdr *)(eh + 1);
-
- return (*((u8 *)p + 1) & 0x90) == 0x10 && p->csum == htons(0xffff) &&
- eh->h_proto == htons(ETH_P_IP) && ih->ihl == (sizeof(*ih) >> 2);
-}
-
-static int t3_get_lro_header(void **eh, void **iph, void **tcph,
- u64 *hdr_flags, void *priv)
-{
- const struct cpl_rx_pkt *cpl = priv;
-
- if (!lro_frame_ok(cpl))
- return -1;
-
- *eh = (struct ethhdr *)(cpl + 1);
- *iph = (struct iphdr *)((struct ethhdr *)*eh + 1);
- *tcph = (struct tcphdr *)((struct iphdr *)*iph + 1);
-
- *hdr_flags = LRO_IPV4 | LRO_TCP;
- return 0;
-}
-
-static int t3_get_skb_header(struct sk_buff *skb,
- void **iph, void **tcph, u64 *hdr_flags,
- void *priv)
-{
- void *eh;
-
- return t3_get_lro_header(&eh, iph, tcph, hdr_flags, priv);
-}
-
-static int t3_get_frag_header(struct skb_frag_struct *frag, void **eh,
- void **iph, void **tcph, u64 *hdr_flags,
- void *priv)
-{
- return t3_get_lro_header(eh, iph, tcph, hdr_flags, priv);
-}
-
-/**
* lro_add_page - add a page chunk to an LRO session
* @adap: the adapter
* @qs: the associated queue set
@@ -2049,8 +1993,9 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
{
struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
struct cpl_rx_pkt *cpl;
- struct skb_frag_struct *rx_frag = qs->lro_frag_tbl;
- int nr_frags = qs->lro_nfrags, frag_len = qs->lro_frag_len;
+ struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags;
+ int nr_frags = qs->lro_frag_tbl.nr_frags;
+ int frag_len = qs->lro_frag_tbl.len;
int offset = 0;
if (!nr_frags) {
@@ -2069,13 +2014,13 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
rx_frag->page_offset = sd->pg_chunk.offset + offset;
rx_frag->size = len;
frag_len += len;
- qs->lro_nfrags++;
- qs->lro_frag_len = frag_len;
+ qs->lro_frag_tbl.nr_frags++;
+ qs->lro_frag_tbl.len = frag_len;
if (!complete)
return;
- qs->lro_nfrags = qs->lro_frag_len = 0;
+ qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY;
cpl = qs->lro_va;
if (unlikely(cpl->vlan_valid)) {
@@ -2084,35 +2029,15 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
struct vlan_group *grp = pi->vlan_grp;
if (likely(grp != NULL)) {
- lro_vlan_hwaccel_receive_frags(&qs->lro_mgr,
- qs->lro_frag_tbl,
- frag_len, frag_len,
- grp, ntohs(cpl->vlan),
- cpl, 0);
- return;
+ vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan),
+ &qs->lro_frag_tbl);
+ goto out;
}
}
- lro_receive_frags(&qs->lro_mgr, qs->lro_frag_tbl,
- frag_len, frag_len, cpl, 0);
-}
+ napi_gro_frags(&qs->napi, &qs->lro_frag_tbl);
-/**
- * init_lro_mgr - initialize a LRO manager object
- * @lro_mgr: the LRO manager object
- */
-static void init_lro_mgr(struct sge_qset *qs, struct net_lro_mgr *lro_mgr)
-{
- lro_mgr->dev = qs->netdev;
- lro_mgr->features = LRO_F_NAPI;
- lro_mgr->ip_summed = CHECKSUM_UNNECESSARY;
- lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
- lro_mgr->max_desc = T3_MAX_LRO_SES;
- lro_mgr->lro_arr = qs->lro_desc;
- lro_mgr->get_frag_header = t3_get_frag_header;
- lro_mgr->get_skb_header = t3_get_skb_header;
- lro_mgr->max_aggr = T3_MAX_LRO_MAX_PKTS;
- if (lro_mgr->max_aggr > MAX_SKB_FRAGS)
- lro_mgr->max_aggr = MAX_SKB_FRAGS;
+out:
+ qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0;
}
/**
@@ -2356,10 +2281,6 @@ next_fl:
}
deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
- lro_flush_all(&qs->lro_mgr);
- qs->port_stats[SGE_PSTAT_LRO_AGGR] = qs->lro_mgr.stats.aggregated;
- qs->port_stats[SGE_PSTAT_LRO_FLUSHED] = qs->lro_mgr.stats.flushed;
- qs->port_stats[SGE_PSTAT_LRO_NO_DESC] = qs->lro_mgr.stats.no_desc;
if (sleeping)
check_ring_db(adap, qs, sleeping);
@@ -2906,7 +2827,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
{
int i, avail, ret = -ENOMEM;
struct sge_qset *q = &adapter->sge.qs[id];
- struct net_lro_mgr *lro_mgr = &q->lro_mgr;
init_qset_cntxt(q, id);
setup_timer(&q->tx_reclaim_timer, sge_timer_cb, (unsigned long)q);
@@ -2986,10 +2906,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
q->fl[0].order = FL0_PG_ORDER;
q->fl[1].order = FL1_PG_ORDER;
- q->lro_frag_tbl = kcalloc(MAX_FRAME_SIZE / FL1_PG_CHUNK_SIZE + 1,
- sizeof(struct skb_frag_struct),
- GFP_KERNEL);
- q->lro_nfrags = q->lro_frag_len = 0;
spin_lock_irq(&adapter->sge.reg_lock);
/* FL threshold comparison uses < */
@@ -3041,8 +2957,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
q->tx_q = netdevq;
t3_update_qset_coalesce(q, p);
- init_lro_mgr(q, lro_mgr);
-
avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
GFP_KERNEL | __GFP_COMP);
if (!avail) {
* Re: cxgb3: Replace LRO with GRO
2009-01-15 21:14 Divy Le Ray
@ 2009-01-15 23:58 ` Herbert Xu
2009-01-16 8:06 ` Divy Le Ray
2009-01-20 1:03 ` David Miller
1 sibling, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-15 23:58 UTC (permalink / raw)
To: Divy Le Ray; +Cc: davem, netdev, swise
Divy Le Ray <divy@chelsio.com> wrote:
>
> Your patch looks fine. Early testing shows the same performance as
> seen with LRO.
>
> I resubmit the original patch with 2 changes:
> - cxgb3_set_flags() removal did not apply.
> commit 47fd23fe8efeea3af4593a8424419df48724eb25 had updated the routine.
> - replace the NETIF_F_LRO features flag with NETIF_F_GRO
Thanks for the update Divy! It looks good to me.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
2009-01-15 23:58 ` Herbert Xu
@ 2009-01-16 8:06 ` Divy Le Ray
2009-01-16 8:56 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-16 8:06 UTC (permalink / raw)
To: Herbert Xu; +Cc: davem, netdev, Steve Wise
Herbert Xu wrote:
> Divy Le Ray <divy@chelsio.com> wrote:
>> Your patch looks fine. Early testing shows the same performance as
>> seen with LRO.
>>
>> I resubmit the original patch with 2 changes:
>> - cxgb3_set_flags() removal did not apply.
>> commit 47fd23fe8efeea3af4593a8424419df48724eb25 had updated the routine.
>> - replace the NETIF_F_LRO features flag with NETIF_F_GRO
>
> Thanks for the update Divy! It looks good to me.
Hi Herbert,
The performance is not actually as good as with LRO.
With a slow server, 1 netperf stream gets on average:
without the patch, lro off: 3.8 Gb/s
without the patch, lro on: 6.1 Gb/s
with the patch, GRO on: 4.8 Gb/s.
I'll look into it.
Cheers,
Divy
* Re: cxgb3: Replace LRO with GRO
2009-01-16 8:06 ` Divy Le Ray
@ 2009-01-16 8:56 ` Herbert Xu
2009-01-16 11:12 ` Divy Le Ray
0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-16 8:56 UTC (permalink / raw)
To: Divy Le Ray; +Cc: davem, netdev, Steve Wise
On Fri, Jan 16, 2009 at 12:06:45AM -0800, Divy Le Ray wrote:
>
> The performance is not actually as good as with LRO.
Thanks for testing Divy!
> With a slow server, 1 netperf stream gets on average:
I presume the server is receiving?
> without the patch, lro off: 3.8 Gb/s
> without the patch, lro on: 6.1 Gb/s
> with the patch, GRO on: 4.8 Gb/s.
What about the case of GRO off with the patch? Just checking to
make sure that nothing else has changed.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
2009-01-16 8:56 ` Herbert Xu
@ 2009-01-16 11:12 ` Divy Le Ray
2009-01-16 23:58 ` Herbert Xu
2009-01-17 5:08 ` Herbert Xu
0 siblings, 2 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-01-16 11:12 UTC (permalink / raw)
To: Herbert Xu; +Cc: davem, netdev, Steve Wise
> I presume the server is receiving?
Yes, it is receiving. The adapter is set up with one queue, its irq
pinned to cpu1, irq balance disabled. CPU1 is pegged in all tests.
>
>> without the patch, lro off: 3.8 Gb/s
>> without the patch, lro on: 6.1 Gb/s
>> with the patch, GRO on: 4.8 Gb/s.
>
> What about the case of GRO off with the patch? Just checking to
> make sure that nothing else has changed.
I'm getting about 3.4 Gb/s with the patch and GRO off.
I did some profiling with GRO on and LRO on. Here are typical outputs:
GRO on:
CPU 1 :
26.445100 copy_user_generic_unrolled vmlinux
10.050300 skb_copy_datagram_iovec vmlinux
5.076900 memcpy vmlinux
4.636400 process_responses cxgb3.ko
4.625100 __pskb_pull_tail vmlinux
4.463300 eth_type_trans vmlinux
4.420600 inet_gro_receive vmlinux
4.310500 refill_fl cxgb3.ko
3.312700 skb_copy_bits vmlinux
3.155300 put_page vmlinux
2.818200 kfree vmlinux
2.537300 tcp_gro_receive vmlinux
2.422700 dev_gro_receive vmlinux
2.229400 kmem_cache_alloc_node vmlinux
1.526000 skb_gro_receive vmlinux
1.319200 free_hot_cold_page vmlinux
1.224800 get_page_from_freelist vmlinux
1.112500 __alloc_skb vmlinux
0.910200 kmem_cache_free vmlinux
0.872000 napi_fraginfo_skb vmlinux
LRO on:
CPU 1 :
48.511600 copy_user_generic_unrolled vmlinux
5.859600 put_page vmlinux
4.405500 process_responses cxgb3.ko
4.006400 refill_fl cxgb3.ko
2.547200 irq_entries_start vmlinux
2.315900 free_hot_cold_page vmlinux
2.224400 skb_copy_datagram_iovec vmlinux
1.985400 tcp_recvmsg vmlinux
1.311700 csum_partial vmlinux
1.276200 _raw_spin_lock vmlinux
1.088000 get_page_from_freelist vmlinux
1.016900 get_pageblock_flags_group vmlinux
0.920300 memcpy_toiovec vmlinux
0.859200 kfree vmlinux
0.688900 dst_release vmlinux
0.630400 t3_sge_intr_msix_napi cxgb3.ko
0.559300 tcp_rcv_established vmlinux
0.472800 kmem_cache_alloc_node vmlinux
0.404200 __inet_lookup_established vmlinux
0.399100 memset vmlinux
I have not looked at the code yet. It's getting late :)
Cheers,
Divy
* Re: cxgb3: Replace LRO with GRO
2009-01-16 11:12 ` Divy Le Ray
@ 2009-01-16 23:58 ` Herbert Xu
2009-01-17 5:08 ` Herbert Xu
1 sibling, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-16 23:58 UTC (permalink / raw)
To: Divy Le Ray; +Cc: davem, netdev, Steve Wise
On Fri, Jan 16, 2009 at 03:12:07AM -0800, Divy Le Ray wrote:
>
> >> without the patch, lro off: 3.8 Gb/s
> >> without the patch, lro on: 6.1 Gb/s
> >> with the patch, GRO on: 4.8 Gb/s.
> >
> > What about the case of GRO off with the patch? Just checking to
> > make sure that nothing else has changed.
>
> I'm getting about 3.4 Gb/s with the patch and GRO off.
I wonder where the 3.8 => 3.4 slow-down is coming from.
> I did some profiling with GRO on and LRO on. Here are typical outputs:
> GRO on:
> CPU 1 :
> 26.445100 copy_user_generic_unrolled vmlinux
> 10.050300 skb_copy_datagram_iovec vmlinux
> 5.076900 memcpy vmlinux
> 4.636400 process_responses cxgb3.ko
> 4.625100 __pskb_pull_tail vmlinux
> 4.463300 eth_type_trans vmlinux
> 4.420600 inet_gro_receive vmlinux
> 4.310500 refill_fl cxgb3.ko
> 3.312700 skb_copy_bits vmlinux
Thanks, this is very helpful. The first obvious deficiency
is that napi_fraginfo_skb is causing multiple pskb_may_pull
calls on every fragment, which is bad.
I'll get a patch out for you to test.
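Roughly, the idea is to let each layer read the headers it needs at the
current GRO offset instead of having them pulled into skb->head first; a
sketch of the accessor pattern I have in mind (illustrative only, not the
actual patch):

/* Sketch: look at the Ethernet header in place via the GRO offset helpers
 * and just advance the offset, so nothing is copied or pulled into
 * skb->head for packets that end up being merged anyway. */
static struct sk_buff *example_frags_header(struct napi_struct *napi,
                                            struct sk_buff *skb)
{
        struct ethhdr *eth;

        eth = skb_gro_header(skb, sizeof(*eth));
        if (!eth) {
                napi_reuse_skb(napi, skb);      /* too short: recycle the skb */
                return NULL;
        }

        skb_gro_pull(skb, sizeof(*eth));        /* consume the Ethernet header */
        skb->protocol = eth->h_proto;
        return skb;
}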
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
2009-01-16 11:12 ` Divy Le Ray
2009-01-16 23:58 ` Herbert Xu
@ 2009-01-17 5:08 ` Herbert Xu
2009-01-17 11:11 ` Divy Le Ray
1 sibling, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-17 5:08 UTC (permalink / raw)
To: Divy Le Ray; +Cc: davem, netdev, Steve Wise
Hi Divy:
On Fri, Jan 16, 2009 at 03:12:07AM -0800, Divy Le Ray wrote:
>
> 1.112500 __alloc_skb vmlinux
Can you confirm that you were testing 1500-byte packets? If so,
that would tell us why this function got up there. As it stands,
GRO will not stop aggregating even when the frags array is full;
it keeps going until the skb size hits 64K.
That means half the packet will end up in frag_list, which is very
inefficient.
If this is indeed the problem, then for now I'll simply make
it stop aggregating once the array is full, just like LRO.
In future we can extend it such that even when it starts
using frag_list, it'll still continue to fill in the frags array.
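In concrete terms the stop condition amounts to a guard along these lines
in skb_gro_receive (a sketch only; the actual patch may structure it
differently):

        /* Sketch: refuse to merge once the head skb's page array cannot
         * absorb the new skb's frags, so GRO flushes this entry and starts
         * a fresh one instead of spilling the remainder into frag_list. */
        if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS)
                return -E2BIG;  /* treated like the existing 64K limit check */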
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: cxgb3: Replace LRO with GRO
2009-01-17 5:08 ` Herbert Xu
@ 2009-01-17 11:11 ` Divy Le Ray
2009-01-17 13:08 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-17 11:11 UTC (permalink / raw)
To: Herbert Xu; +Cc: davem, netdev, Steve Wise
Herbert Xu wrote:
> Hi Divy:
>
> On Fri, Jan 16, 2009 at 03:12:07AM -0800, Divy Le Ray wrote:
>> 1.112500 __alloc_skb vmlinux
>
> Can you confirm that you were testing 1500 byte packets?
Yes.
Cheers,
Divy
* Re: cxgb3: Replace LRO with GRO
2009-01-17 11:11 ` Divy Le Ray
@ 2009-01-17 13:08 ` Herbert Xu
2009-01-18 20:33 ` Divy Le Ray
0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-17 13:08 UTC (permalink / raw)
To: Divy Le Ray; +Cc: davem, netdev, Steve Wise
On Sat, Jan 17, 2009 at 03:11:26AM -0800, Divy Le Ray wrote:
>
> Yes.
Great. Please give this patch a whirl.
commit a9156545da4f6ed729b1871a92e3ead04f37a034
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat Jan 17 21:49:42 2009 +1100
gro: Do not merge paged packets into frag_list
Bigger is not always better :)
It was easy to continue merging packets into frag_list after the
page array is full. However, this turns out to be worse than LRO
because frag_list is a much less efficient form of storage than the
page array. So we're better off stopping the merge and starting
a new entry with an empty page array.
In future we can optimise this further by doing frag_list merging
but making sure that we continue to fill in the page array.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 2e54c6210c41f7fb2c503a4b51a8288e05a1c9d5
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat Jan 17 21:45:11 2009 +1100
gro: Avoid copying headers of unmerged packets
Unfortunately, simplicity isn't always best. The fraginfo
interface turned out to be suboptimal. The problem is quite
obvious: for every packet, we have to copy the headers from
the frags structure into skb->head, even though for 99% of the
packets this part is immediately thrown away after the merge.
LRO didn't have this problem because it directly read the headers
from the frags structure.
This patch attempts to address this by creating an interface
that allows GRO to access the headers in the first frag without
having to copy them. Because all drivers that use frags place the
headers in the first frag, this optimisation should be enough.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
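To make the driver-side contract explicit (a sketch only, not real
driver code; example_rx is a made-up name, but the napi_gro_fraginfo
fields are the real ones):

#include <linux/netdevice.h>

/* Sketch: a driver using napi_gro_frags() places the packet headers
 * at the start of frag 0, so GRO can read them in place instead of
 * copying them into skb->head first.
 */
static void example_rx(struct napi_struct *napi, struct page *page,
                       unsigned int offset, unsigned int len)
{
        struct napi_gro_fraginfo info = { };

        info.frags[0].page        = page;
        info.frags[0].page_offset = offset;     /* headers start here */
        info.frags[0].size        = len;
        info.nr_frags             = 1;
        info.len                  = len;
        info.ip_summed            = CHECKSUM_UNNECESSARY;
        napi_gro_frags(napi, &info);
}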
commit db14bd4bff3d85d779a83fbdc554f4147ee9b841
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun Jan 18 00:00:13 2009 +1100
gro: Fix merging of paged packets
The previous fix for paged packets broke the merging because it
reset skb->len before we added it to the merged packet. This
wasn't detected because it simply resulted in truncation of the
packet, with the missing bit subsequently retransmitted.
The fix is to store skb->len before we clobber it.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 6ba4147cb5854d334965d89acb821e6f624534eb
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat Jan 17 21:05:58 2009 +1100
gro: Move common completion code into helpers
Currently VLAN still has a bit of common code handling the aftermath
of GRO that's shared with the common path. This patch moves it
into shared helpers to reduce code duplication.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 86c3c2fc1f99394990b3ef1c2881b6036e4f5141
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat Jan 17 15:06:40 2009 +1100
gro: Fix error handling on extremely short frags
When a frag is shorter than an Ethernet header, we'd return a
zeroed packet instead of aborting. This patch fixes that.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 099f08d2279eff7f74226bc8f691b358b1eac7b7
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat Jan 17 20:12:07 2009 +1100
gro: Fix handling of complete checksums in IPv6
We need to perform skb_postpull_rcsum after pulling the IPv6
header in order to maintain the correctness of the complete
checksum.
This patch also adds a missing iph reload after pulling.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f245568..5472328 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -983,6 +983,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
void netif_napi_del(struct napi_struct *napi);
struct napi_gro_cb {
+ /* This indicates where we are processing relative to skb->data. */
+ int data_offset;
+
/* This is non-zero if the packet may be of the same flow. */
int same_flow;
@@ -1084,6 +1087,29 @@ extern int dev_restart(struct net_device *dev);
#ifdef CONFIG_NETPOLL_TRAP
extern int netpoll_trap(void);
#endif
+extern void *skb_gro_header(struct sk_buff *skb, unsigned int hlen);
+extern int skb_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb);
+
+static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
+{
+ return NAPI_GRO_CB(skb)->data_offset;
+}
+
+static inline unsigned int skb_gro_len(const struct sk_buff *skb)
+{
+ return skb->len - NAPI_GRO_CB(skb)->data_offset;
+}
+
+static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
+{
+ NAPI_GRO_CB(skb)->data_offset += len;
+}
+
+static inline void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ NAPI_GRO_CB(skb)->data_offset = 0;
+}
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
@@ -1372,12 +1398,15 @@ extern int netif_receive_skb(struct sk_buff *skb);
extern void napi_gro_flush(struct napi_struct *napi);
extern int dev_gro_receive(struct napi_struct *napi,
struct sk_buff *skb);
+extern int napi_skb_finish(int ret, struct sk_buff *skb);
extern int napi_gro_receive(struct napi_struct *napi,
struct sk_buff *skb);
extern void napi_reuse_skb(struct napi_struct *napi,
struct sk_buff *skb);
extern struct sk_buff * napi_fraginfo_skb(struct napi_struct *napi,
struct napi_gro_fraginfo *info);
+extern int napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb, int ret);
extern int napi_gro_frags(struct napi_struct *napi,
struct napi_gro_fraginfo *info);
extern void netif_nit_deliver(struct sk_buff *skb);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf2cb50..acf17af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1687,8 +1687,6 @@ extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
int shiftlen);
extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
-extern int skb_gro_receive(struct sk_buff **head,
- struct sk_buff *skb);
static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
int len, void *buffer)
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 6c13239..fb0d16a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -98,22 +98,9 @@ drop:
int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct sk_buff *skb)
{
- int err = NET_RX_SUCCESS;
+ skb_gro_reset_offset(skb);
- switch (vlan_gro_common(napi, grp, vlan_tci, skb)) {
- case -1:
- return netif_receive_skb(skb);
-
- case 2:
- err = NET_RX_DROP;
- /* fall through */
-
- case 1:
- kfree_skb(skb);
- break;
- }
-
- return err;
+ return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
}
EXPORT_SYMBOL(vlan_gro_receive);
@@ -121,27 +108,11 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct napi_gro_fraginfo *info)
{
struct sk_buff *skb = napi_fraginfo_skb(napi, info);
- int err = NET_RX_DROP;
if (!skb)
- goto out;
-
- err = NET_RX_SUCCESS;
-
- switch (vlan_gro_common(napi, grp, vlan_tci, skb)) {
- case -1:
- return netif_receive_skb(skb);
-
- case 2:
- err = NET_RX_DROP;
- /* fall through */
-
- case 1:
- napi_reuse_skb(napi, skb);
- break;
- }
+ return NET_RX_DROP;
-out:
- return err;
+ return napi_frags_finish(napi, skb,
+ vlan_gro_common(napi, grp, vlan_tci, skb));
}
EXPORT_SYMBOL(vlan_gro_frags);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4f69a2d..3742397 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,14 @@
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
+enum {
+ GRO_MERGED,
+ GRO_MERGED_FREE,
+ GRO_HELD,
+ GRO_NORMAL,
+ GRO_DROP,
+};
+
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -207,6 +215,13 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
}
+static inline void *skb_gro_mac_header(struct sk_buff *skb)
+{
+ return skb_headlen(skb) ? skb_mac_header(skb) :
+ page_address(skb_shinfo(skb)->frags[0].page) +
+ skb_shinfo(skb)->frags[0].page_offset;
+}
+
/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
@@ -2350,7 +2365,6 @@ static int napi_gro_complete(struct sk_buff *skb)
out:
skb_shinfo(skb)->gso_size = 0;
- __skb_push(skb, -skb_network_offset(skb));
return netif_receive_skb(skb);
}
@@ -2368,6 +2382,25 @@ void napi_gro_flush(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_gro_flush);
+void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
+{
+ unsigned int offset = skb_gro_offset(skb);
+
+ hlen += offset;
+ if (hlen <= skb_headlen(skb))
+ return skb->data + offset;
+
+ if (unlikely(!skb_shinfo(skb)->nr_frags ||
+ skb_shinfo(skb)->frags[0].size <=
+ hlen - skb_headlen(skb) ||
+ PageHighMem(skb_shinfo(skb)->frags[0].page)))
+ return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
+
+ return page_address(skb_shinfo(skb)->frags[0].page) +
+ skb_shinfo(skb)->frags[0].page_offset + offset;
+}
+EXPORT_SYMBOL(skb_gro_header);
+
int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
@@ -2377,7 +2410,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
int count = 0;
int same_flow;
int mac_len;
- int free;
+ int ret;
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
@@ -2388,11 +2421,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
struct sk_buff *p;
+ void *mac;
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
continue;
- skb_reset_network_header(skb);
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ mac = skb_gro_mac_header(skb);
mac_len = skb->network_header - skb->mac_header;
skb->mac_len = mac_len;
NAPI_GRO_CB(skb)->same_flow = 0;
@@ -2406,8 +2441,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
continue;
if (p->mac_len != mac_len ||
- memcmp(skb_mac_header(p), skb_mac_header(skb),
- mac_len))
+ memcmp(skb_mac_header(p), mac, mac_len))
NAPI_GRO_CB(p)->same_flow = 0;
}
@@ -2420,7 +2454,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
goto normal;
same_flow = NAPI_GRO_CB(skb)->same_flow;
- free = NAPI_GRO_CB(skb)->free;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
struct sk_buff *nskb = *pp;
@@ -2434,21 +2468,20 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
if (same_flow)
goto ok;
- if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
- __skb_push(skb, -skb_network_offset(skb));
+ if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS)
goto normal;
- }
NAPI_GRO_CB(skb)->count = 1;
- skb_shinfo(skb)->gso_size = skb->len;
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
+ ret = GRO_HELD;
ok:
- return free;
+ return ret;
normal:
- return -1;
+ return GRO_NORMAL;
}
EXPORT_SYMBOL(dev_gro_receive);
@@ -2464,18 +2497,32 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
return dev_gro_receive(napi, skb);
}
-int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+int napi_skb_finish(int ret, struct sk_buff *skb)
{
- switch (__napi_gro_receive(napi, skb)) {
- case -1:
+ int err = NET_RX_SUCCESS;
+
+ switch (ret) {
+ case GRO_NORMAL:
return netif_receive_skb(skb);
- case 1:
+ case GRO_DROP:
+ err = NET_RX_DROP;
+ /* fall through */
+
+ case GRO_MERGED_FREE:
kfree_skb(skb);
break;
}
- return NET_RX_SUCCESS;
+ return err;
+}
+EXPORT_SYMBOL(napi_skb_finish);
+
+int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ skb_gro_reset_offset(skb);
+
+ return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
@@ -2493,6 +2540,7 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
{
struct net_device *dev = napi->dev;
struct sk_buff *skb = napi->skb;
+ struct ethhdr *eth;
napi->skb = NULL;
@@ -2512,12 +2560,23 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
skb->len += info->len;
skb->truesize += info->len;
- if (!pskb_may_pull(skb, ETH_HLEN)) {
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header(skb, sizeof(*eth));
+ if (!eth) {
napi_reuse_skb(napi, skb);
+ skb = NULL;
goto out;
}
- skb->protocol = eth_type_trans(skb, dev);
+ skb_gro_pull(skb, sizeof(*eth));
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling. We'll fix it up properly at the end.
+ */
+ skb->protocol = eth->h_proto;
skb->ip_summed = info->ip_summed;
skb->csum = info->csum;
@@ -2527,29 +2586,47 @@ out:
}
EXPORT_SYMBOL(napi_fraginfo_skb);
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
{
- struct sk_buff *skb = napi_fraginfo_skb(napi, info);
- int err = NET_RX_DROP;
+ int err = NET_RX_SUCCESS;
+ int may;
- if (!skb)
- goto out;
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ may = pskb_may_pull(skb, skb_gro_offset(skb));
+ BUG_ON(!may);
- err = NET_RX_SUCCESS;
+ skb->protocol = eth_type_trans(skb, napi->dev);
- switch (__napi_gro_receive(napi, skb)) {
- case -1:
- return netif_receive_skb(skb);
+ if (ret == GRO_NORMAL)
+ return netif_receive_skb(skb);
- case 0:
- goto out;
- }
+ skb_gro_pull(skb, -ETH_HLEN);
+ break;
- napi_reuse_skb(napi, skb);
+ case GRO_DROP:
+ err = NET_RX_DROP;
+ /* fall through */
+
+ case GRO_MERGED_FREE:
+ napi_reuse_skb(napi, skb);
+ break;
+ }
-out:
return err;
}
+EXPORT_SYMBOL(napi_frags_finish);
+
+int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+{
+ struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+
+ if (!skb)
+ return NET_RX_DROP;
+
+ return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
+}
EXPORT_SYMBOL(napi_gro_frags);
static int process_backlog(struct napi_struct *napi, int quota)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 65eac77..a11d9a4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2587,16 +2587,23 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
struct sk_buff *p = *head;
struct sk_buff *nskb;
unsigned int headroom;
- unsigned int hlen = p->data - skb_mac_header(p);
+ unsigned int len = skb_gro_len(skb);
- if (hlen + p->len + skb->len >= 65536)
+ if (p->len + len >= 65536)
return -E2BIG;
if (skb_shinfo(p)->frag_list)
goto merge;
- else if (!skb_headlen(p) && !skb_headlen(skb) &&
- skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <
- MAX_SKB_FRAGS) {
+ else if (skb_headlen(skb) <= skb_gro_offset(skb)) {
+ if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags >
+ MAX_SKB_FRAGS)
+ return -E2BIG;
+
+ skb_shinfo(skb)->frags[0].page_offset +=
+ skb_gro_offset(skb) - skb_headlen(skb);
+ skb_shinfo(skb)->frags[0].size -=
+ skb_gro_offset(skb) - skb_headlen(skb);
+
memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags,
skb_shinfo(skb)->frags,
skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
@@ -2621,12 +2628,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
nskb->mac_len = p->mac_len;
skb_reserve(nskb, headroom);
+ __skb_put(nskb, skb_headlen(p));
- skb_set_mac_header(nskb, -hlen);
+ skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
skb_set_network_header(nskb, skb_network_offset(p));
skb_set_transport_header(nskb, skb_transport_offset(p));
- memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen);
+ __skb_pull(p, skb_gro_offset(p));
+ memcpy(skb_mac_header(nskb), skb_mac_header(p),
+ p->data - skb_mac_header(p));
*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
skb_shinfo(nskb)->frag_list = p;
@@ -2651,9 +2661,9 @@ merge:
done:
NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
- p->truesize += skb->len;
- p->len += skb->len;
+ p->data_len += len;
+ p->truesize += len;
+ p->len += len;
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f554..d6770f2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1253,10 +1253,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
int proto;
int id;
- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ iph = skb_gro_header(skb, sizeof(*iph));
+ if (unlikely(!iph))
goto out;
- iph = ip_hdr(skb);
proto = iph->protocol & (MAX_INET_PROTOS - 1);
rcu_read_lock();
@@ -1270,7 +1270,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto out_unlock;
- flush = ntohs(iph->tot_len) != skb->len ||
+ flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
iph->frag_off != htons(IP_DF);
id = ntohs(iph->id);
@@ -1298,8 +1298,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
}
NAPI_GRO_CB(skb)->flush |= flush;
- __skb_pull(skb, sizeof(*iph));
- skb_reset_transport_header(skb);
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
pp = ops->gro_receive(head, skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ce572f9..0191d39 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2473,19 +2473,19 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
unsigned int mss = 1;
int flush = 1;
- if (!pskb_may_pull(skb, sizeof(*th)))
+ th = skb_gro_header(skb, sizeof(*th));
+ if (unlikely(!th))
goto out;
- th = tcp_hdr(skb);
thlen = th->doff * 4;
if (thlen < sizeof(*th))
goto out;
- if (!pskb_may_pull(skb, thlen))
+ th = skb_gro_header(skb, thlen);
+ if (unlikely(!th))
goto out;
- th = tcp_hdr(skb);
- __skb_pull(skb, thlen);
+ skb_gro_pull(skb, thlen);
flags = tcp_flag_word(th);
@@ -2513,10 +2513,10 @@ found:
flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
- total = p->len;
+ total = skb_gro_len(p);
mss = skb_shinfo(p)->gso_size;
- flush |= skb->len > mss || skb->len <= 0;
+ flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb);
flush |= ntohl(th2->seq) + total != ntohl(th->seq);
if (flush || skb_gro_receive(head, skb)) {
@@ -2529,7 +2529,7 @@ found:
tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
out_check_final:
- flush = skb->len < mss;
+ flush = skb_gro_len(skb) < mss;
flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
TCP_FLAG_SYN | TCP_FLAG_FIN);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 19d7b42..f6b962f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2355,7 +2355,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
- if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr,
+ if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
skb->csum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 94f74f5..bd91ead 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -797,24 +797,36 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
unsigned int nlen;
int flush = 1;
int proto;
+ __wsum csum;
- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ iph = skb_gro_header(skb, sizeof(*iph));
+ if (unlikely(!iph))
goto out;
- iph = ipv6_hdr(skb);
- __skb_pull(skb, sizeof(*iph));
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
- flush += ntohs(iph->payload_len) != skb->len;
+ flush += ntohs(iph->payload_len) != skb_gro_len(skb);
rcu_read_lock();
- proto = ipv6_gso_pull_exthdrs(skb, iph->nexthdr);
- IPV6_GRO_CB(skb)->proto = proto;
+ proto = iph->nexthdr;
ops = rcu_dereference(inet6_protos[proto]);
- if (!ops || !ops->gro_receive)
- goto out_unlock;
+ if (!ops || !ops->gro_receive) {
+ __pskb_pull(skb, skb_gro_offset(skb));
+ proto = ipv6_gso_pull_exthdrs(skb, proto);
+ skb_gro_pull(skb, -skb_transport_offset(skb));
+ skb_reset_transport_header(skb);
+ __skb_push(skb, skb_gro_offset(skb));
+
+ if (!ops || !ops->gro_receive)
+ goto out_unlock;
+
+ iph = ipv6_hdr(skb);
+ }
+
+ IPV6_GRO_CB(skb)->proto = proto;
flush--;
- skb_reset_transport_header(skb);
nlen = skb_network_header_len(skb);
for (p = *head; p; p = p->next) {
@@ -839,8 +851,13 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
NAPI_GRO_CB(skb)->flush |= flush;
+ csum = skb->csum;
+ skb_postpull_rcsum(skb, iph, skb_network_header_len(skb));
+
pp = ops->gro_receive(head, skb);
+ skb->csum = csum;
+
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e5b85d4..00f1269 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -948,7 +948,7 @@ struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb)
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
- if (!tcp_v6_check(skb->len, &iph->saddr, &iph->daddr,
+ if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr,
skb->csum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
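For reference, the calling convention for a protocol gro_receive
handler after this change looks roughly like this (sketch only,
mirroring the inet/tcp hunks above; example_gro_receive is a made-up
name):

#include <linux/netdevice.h>
#include <linux/ip.h>

/* Sketch: read the header in place (possibly straight out of frag 0)
 * with skb_gro_header(), then advance the GRO offset with
 * skb_gro_pull() instead of pulling skb->data.
 */
static struct sk_buff **example_gro_receive(struct sk_buff **head,
                                            struct sk_buff *skb)
{
        struct iphdr *iph;

        iph = skb_gro_header(skb, sizeof(*iph));
        if (unlikely(!iph)) {
                NAPI_GRO_CB(skb)->flush = 1;    /* header unavailable */
                return NULL;
        }
        /* ... match iph against the packets queued on *head ... */
        skb_gro_pull(skb, sizeof(*iph));
        skb_set_transport_header(skb, skb_gro_offset(skb));
        /* ... then hand off to the next layer's gro_receive ... */
        return NULL;
}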
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: cxgb3: Replace LRO with GRO
2009-01-17 13:08 ` Herbert Xu
@ 2009-01-18 20:33 ` Divy Le Ray
2009-01-18 22:50 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Divy Le Ray @ 2009-01-18 20:33 UTC (permalink / raw)
To: Herbert Xu; +Cc: davem, netdev, Steve Wise
Herbert Xu wrote:
> On Sat, Jan 17, 2009 at 03:11:26AM -0800, Divy Le Ray wrote:
>> Yes.
>
> Great. Please give this patch a whirl.
>
Hi Herbert,
Thanks for the patch. It does improve the performance.
I'm now getting about 5.3Gbs.
Here is an oprofile output:
33.929900 copy_user_generic_unrolled vmlinux
7.150300 process_responses cxgb3.ko
6.261100 refill_fl cxgb3.ko
6.181700 memcpy vmlinux
6.168100 inet_gro_receive vmlinux
4.593700 put_page vmlinux
4.085600 dev_gro_receive vmlinux
3.833800 tcp_gro_receive vmlinux
1.817100 free_hot_cold_page vmlinux
1.715000 skb_copy_datagram_iovec vmlinux
1.705900 skb_gro_header vmlinux
1.676400 get_page_from_freelist vmlinux
1.256700 tcp_recvmsg vmlinux
0.893800 get_pageblock_flags_group vmlinux
0.850700 irq_entries_start vmlinux
0.803000 _raw_spin_lock vmlinux
0.782600 skb_gro_receive vmlinux
0.739500 memcpy_toiovec vmlinux
0.725900 napi_fraginfo_skb vmlinux
0.601200 kfree vmlinux
Cheers,
Divy
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: cxgb3: Replace LRO with GRO
2009-01-18 20:33 ` Divy Le Ray
@ 2009-01-18 22:50 ` Herbert Xu
0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-18 22:50 UTC (permalink / raw)
To: Divy Le Ray; +Cc: davem, netdev, Steve Wise
On Sun, Jan 18, 2009 at 12:33:42PM -0800, Divy Le Ray wrote:
>
> Thanks for the patch. It does improve the performance.
> I'm now getting about 5.3Gbs.
Thanks for testing, Divy!
> Here is an oprofile output:
> 33.929900 copy_user_generic_unrolled vmlinux
> 7.150300 process_responses cxgb3.ko
> 6.261100 refill_fl cxgb3.ko
> 6.181700 memcpy vmlinux
I wonder why this is still up there.
Maybe this patch will help?
diff --git a/net/core/dev.c b/net/core/dev.c
index 3742397..3cc19fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2554,7 +2554,8 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
skb_shinfo(skb)->nr_frags = info->nr_frags;
- memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
+ memcpy(skb_shinfo(skb)->frags, info->frags,
+ info->nr_frags * sizeof(*info->frags));
skb->data_len = info->len;
skb->len += info->len;
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: cxgb3: Replace LRO with GRO
2009-01-15 21:14 Divy Le Ray
2009-01-15 23:58 ` Herbert Xu
@ 2009-01-20 1:03 ` David Miller
2009-01-20 2:03 ` David Miller
1 sibling, 1 reply; 23+ messages in thread
From: David Miller @ 2009-01-20 1:03 UTC (permalink / raw)
To: divy; +Cc: herbert, netdev, swise
From: Divy Le Ray <divy@chelsio.com>
Date: Thu, 15 Jan 2009 13:14:03 -0800
> cxgb3: Replace LRO with GRO
>
> This patch makes cxgb3 invoke the GRO hooks instead of LRO. As
> GRO has a compatible external interface to LRO, this is a very
> straightforward replacement.
>
> I've kept the ioctl controls for per-queue LRO switches. However,
> we should not encourage anyone to use these.
>
> Because of that, I've also kept the skb construction code in
> cxgb3. Hopefully we can phase out those per-queue switches
> and then kill this too.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> Acked-by: Divy Le Ray <divy@chelsio.com>
I've added this to my net-next-2.6 tree.
I'm confident that you will work out the performance issues
with Herbert, else we'll simply revert this.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: cxgb3: Replace LRO with GRO
2009-01-20 1:03 ` David Miller
@ 2009-01-20 2:03 ` David Miller
2009-01-20 5:24 ` Herbert Xu
2009-01-20 10:04 ` Divy Le Ray
0 siblings, 2 replies; 23+ messages in thread
From: David Miller @ 2009-01-20 2:03 UTC (permalink / raw)
To: divy; +Cc: herbert, netdev, swise
From: David Miller <davem@davemloft.net>
Date: Mon, 19 Jan 2009 17:03:08 -0800 (PST)
> From: Divy Le Ray <divy@chelsio.com>
> Date: Thu, 15 Jan 2009 13:14:03 -0800
>
> > cxgb3: Replace LRO with GRO
> >
> > This patch makes cxgb3 invoke the GRO hooks instead of LRO. As
> > GRO has a compatible external interface to LRO, this is a very
> > straightforward replacement.
> >
> > I've kept the ioctl controls for per-queue LRO switches. However,
> > we should not encourage anyone to use these.
> >
> > Because of that, I've also kept the skb construction code in
> > cxgb3. Hopefully we can phase out those per-queue switches
> > and then kill this too.
> >
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> > Acked-by: Divy Le Ray <divy@chelsio.com>
>
> I've added this to my net-next-2.6 tree.
>
> I'm confident that you will work out the performance issues
> with Herbert, else we'll simply revert this.
I had to add a fix due to some build bustage resulting from
this change.
drivers/scsi/cxgb3i/cxgb3i_ddp.h was depending upon the implicit
inclusion of linux/vmalloc.h coming from the inet_lro.h include
which is removed by this change.
In file included from drivers/scsi/cxgb3i/cxgb3i_ddp.c:24:
drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_alloc_big_mem':
drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: error: implicit declaration of function 'vmalloc'
drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: warning: assignment makes pointer from integer without a cast
drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_free_big_mem':
drivers/scsi/cxgb3i/cxgb3i_ddp.h:157: error: implicit declaration of function 'vfree'
I'll fix this up.
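Most likely it's just a matter of adding the missing include
directly, e.g.:

/* drivers/scsi/cxgb3i/cxgb3i_ddp.h: stop relying on the implicit
 * inclusion that used to come in via inet_lro.h.
 */
#include <linux/vmalloc.h>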
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: cxgb3: Replace LRO with GRO
2009-01-20 2:03 ` David Miller
@ 2009-01-20 5:24 ` Herbert Xu
2009-01-20 10:04 ` Divy Le Ray
1 sibling, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-20 5:24 UTC (permalink / raw)
To: David Miller; +Cc: divy, netdev, swise
On Mon, Jan 19, 2009 at 06:03:30PM -0800, David Miller wrote:
>
> drivers/scsi/cxgb3i/cxgb3i_ddp.h was depending upon the implicit
> inclusion of linux/vmalloc.h coming from the inet_lro.h include
> which is removed by this change.
>
> In file included from drivers/scsi/cxgb3i/cxgb3i_ddp.c:24:
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_alloc_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: error: implicit declaration of function 'vmalloc'
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: warning: assignment makes pointer from integer without a cast
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_free_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:157: error: implicit declaration of function 'vfree'
Thanks Dave!
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: cxgb3: Replace LRO with GRO
2009-01-20 2:03 ` David Miller
2009-01-20 5:24 ` Herbert Xu
@ 2009-01-20 10:04 ` Divy Le Ray
1 sibling, 0 replies; 23+ messages in thread
From: Divy Le Ray @ 2009-01-20 10:04 UTC (permalink / raw)
To: David Miller; +Cc: herbert, netdev, Steve Wise
David Miller wrote:
> I had to add a fix due to some build bustage resulting from
> this change.
>
> drivers/scsi/cxgb3i/cxgb3i_ddp.h was depending upon the implicit
> inclusion of linux/vmalloc.h coming from the inet_lro.h include
> which is removed by this change.
>
> In file included from drivers/scsi/cxgb3i/cxgb3i_ddp.c:24:
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_alloc_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: error: implicit declaration of function 'vmalloc'
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:148: warning: assignment makes pointer from integer without a cast
> drivers/scsi/cxgb3i/cxgb3i_ddp.h: In function 'cxgb3i_free_big_mem':
> drivers/scsi/cxgb3i/cxgb3i_ddp.h:157: error: implicit declaration of function 'vfree'
>
> I'll fix this up.
Oops. Sorry about that. Thanks for fixing it.
Cheers,
Divy
^ permalink raw reply [flat|nested] 23+ messages in thread
* [1/2] e1000e: Invoke VLAN GRO handler
@ 2009-01-13 9:26 Herbert Xu
2009-01-15 6:59 ` cxgb3: Replace LRO with GRO Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2009-01-13 9:26 UTC (permalink / raw)
To: David S. Miller, Jeff Kirsher, netdev
Hi Dave:
I'm starting to convert the LRO users now. Here's the first
patch, which is trivial. It simply completes the test driver
that I started out with.
e1000e: Invoke VLAN GRO handler
Now that VLAN has GRO support, we can call its GRO handler
as well.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 91817d0..f73faac 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -99,8 +99,8 @@ static void e1000_receive_skb(struct e1000_adapter *adapter,
skb->protocol = eth_type_trans(skb, netdev);
if (adapter->vlgrp && (status & E1000_RXD_STAT_VP))
- vlan_hwaccel_receive_skb(skb, adapter->vlgrp,
- le16_to_cpu(vlan));
+ vlan_gro_receive(&adapter->napi, adapter->vlgrp,
+ le16_to_cpu(vlan), skb);
else
napi_gro_receive(&adapter->napi, skb);
}
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related [flat|nested] 23+ messages in thread
* cxgb3: Replace LRO with GRO
2009-01-13 9:26 [1/2] e1000e: Invoke VLAN GRO handler Herbert Xu
@ 2009-01-15 6:59 ` Herbert Xu
0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2009-01-15 6:59 UTC (permalink / raw)
To: David S. Miller, Divy Le Ray, netdev
Hi Dave:
This one is slightly different because the driver has chosen to
expose per-queue switches to user space to control LRO. I've
kept those for now.
cxgb3: Replace LRO with GRO
This patch makes cxgb3 invoke the GRO hooks instead of LRO. As
GRO has a compatible external interface to LRO, this is a very
straightforward replacement.
I've kept the ioctl controls for per-queue LRO switches. However,
we should not encourage anyone to use these.
Because of that, I've also kept the skb construction code in
cxgb3. Hopefully we can phase out those per-queue switches
and then kill this too.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index ee37540..d6bf215 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2407,7 +2407,6 @@ config CHELSIO_T3
tristate "Chelsio Communications T3 10Gb Ethernet support"
depends on CHELSIO_T3_DEPENDS
select FW_LOADER
- select INET_LRO
help
This driver supports Chelsio T3-based gigabit and 10Gb Ethernet
adapters.
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 5b346f9..41b87a1 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -42,7 +42,6 @@
#include <linux/cache.h>
#include <linux/mutex.h>
#include <linux/bitops.h>
-#include <linux/inet_lro.h>
#include "t3cdev.h"
#include <asm/io.h>
@@ -173,15 +172,11 @@ enum { /* per port SGE statistics */
SGE_PSTAT_TX_CSUM, /* # of TX checksum offloads */
SGE_PSTAT_VLANEX, /* # of VLAN tag extractions */
SGE_PSTAT_VLANINS, /* # of VLAN tag insertions */
- SGE_PSTAT_LRO_AGGR, /* # of page chunks added to LRO sessions */
- SGE_PSTAT_LRO_FLUSHED, /* # of flushed LRO sessions */
- SGE_PSTAT_LRO_NO_DESC, /* # of overflown LRO sessions */
SGE_PSTAT_MAX /* must be last */
};
-#define T3_MAX_LRO_SES 8
-#define T3_MAX_LRO_MAX_PKTS 64
+struct napi_gro_fraginfo;
struct sge_qset { /* an SGE queue set */
struct adapter *adap;
@@ -189,12 +184,8 @@ struct sge_qset { /* an SGE queue set */
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
struct sge_txq txq[SGE_TXQ_PER_SET];
- struct net_lro_mgr lro_mgr;
- struct net_lro_desc lro_desc[T3_MAX_LRO_SES];
- struct skb_frag_struct *lro_frag_tbl;
- int lro_nfrags;
+ struct napi_gro_fraginfo lro_frag_tbl;
int lro_enabled;
- int lro_frag_len;
void *lro_va;
struct net_device *netdev;
struct netdev_queue *tx_q; /* associated netdev TX queue */
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 2847f94..e94430d 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -508,19 +508,9 @@ static void set_qset_lro(struct net_device *dev, int qset_idx, int val)
{
struct port_info *pi = netdev_priv(dev);
struct adapter *adapter = pi->adapter;
- int i, lro_on = 1;
adapter->params.sge.qset[qset_idx].lro = !!val;
adapter->sge.qs[qset_idx].lro_enabled = !!val;
-
- /* let ethtool report LRO on only if all queues are LRO enabled */
- for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; ++i)
- lro_on &= adapter->params.sge.qset[i].lro;
-
- if (lro_on)
- dev->features |= NETIF_F_LRO;
- else
- dev->features &= ~NETIF_F_LRO;
}
/**
@@ -1433,9 +1423,9 @@ static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS);
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM);
*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD);
- *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_AGGR);
- *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_FLUSHED);
- *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_LRO_NO_DESC);
+ *data++ = 0;
+ *data++ = 0;
+ *data++ = 0;
*data++ = s->rx_cong_drops;
*data++ = s->num_toggled;
@@ -1824,25 +1814,6 @@ static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
memset(&wol->sopass, 0, sizeof(wol->sopass));
}
-static int cxgb3_set_flags(struct net_device *dev, u32 data)
-{
- struct port_info *pi = netdev_priv(dev);
- int i;
-
- if (data & ETH_FLAG_LRO) {
- if (!pi->rx_csum_offload)
- return -EINVAL;
-
- for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
- set_qset_lro(dev, i, 1);
-
- } else
- for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++)
- set_qset_lro(dev, i, 0);
-
- return 0;
-}
-
static const struct ethtool_ops cxgb_ethtool_ops = {
.get_settings = get_settings,
.set_settings = set_settings,
@@ -1872,8 +1843,6 @@ static const struct ethtool_ops cxgb_ethtool_ops = {
.get_regs = get_regs,
.get_wol = get_wol,
.set_tso = ethtool_op_set_tso,
- .get_flags = ethtool_op_get_flags,
- .set_flags = cxgb3_set_flags,
};
static int in_range(int val, int lo, int hi)
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 6c641a8..893783b 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -585,8 +585,7 @@ static void t3_reset_qset(struct sge_qset *q)
memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
q->txq_stopped = 0;
q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
- kfree(q->lro_frag_tbl);
- q->lro_nfrags = q->lro_frag_len = 0;
+ q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0;
}
@@ -1945,10 +1944,8 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
qs->port_stats[SGE_PSTAT_VLANEX]++;
if (likely(grp))
if (lro)
- lro_vlan_hwaccel_receive_skb(&qs->lro_mgr, skb,
- grp,
- ntohs(p->vlan),
- p);
+ vlan_gro_receive(&qs->napi, grp,
+ ntohs(p->vlan), skb);
else {
if (unlikely(pi->iscsi_ipv4addr &&
is_arp(skb))) {
@@ -1965,7 +1962,7 @@ static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
dev_kfree_skb_any(skb);
} else if (rq->polling) {
if (lro)
- lro_receive_skb(&qs->lro_mgr, skb, p);
+ napi_gro_receive(&qs->napi, skb);
else {
if (unlikely(pi->iscsi_ipv4addr && is_arp(skb)))
cxgb3_arp_process(adap, skb);
@@ -1981,59 +1978,6 @@ static inline int is_eth_tcp(u32 rss)
}
/**
- * lro_frame_ok - check if an ingress packet is eligible for LRO
- * @p: the CPL header of the packet
- *
- * Returns true if a received packet is eligible for LRO.
- * The following conditions must be true:
- * - packet is TCP/IP Ethernet II (checked elsewhere)
- * - not an IP fragment
- * - no IP options
- * - TCP/IP checksums are correct
- * - the packet is for this host
- */
-static inline int lro_frame_ok(const struct cpl_rx_pkt *p)
-{
- const struct ethhdr *eh = (struct ethhdr *)(p + 1);
- const struct iphdr *ih = (struct iphdr *)(eh + 1);
-
- return (*((u8 *)p + 1) & 0x90) == 0x10 && p->csum == htons(0xffff) &&
- eh->h_proto == htons(ETH_P_IP) && ih->ihl == (sizeof(*ih) >> 2);
-}
-
-static int t3_get_lro_header(void **eh, void **iph, void **tcph,
- u64 *hdr_flags, void *priv)
-{
- const struct cpl_rx_pkt *cpl = priv;
-
- if (!lro_frame_ok(cpl))
- return -1;
-
- *eh = (struct ethhdr *)(cpl + 1);
- *iph = (struct iphdr *)((struct ethhdr *)*eh + 1);
- *tcph = (struct tcphdr *)((struct iphdr *)*iph + 1);
-
- *hdr_flags = LRO_IPV4 | LRO_TCP;
- return 0;
-}
-
-static int t3_get_skb_header(struct sk_buff *skb,
- void **iph, void **tcph, u64 *hdr_flags,
- void *priv)
-{
- void *eh;
-
- return t3_get_lro_header(&eh, iph, tcph, hdr_flags, priv);
-}
-
-static int t3_get_frag_header(struct skb_frag_struct *frag, void **eh,
- void **iph, void **tcph, u64 *hdr_flags,
- void *priv)
-{
- return t3_get_lro_header(eh, iph, tcph, hdr_flags, priv);
-}
-
-/**
* lro_add_page - add a page chunk to an LRO session
* @adap: the adapter
* @qs: the associated queue set
@@ -2049,8 +1993,9 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
{
struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
struct cpl_rx_pkt *cpl;
- struct skb_frag_struct *rx_frag = qs->lro_frag_tbl;
- int nr_frags = qs->lro_nfrags, frag_len = qs->lro_frag_len;
+ struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags;
+ int nr_frags = qs->lro_frag_tbl.nr_frags;
+ int frag_len = qs->lro_frag_tbl.len;
int offset = 0;
if (!nr_frags) {
@@ -2069,13 +2014,13 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
rx_frag->page_offset = sd->pg_chunk.offset + offset;
rx_frag->size = len;
frag_len += len;
- qs->lro_nfrags++;
- qs->lro_frag_len = frag_len;
+ qs->lro_frag_tbl.nr_frags++;
+ qs->lro_frag_tbl.len = frag_len;
if (!complete)
return;
- qs->lro_nfrags = qs->lro_frag_len = 0;
+ qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY;
cpl = qs->lro_va;
if (unlikely(cpl->vlan_valid)) {
@@ -2084,35 +2029,15 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
struct vlan_group *grp = pi->vlan_grp;
if (likely(grp != NULL)) {
- lro_vlan_hwaccel_receive_frags(&qs->lro_mgr,
- qs->lro_frag_tbl,
- frag_len, frag_len,
- grp, ntohs(cpl->vlan),
- cpl, 0);
- return;
+ vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan),
+ &qs->lro_frag_tbl);
+ goto out;
}
}
- lro_receive_frags(&qs->lro_mgr, qs->lro_frag_tbl,
- frag_len, frag_len, cpl, 0);
-}
+ napi_gro_frags(&qs->napi, &qs->lro_frag_tbl);
-/**
- * init_lro_mgr - initialize a LRO manager object
- * @lro_mgr: the LRO manager object
- */
-static void init_lro_mgr(struct sge_qset *qs, struct net_lro_mgr *lro_mgr)
-{
- lro_mgr->dev = qs->netdev;
- lro_mgr->features = LRO_F_NAPI;
- lro_mgr->ip_summed = CHECKSUM_UNNECESSARY;
- lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
- lro_mgr->max_desc = T3_MAX_LRO_SES;
- lro_mgr->lro_arr = qs->lro_desc;
- lro_mgr->get_frag_header = t3_get_frag_header;
- lro_mgr->get_skb_header = t3_get_skb_header;
- lro_mgr->max_aggr = T3_MAX_LRO_MAX_PKTS;
- if (lro_mgr->max_aggr > MAX_SKB_FRAGS)
- lro_mgr->max_aggr = MAX_SKB_FRAGS;
+out:
+ qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0;
}
/**
@@ -2356,10 +2281,6 @@ next_fl:
}
deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
- lro_flush_all(&qs->lro_mgr);
- qs->port_stats[SGE_PSTAT_LRO_AGGR] = qs->lro_mgr.stats.aggregated;
- qs->port_stats[SGE_PSTAT_LRO_FLUSHED] = qs->lro_mgr.stats.flushed;
- qs->port_stats[SGE_PSTAT_LRO_NO_DESC] = qs->lro_mgr.stats.no_desc;
if (sleeping)
check_ring_db(adap, qs, sleeping);
@@ -2906,7 +2827,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
{
int i, avail, ret = -ENOMEM;
struct sge_qset *q = &adapter->sge.qs[id];
- struct net_lro_mgr *lro_mgr = &q->lro_mgr;
init_qset_cntxt(q, id);
setup_timer(&q->tx_reclaim_timer, sge_timer_cb, (unsigned long)q);
@@ -2986,10 +2906,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
q->fl[0].order = FL0_PG_ORDER;
q->fl[1].order = FL1_PG_ORDER;
- q->lro_frag_tbl = kcalloc(MAX_FRAME_SIZE / FL1_PG_CHUNK_SIZE + 1,
- sizeof(struct skb_frag_struct),
- GFP_KERNEL);
- q->lro_nfrags = q->lro_frag_len = 0;
spin_lock_irq(&adapter->sge.reg_lock);
/* FL threshold comparison uses < */
@@ -3041,8 +2957,6 @@ int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
q->tx_q = netdevq;
t3_update_qset_coalesce(q, p);
- init_lro_mgr(q, lro_mgr);
-
avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
GFP_KERNEL | __GFP_COMP);
if (!avail) {
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related [flat|nested] 23+ messages in thread
Thread overview: 23+ messages
2009-01-20 10:14 cxgb3: Replace LRO with GRO Divy Le Ray
2009-01-21 8:29 ` Herbert Xu
2009-01-22 9:42 ` Divy Le Ray
2009-02-16 3:36 ` Herbert Xu
2009-02-16 3:47 ` Divy Le Ray
2009-03-13 7:28 ` Divy Le Ray
2009-04-13 15:24 ` Herbert Xu
-- strict thread matches above, loose matches on Subject: below --
2009-01-15 21:14 Divy Le Ray
2009-01-15 23:58 ` Herbert Xu
2009-01-16 8:06 ` Divy Le Ray
2009-01-16 8:56 ` Herbert Xu
2009-01-16 11:12 ` Divy Le Ray
2009-01-16 23:58 ` Herbert Xu
2009-01-17 5:08 ` Herbert Xu
2009-01-17 11:11 ` Divy Le Ray
2009-01-17 13:08 ` Herbert Xu
2009-01-18 20:33 ` Divy Le Ray
2009-01-18 22:50 ` Herbert Xu
2009-01-20 1:03 ` David Miller
2009-01-20 2:03 ` David Miller
2009-01-20 5:24 ` Herbert Xu
2009-01-20 10:04 ` Divy Le Ray
2009-01-13 9:26 [1/2] e1000e: Invoke VLAN GRO handler Herbert Xu
2009-01-15 6:59 ` cxgb3: Replace LRO with GRO Herbert Xu