* [3/5] [NET]: Add software TSOv4
2006-06-20 9:09 Herbert Xu
@ 2006-06-20 9:29 ` Herbert Xu
0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-20 9:29 UTC
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 361 bytes --]
Hi:
[NET]: Add software TSOv4
This patch adds the GSO implementation for IPv4 TCP.
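In outline, the call flow the patch implements is (names as introduced by
the attached patch; error paths omitted):

    inet_gso_segment(skb, sg)           /* net/ipv4/af_inet.c */
        pull the IP header, look up inet_protos[proto]->gso_segment
        -> tcp_tso_segment(skb, sg)     /* net/ipv4/tcp.c */
               pull the TCP header
               -> skb_segment(skb, sg)  /* net/core/skbuff.c: split the
                                           payload at gso_size boundaries */
               fix seq/fin/psh/cwr and the checksum on each segment
        fix the IP id, tot_len and header checksum on each segment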
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: p3.patch --]
[-- Type: text/plain, Size: 7982 bytes --]
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1299,6 +1299,7 @@ extern void skb_split(struct sk_b
struct sk_buff *skb1, const u32 len);
extern void skb_release_data(struct sk_buff *skb);
+extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg);
static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
int len, void *buffer)
diff --git a/include/net/protocol.h b/include/net/protocol.h
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,6 +37,7 @@
struct net_protocol {
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
+ struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg);
int no_policy;
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1087,6 +1087,8 @@ extern struct request_sock_ops tcp_reque
extern int tcp_v4_destroy_sock(struct sock *sk);
+extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg);
+
#ifdef CONFIG_PROC_FS
extern int tcp4_proc_init(void);
extern void tcp4_proc_exit(void);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1826,6 +1826,132 @@ unsigned char *skb_pull_rcsum(struct sk_
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+/**
+ * skb_segment - Perform protocol segmentation on skb.
+ * @skb: buffer to segment
+ * @sg: whether scatter-gather can be used for generated segments
+ *
+ * This function performs segmentation on the given skb. It returns
+ * the segment at the given position. It returns NULL if there are
+ * no more segments to generate, or when an error is encountered.
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = NULL;
+ struct sk_buff *tail = NULL;
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+ unsigned int doffset = skb->data - skb->mac.raw;
+ unsigned int offset = doffset;
+ unsigned int headroom;
+ unsigned int len;
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ int err = -ENOMEM;
+ int i = 0;
+ int pos;
+
+ __skb_push(skb, doffset);
+ headroom = skb_headroom(skb);
+ pos = skb_headlen(skb);
+
+ do {
+ struct sk_buff *nskb;
+ skb_frag_t *frag;
+ int hsize, nsize;
+ int k;
+ int size;
+
+ len = skb->len - offset;
+ if (len > mss)
+ len = mss;
+
+ hsize = skb_headlen(skb) - offset;
+ if (hsize < 0)
+ hsize = 0;
+ nsize = hsize + doffset;
+ if (nsize > len + doffset || !sg)
+ nsize = len + doffset;
+
+ nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
+ if (unlikely(!nskb))
+ goto err;
+
+ if (segs)
+ tail->next = nskb;
+ else
+ segs = nskb;
+ tail = nskb;
+
+ nskb->dev = skb->dev;
+ nskb->priority = skb->priority;
+ nskb->protocol = skb->protocol;
+ nskb->dst = dst_clone(skb->dst);
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ nskb->pkt_type = skb->pkt_type;
+ nskb->mac_len = skb->mac_len;
+
+ skb_reserve(nskb, headroom);
+ nskb->mac.raw = nskb->data;
+ nskb->nh.raw = nskb->data + skb->mac_len;
+ nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
+ memcpy(skb_put(nskb, doffset), skb->data, doffset);
+
+ if (!sg) {
+ nskb->csum = skb_copy_and_csum_bits(skb, offset,
+ skb_put(nskb, len),
+ len, 0);
+ continue;
+ }
+
+ frag = skb_shinfo(nskb)->frags;
+ k = 0;
+
+ nskb->ip_summed = CHECKSUM_HW;
+ nskb->csum = skb->csum;
+ memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ *frag = skb_shinfo(skb)->frags[i];
+ get_page(frag->page);
+ size = frag->size;
+
+ if (pos < offset) {
+ frag->page_offset += offset - pos;
+ frag->size -= offset - pos;
+ }
+
+ k++;
+
+ if (pos + size <= offset + len) {
+ i++;
+ pos += size;
+ } else {
+ frag->size -= pos + size - (offset + len);
+ break;
+ }
+
+ frag++;
+ }
+
+ skb_shinfo(nskb)->nr_frags = k;
+ nskb->data_len = len - hsize;
+ nskb->len += nskb->data_len;
+ nskb->truesize += nskb->data_len;
+ } while ((offset += len) < skb->len);
+
+ return segs;
+
+err:
+ while ((skb = segs)) {
+ segs = skb->next;
+ kfree(skb);
+ }
+ return ERR_PTR(err);
+}
+
+EXPORT_SYMBOL_GPL(skb_segment);
+
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -68,6 +68,7 @@
*/
#include <linux/config.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *
EXPORT_SYMBOL(inet_sk_rebuild_header);
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct iphdr *iph;
+ struct net_protocol *ops;
+ int proto;
+ int ihl;
+ int id;
+
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ goto out;
+
+ iph = skb->nh.iph;
+ ihl = iph->ihl * 4;
+ if (ihl < sizeof(*iph))
+ goto out;
+
+ if (!pskb_may_pull(skb, ihl))
+ goto out;
+
+ skb->h.raw = __skb_pull(skb, ihl);
+ iph = skb->nh.iph;
+ id = ntohs(iph->id);
+ proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_protos[proto]);
+ if (ops && ops->gso_segment)
+ segs = ops->gso_segment(skb, sg);
+ rcu_read_unlock();
+
+ if (IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ do {
+ iph = skb->nh.iph;
+ iph->id = htons(id++);
+ iph->tot_len = htons(skb->len - skb->mac_len);
+ iph->check = 0;
+ iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
+ } while ((skb = skb->next));
+
+out:
+ return segs;
+}
+
#ifdef CONFIG_IP_MULTICAST
static struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol
static struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
+ .gso_segment = tcp_tso_segment,
.no_policy = 1,
};
@@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void);
static struct packet_type ip_packet_type = {
.type = __constant_htons(ETH_P_IP),
.func = ip_rcv,
+ .gso_segment = inet_gso_segment,
};
static int __init inet_init(void)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -258,6 +258,7 @@
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/cache.h>
+#include <linux/err.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *s
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct tcphdr *th;
+ unsigned thlen;
+ unsigned int seq;
+ unsigned int delta;
+ unsigned int oldlen;
+ unsigned int len;
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ goto out;
+
+ th = skb->h.th;
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ oldlen = ~htonl(skb->len);
+ __skb_pull(skb, thlen);
+
+ segs = skb_segment(skb, sg);
+ if (IS_ERR(segs))
+ goto out;
+
+ len = skb_shinfo(skb)->gso_size;
+ delta = csum_add(oldlen, htonl(thlen + len));
+
+ skb = segs;
+ th = skb->h.th;
+ seq = ntohl(th->seq);
+
+ do {
+ th->fin = th->psh = 0;
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ th->check = csum_fold(csum_partial(
+ skb->h.raw, thlen, csum_add(skb->csum, delta)));
+ }
+
+ seq += len;
+ skb = skb->next;
+ th = skb->h.th;
+
+ th->seq = htonl(seq);
+ th->cwr = 0;
+ } while (skb->next);
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
+ th->check = csum_fold(csum_partial(
+ skb->h.raw, thlen, csum_add(skb->csum, delta)));
+ }
+
+out:
+ return segs;
+}
+
extern void __skb_cb_too_small_for_tcp(int, int);
extern struct tcp_congestion_ops tcp_reno;
* [0/5] GSO: Generic Segmentation Offload
@ 2006-06-22 8:12 Herbert Xu
2006-06-22 8:12 ` [1/5] [NET]: Merge TSO/UFO fields in sk_buff Herbert Xu
` (7 more replies)
0 siblings, 8 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:12 UTC
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 3371 bytes --]
Hi:
This is a repost of the GSO patches. The main change is the fix to a bug
in the way dev->gso_skb is freed. This series requires the dev_deactivate
patch that I just posted.
Here is the original description:
This series adds Generic Segmentation Offload (GSO) support to the Linux
networking stack.
Many people have observed that a lot of the savings in TSO come from
traversing the networking stack once rather than many times for each
super-packet. These savings can be obtained without hardware support.
In fact, the concept can be applied to other protocols such as TCPv6,
UDP, or even DCCP.
The key to minimising the cost of implementing this is to postpone the
segmentation as late as possible. In an ideal world, the segmentation
would occur inside each NIC driver, which would rip the super-packet apart
and either produce SG lists that are fed directly to the hardware, or
linearise each segment into pre-allocated memory to be fed to the NIC.
This would eliminate segmented skb's altogether.
Unfortunately this requires modifying each and every NIC driver, so it
would take quite some time. A much easier solution is to perform the
segmentation just before entry into the driver's xmit routine. This
series of patches does exactly that.
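As an illustration only, the hook point looks roughly like this. This is
a simplified sketch, not the code the series actually adds (the real thing
lives in dev_hard_start_xmit()/dev_gso_segment() and also handles
requeueing); skb_gso_segment() and netif_needs_gso() are helpers that
patches 1 and 2 introduce, the rest is pseudo-kernel code:

static int xmit_gso(struct net_device *dev, struct sk_buff *skb)
{
	if (netif_needs_gso(dev, skb)) {
		/* The device lacks the required offload feature:
		 * segment in software just before it sees the packet. */
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, dev->features & NETIF_F_SG);
		if (IS_ERR(segs))
			return PTR_ERR(segs);

		kfree_skb(skb);		/* super-packet no longer needed */
		while (segs) {
			struct sk_buff *nskb = segs;

			segs = segs->next;
			nskb->next = NULL;
			/* The driver only ever sees MTU-sized skbs. */
			dev->hard_start_xmit(nskb, dev);
		}
		return 0;
	}

	/* Hardware TSO/UFO, or an ordinary packet: pass it through. */
	return dev->hard_start_xmit(skb, dev);
}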
I've attached some numbers to demonstrate the savings brought on by
doing this. The best scenario is obviously the case where the underlying
NIC supports SG. This means that we simply have to manipulate the SG
entries and place them into individual skb's before passing them to the
driver. The attached file lo-res shows this.
The test was performed through the loopback device, which is a fairly good
approximation of an SG-capable NIC.
GSO, like TSO, is only effective if the MTU is significantly less than the
maximum value of 64K. So only the case where the MTU was set to 1500 is
of interest. There we can see that the throughput improved by 17.5%
(3061.05Mb/s => 3598.17Mb/s). The actual saving in transmission cost is
in fact a lot more than that, as the majority of the time here is spent on
the RX side, which still has to deal with 1500-byte packets.
The worst-case scenario is where the NIC does not support SG and the user
uses write(2), which means that we have to copy the data twice. The files
gso-off/gso-on provide data for this case (the test was carried out on
e100). As you can see, the cost of the extra copy is mostly offset by the
reduction in the cost of going through the networking stack.
For now GSO is off by default but can be enabled through ethtool. It is
conceivable that with enough optimisation GSO could be a win in most cases
and we could enable it by default.
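For example (mirroring the commands in the attached lo-res log; the
interface name is illustrative):

    $ sudo ethtool -K eth0 gso on
    $ sudo ethtool -K eth0 gso off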
However, even without being enabled explicitly, GSO can still function on
bridged and forwarded packets. As it is, passing TSO packets through a
bridge only works if all constituents support TSO. GSO provides a
fallback, so that we may enable TSO for a bridge even if some of its
constituents do not support TSO.
This provides massive savings for Xen, as it uses a bridge-based
architecture and TSO/GSO produces a much larger effective MTU for internal
traffic between domains.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: lo-res --]
[-- Type: text/plain, Size: 1632 bytes --]
$ sudo ./ethtool -K lo gso on
$ sudo ifconfig lo mtu 1500
$ netperf -t TCP_STREAM
TCP STREAM TEST to localhost
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    3598.17
$ sudo ./ethtool -K lo gso off
$ netperf -t TCP_STREAM
TCP STREAM TEST to localhost
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    3061.05
$ sudo ifconfig lo mtu 60000
$ netperf -t TCP_STREAM
TCP STREAM TEST to localhost
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    8245.05
$ sudo ./ethtool -K lo gso on
$ netperf -t TCP_STREAM
TCP STREAM TEST to localhost
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    8563.36
$ sudo ifconfig lo mtu 16436
$ netperf -t TCP_STREAM
TCP STREAM TEST to localhost
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    7359.95
$ sudo ./ethtool -K lo gso off
$ netperf -t TCP_STREAM
TCP STREAM TEST to localhost
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    7535.04
$
[-- Attachment #3: gso-off --]
[-- Type: text/plain, Size: 12446 bytes --]
CPU: PIII, speed 1200 MHz (estimated)
Counted CPU_CLK_UNHALTED events (clocks processor is not halted) with a unit mask of 0x00 (No unit mask) count 100000
samples % symbol name
1247 21.7551 csum_partial_copy_generic
294 5.1291 prep_new_page
240 4.1870 __alloc_skb
120 2.0935 tcp_sendmsg
113 1.9714 get_offset_pmtmr
113 1.9714 kfree
103 1.7969 skb_release_data
103 1.7969 timer_interrupt
101 1.7620 ip_queue_xmit
96 1.6748 skb_clone
94 1.6399 __kmalloc
94 1.6399 net_rx_action
86 1.5003 tcp_transmit_skb
80 1.3957 kmem_cache_free
76 1.3259 tcp_clean_rtx_queue
67 1.1689 ip_output
66 1.1514 mark_offset_pmtmr
65 1.1340 tcp_v4_rcv
64 1.1165 local_bh_enable
62 1.0816 kmem_cache_alloc
59 1.0293 irq_entries_start
59 1.0293 page_fault
57 0.9944 tcp_push_one
52 0.9072 kfree_skbmem
47 0.8200 __qdisc_run
47 0.8200 csum_partial
47 0.8200 netif_receive_skb
46 0.8025 __kfree_skb
46 0.8025 tcp_init_tso_segs
44 0.7676 __copy_to_user_ll
44 0.7676 dev_queue_xmit
39 0.6804 pfifo_fast_enqueue
39 0.6804 system_call
37 0.6455 __copy_from_user_ll
37 0.6455 ip_rcv
36 0.6281 __tcp_select_window
33 0.5757 sock_wfree
31 0.5408 __do_softirq
31 0.5408 tcp_v4_send_check
30 0.5234 eth_header
28 0.4885 tcp_rcv_established
27 0.4710 restore_nocheck
26 0.4536 pfifo_fast_dequeue
25 0.4361 __do_IRQ
25 0.4361 do_softirq
25 0.4361 tcp_build_and_update_options
25 0.4361 tcp_snd_test
23 0.4013 cache_alloc_refill
23 0.4013 handle_IRQ_event
23 0.4013 tcp_ack
22 0.3838 free_block
22 0.3838 ip_route_input
21 0.3664 __netif_rx_schedule
21 0.3664 schedule
20 0.3489 do_wp_page
20 0.3489 neigh_resolve_output
19 0.3315 do_IRQ
19 0.3315 do_page_fault
19 0.3315 do_select
19 0.3315 fget_light
19 0.3315 ip_local_deliver
18 0.3140 __tcp_push_pending_frames
18 0.3140 end_level_ioapic_irq
17 0.2966 cpu_idle
17 0.2966 delay_pmtmr
17 0.2966 tcp_select_window
16 0.2791 add_wait_queue
16 0.2791 rt_hash_code
16 0.2791 tcp_set_skb_tso_segs
15 0.2617 find_vma
15 0.2617 irq_exit
15 0.2617 update_send_head
14 0.2442 __switch_to
13 0.2268 __skb_checksum_complete
13 0.2268 common_interrupt
13 0.2268 dev_kfree_skb_any
13 0.2268 tcp_event_data_sent
13 0.2268 zap_pte_range
12 0.2094 __d_lookup
12 0.2094 __page_set_anon_rmap
12 0.2094 mod_timer
12 0.2094 ret_from_intr
12 0.2094 sock_poll
12 0.2094 tcp_current_mss
12 0.2094 tcp_write_xmit
11 0.1919 do_no_page
11 0.1919 error_code
11 0.1919 free_hot_cold_page
11 0.1919 i8042_interrupt
10 0.1745 __link_path_walk
10 0.1745 buffered_rmqueue
10 0.1745 sk_reset_timer
9 0.1570 __rmqueue
9 0.1570 dev_hard_start_xmit
9 0.1570 free_pages_bulk
9 0.1570 resume_kernel
9 0.1570 skb_checksum
9 0.1570 tcp_cong_avoid
9 0.1570 tcp_rtt_estimator
8 0.1396 do_anonymous_page
8 0.1396 eth_type_trans
8 0.1396 get_page_from_freelist
8 0.1396 tcp_ack_saw_tstamp
8 0.1396 tcp_v4_checksum_init
7 0.1221 __wake_up
7 0.1221 atomic_notifier_call_chain
7 0.1221 normal_poll
7 0.1221 sk_stream_write_space
7 0.1221 tcp_ack_packets_out
7 0.1221 tcp_check_space
7 0.1221 tcp_cwnd_validate
7 0.1221 tcp_reno_cong_avoid
6 0.1047 __pagevec_lru_add_active
6 0.1047 copy_from_user
6 0.1047 hrtimer_get_softirq_time
6 0.1047 lock_sock
6 0.1047 lookup_bh_lru
6 0.1047 net_tx_action
6 0.1047 remove_wait_queue
6 0.1047 tcp_new_space
6 0.1047 unmap_vmas
5 0.0872 __copy_user_intel
5 0.0872 __handle_mm_fault
5 0.0872 __page_cache_release
5 0.0872 core_sys_select
5 0.0872 del_timer
5 0.0872 dnotify_parent
5 0.0872 filemap_nopage
5 0.0872 find_get_page
5 0.0872 kfree_skb
5 0.0872 lru_cache_add_active
5 0.0872 max_select_fd
5 0.0872 mod_page_state_offset
5 0.0872 note_interrupt
5 0.0872 pipe_poll
5 0.0872 prepare_to_wait
5 0.0872 restore_all
5 0.0872 scheduler_tick
5 0.0872 slab_put_obj
5 0.0872 syscall_exit
5 0.0872 try_to_wake_up
5 0.0872 zone_watermark_ok
4 0.0698 __sk_dst_check
4 0.0698 copy_to_user
4 0.0698 do_poll
4 0.0698 do_pollfd
4 0.0698 fput
4 0.0698 inotify_dentry_parent_queue_event
4 0.0698 inotify_inode_queue_event
4 0.0698 memcpy
4 0.0698 sk_stream_wait_memory
4 0.0698 slab_get_obj
4 0.0698 sock_sendmsg
4 0.0698 strncpy_from_user
4 0.0698 strnlen_user
4 0.0698 tcp_should_expand_sndbuf
4 0.0698 tty_poll
3 0.0523 __alloc_pages
3 0.0523 __copy_user_zeroing_intel
3 0.0523 __d_path
3 0.0523 __find_get_block
3 0.0523 __follow_mount
3 0.0523 __netif_schedule
3 0.0523 __wake_up_bit
3 0.0523 __wake_up_common
3 0.0523 _atomic_dec_and_lock
3 0.0523 activate_task
3 0.0523 anon_vma_prepare
3 0.0523 bh_lru_install
3 0.0523 cond_resched
3 0.0523 do_lookup
3 0.0523 do_path_lookup
3 0.0523 do_readv_writev
3 0.0523 dup_fd
3 0.0523 effective_prio
3 0.0523 hrtimer_run_queues
3 0.0523 ing_filter
3 0.0523 link_path_walk
3 0.0523 notifier_call_chain
3 0.0523 preempt_schedule
3 0.0523 radix_tree_lookup
3 0.0523 release_pages
3 0.0523 run_timer_softirq
3 0.0523 run_workqueue
3 0.0523 sys_sendto
3 0.0523 sys_writev
3 0.0523 tty_ldisc_deref
3 0.0523 unmap_page_range
3 0.0523 vm_normal_page
2 0.0349 __brelse
2 0.0349 __find_get_block_slow
2 0.0349 __getblk
2 0.0349 __mod_page_state_offset
2 0.0349 __mod_timer
2 0.0349 acct_update_integrals
2 0.0349 adjtime_adjustment
2 0.0349 alloc_sock_iocb
2 0.0349 apic_timer_interrupt
2 0.0349 bit_waitqueue
2 0.0349 cache_flusharray
2 0.0349 cache_reap
2 0.0349 d_alloc
2 0.0349 dput
2 0.0349 fget
2 0.0349 finish_wait
2 0.0349 init_timer
2 0.0349 lock_timer_base
2 0.0349 opost_block
2 0.0349 page_remove_rmap
2 0.0349 permission
2 0.0349 poll_get_entry
2 0.0349 poll_initwait
2 0.0349 profile_munmap
2 0.0349 pty_chars_in_buffer
2 0.0349 put_page
2 0.0349 raise_softirq
2 0.0349 recalc_task_prio
2 0.0349 resume_userspace
2 0.0349 ret_from_exception
2 0.0349 rmqueue_bulk
2 0.0349 rw_verify_area
2 0.0349 sched_clock
2 0.0349 setup_frame
2 0.0349 skb_queue_head
2 0.0349 sock_aio_read
2 0.0349 sock_def_readable
2 0.0349 sys_ioctl
2 0.0349 sys_read
2 0.0349 task_curr
2 0.0349 task_timeslice
2 0.0349 tty_ldisc_try
2 0.0349 vfs_read
2 0.0349 vma_adjust
2 0.0349 vma_link
1 0.0174 __block_write_full_page
1 0.0174 __dentry_open
1 0.0174 __dequeue_signal
1 0.0174 __do_page_cache_readahead
1 0.0174 __fput
1 0.0174 __generic_file_aio_read
1 0.0174 __group_complete_signal
1 0.0174 __ip_route_output_key
1 0.0174 __lookup_mnt
1 0.0174 __mark_inode_dirty
1 0.0174 __pollwait
1 0.0174 __put_task_struct
1 0.0174 __put_user_4
1 0.0174 __queue_work
1 0.0174 __rcu_pending
1 0.0174 __sigqueue_alloc
1 0.0174 __vma_link_rb
1 0.0174 alloc_inode
1 0.0174 alloc_slabmgmt
1 0.0174 arch_unmap_area_topdown
1 0.0174 as_add_request
1 0.0174 as_fifo_expired
1 0.0174 as_find_next_arq
1 0.0174 autoremove_wake_function
1 0.0174 bio_init
1 0.0174 block_read_full_page
1 0.0174 cached_lookup
1 0.0174 can_vma_merge_before
1 0.0174 con_chars_in_buffer
1 0.0174 convert_fxsr_from_user
1 0.0174 copy_from_read_buf
1 0.0174 copy_pte_range
1 0.0174 cp_new_stat64
1 0.0174 d_splice_alias
1 0.0174 do_exit
1 0.0174 do_filp_open
1 0.0174 do_fork
1 0.0174 do_getname
1 0.0174 do_gettimeofday
1 0.0174 do_mpage_readpage
1 0.0174 do_sigaction
1 0.0174 do_sock_read
1 0.0174 do_sock_write
1 0.0174 do_sync_write
1 0.0174 do_timer
1 0.0174 drain_array
1 0.0174 dummy_inode_permission
1 0.0174 dup_mm
1 0.0174 dup_task_struct
1 0.0174 elv_queue_empty
1 0.0174 enqueue_hrtimer
1 0.0174 enqueue_task
1 0.0174 exit_mmap
1 0.0174 file_ra_state_init
1 0.0174 filesystems_read_proc
1 0.0174 find_vma_prev
1 0.0174 free_poll_entry
1 0.0174 generic_permission
1 0.0174 get_index
1 0.0174 get_signal_to_deliver
1 0.0174 get_vmalloc_info
1 0.0174 getname
1 0.0174 handle_signal
1 0.0174 hrtimer_try_to_cancel
1 0.0174 inet_csk_init_xmit_timers
1 0.0174 init_buffer_head
1 0.0174 inode_change_ok
1 0.0174 inode_init_once
1 0.0174 kbd_bh
1 0.0174 kmem_cache_zalloc
1 0.0174 kmem_getpages
1 0.0174 load_elf_binary
1 0.0174 locks_remove_posix
1 0.0174 memmove
1 0.0174 mempool_free
1 0.0174 mmput
1 0.0174 mutex_lock
1 0.0174 netlink_insert
1 0.0174 no_singlestep
1 0.0174 nr_blockdev_pages
1 0.0174 number
1 0.0174 open_namei
1 0.0174 page_add_new_anon_rmap
1 0.0174 path_release
1 0.0174 pipe_release
1 0.0174 poke_blanked_console
1 0.0174 proc_pid_readlink
1 0.0174 pty_unthrottle
1 0.0174 put_filp
1 0.0174 radix_tree_insert
1 0.0174 raise_softirq_irqoff
1 0.0174 rb_insert_color
1 0.0174 rcu_do_batch
1 0.0174 rcu_pending
1 0.0174 release_sock
1 0.0174 remove_vma
1 0.0174 restore_sigcontext
1 0.0174 search_binary_handler
1 0.0174 sk_wait_data
1 0.0174 skb_dequeue
1 0.0174 skb_queue_tail
1 0.0174 sock_aio_write
1 0.0174 sock_alloc_send_pskb
1 0.0174 sock_def_write_space
1 0.0174 sock_from_file
1 0.0174 sock_ioctl
1 0.0174 submit_bio
1 0.0174 sys_fcntl64
1 0.0174 sys_fstat64
1 0.0174 sys_rt_sigaction
1 0.0174 sys_rt_sigprocmask
1 0.0174 sys_send
1 0.0174 sys_sigreturn
1 0.0174 sys_socketcall
1 0.0174 sys_waitpid
1 0.0174 tcp_close
1 0.0174 tcp_data_queue
1 0.0174 tcp_fastretrans_alert
1 0.0174 tcp_grow_window
1 0.0174 tcp_mtu_probe
1 0.0174 tcp_v4_do_rcv
1 0.0174 tty_hung_up_p
1 0.0174 tty_insert_flip_string_flags
1 0.0174 tty_paranoia_check
1 0.0174 tty_wakeup
1 0.0174 tty_write
1 0.0174 unlock_buffer
1 0.0174 unmap_region
1 0.0174 update_process_times
1 0.0174 update_wall_time
1 0.0174 update_wall_time_one_tick
1 0.0174 vfs_ioctl
1 0.0174 vfs_permission
1 0.0174 vma_prio_tree_add
1 0.0174 wait_task_zombie
[-- Attachment #4: gso-on --]
[-- Type: text/plain, Size: 12774 bytes --]
CPU: PIII, speed 1200 MHz (estimated)
Counted CPU_CLK_UNHALTED events (clocks processor is not halted) with a unit mask of 0x00 (No unit mask) count 100000
samples % symbol name
1255 21.6865 csum_partial_copy_generic
398 6.8775 __copy_from_user_ll
343 5.9271 __alloc_skb
254 4.3891 prep_new_page
243 4.1991 skb_segment
110 1.9008 __kmalloc
106 1.8317 kfree
106 1.8317 timer_interrupt
105 1.8144 skb_copy_and_csum_bits
94 1.6243 net_rx_action
77 1.3306 kmem_cache_free
75 1.2960 tcp_v4_rcv
72 1.2442 kmem_cache_alloc
63 1.0886 page_fault
55 0.9504 mark_offset_pmtmr
54 0.9331 __kfree_skb
52 0.8986 skb_release_data
51 0.8813 csum_partial
50 0.8640 do_softirq
50 0.8640 inet_gso_segment
47 0.8122 get_offset_pmtmr
47 0.8122 irq_entries_start
45 0.7776 netif_receive_skb
43 0.7430 tcp_current_mss
41 0.7085 free_hot_cold_page
40 0.6912 tcp_clean_rtx_queue
36 0.6221 kfree_skbmem
35 0.6048 tcp_sendmsg
35 0.6048 tcp_write_xmit
34 0.5875 __do_softirq
31 0.5357 __do_IRQ
31 0.5357 ip_rcv
31 0.5357 tcp_rcv_established
30 0.5184 __pskb_trim_head
29 0.5011 system_call
28 0.4838 tcp_ack
28 0.4838 tcp_tso_segment
28 0.4838 tcp_tso_should_defer
27 0.4666 __copy_to_user_ll
26 0.4493 restore_nocheck
26 0.4493 rt_hash_code
25 0.4320 do_wp_page
24 0.4147 handle_IRQ_event
24 0.4147 schedule
23 0.3974 do_select
23 0.3974 tcp_tso_acked
22 0.3802 ip_local_deliver
22 0.3802 ip_route_input
21 0.3629 buffered_rmqueue
21 0.3629 free_block
20 0.3456 end_level_ioapic_irq
20 0.3456 tcp_init_tso_segs
19 0.3283 __netif_rx_schedule
19 0.3283 cache_alloc_refill
19 0.3283 dev_kfree_skb_any
18 0.3110 skb_split
17 0.2938 ret_from_intr
17 0.2938 tcp_mark_head_lost
16 0.2765 common_interrupt
16 0.2765 do_page_fault
16 0.2765 get_page_from_freelist
16 0.2765 slab_put_obj
15 0.2592 do_IRQ
15 0.2592 sock_poll
15 0.2592 zap_pte_range
14 0.2419 irq_exit
14 0.2419 tcp_trim_head
14 0.2419 tcp_v4_checksum_init
13 0.2246 __link_path_walk
13 0.2246 add_wait_queue
13 0.2246 delay_pmtmr
13 0.2246 tcp_rtt_estimator
12 0.2074 __skb_checksum_complete
12 0.2074 cpu_idle
12 0.2074 fget_light
12 0.2074 find_vma
12 0.2074 skb_checksum
12 0.2074 tcp_new_space
11 0.1901 copy_from_user
11 0.1901 put_page
11 0.1901 tcp_set_skb_tso_segs
10 0.1728 __d_lookup
10 0.1728 __switch_to
10 0.1728 error_code
10 0.1728 eth_type_trans
10 0.1728 i8042_interrupt
10 0.1728 skb_copy_bits
10 0.1728 tcp_transmit_skb
9 0.1555 dev_hard_start_xmit
9 0.1555 mod_page_state_offset
9 0.1555 strnlen_user
8 0.1382 __page_set_anon_rmap
8 0.1382 do_no_page
8 0.1382 ip_output
8 0.1382 resume_kernel
8 0.1382 skb_clone
8 0.1382 tcp_check_space
8 0.1382 tcp_xmit_retransmit_queue
7 0.1210 __mod_timer
7 0.1210 __tcp_push_pending_frames
7 0.1210 __tcp_select_window
7 0.1210 ip_queue_xmit
7 0.1210 mod_timer
7 0.1210 pipe_poll
7 0.1210 remove_wait_queue
7 0.1210 zone_watermark_ok
6 0.1037 __pagevec_lru_add_active
6 0.1037 core_sys_select
6 0.1037 do_pollfd
6 0.1037 find_get_page
6 0.1037 note_interrupt
6 0.1037 sk_stream_write_space
6 0.1037 skb_gso_segment
6 0.1037 sys_read
6 0.1037 tcp_ack_packets_out
6 0.1037 tcp_cong_avoid
6 0.1037 tcp_reno_cong_avoid
5 0.0864 __rmqueue
5 0.0864 __wake_up
5 0.0864 __wake_up_common
5 0.0864 dev_queue_xmit
5 0.0864 eth_header
5 0.0864 filemap_nopage
5 0.0864 fput
5 0.0864 free_pages_bulk
5 0.0864 internal_add_timer
5 0.0864 local_bh_enable
5 0.0864 lookup_bh_lru
5 0.0864 sys_socketcall
5 0.0864 syscall_exit
5 0.0864 tcp_mtu_probe
5 0.0864 tcp_v4_do_rcv
4 0.0691 __copy_user_intel
4 0.0691 __handle_mm_fault
4 0.0691 __mod_page_state_offset
4 0.0691 __page_cache_release
4 0.0691 __pollwait
4 0.0691 __qdisc_run
4 0.0691 adjtime_adjustment
4 0.0691 apic_timer_interrupt
4 0.0691 cond_resched
4 0.0691 hrtimer_run_queues
4 0.0691 kfree_skb
4 0.0691 lock_timer_base
4 0.0691 normal_poll
4 0.0691 opost_block
4 0.0691 pfifo_fast_enqueue
4 0.0691 preempt_schedule
4 0.0691 pskb_expand_head
4 0.0691 radix_tree_lookup
4 0.0691 resume_userspace
4 0.0691 sk_reset_timer
4 0.0691 skb_dequeue
4 0.0691 sys_send
4 0.0691 tcp_sacktag_write_queue
4 0.0691 tty_ldisc_try
4 0.0691 vfs_permission
3 0.0518 __alloc_pages
3 0.0518 __find_get_block
3 0.0518 __sk_dst_check
3 0.0518 anon_vma_prepare
3 0.0518 do_mmap_pgoff
3 0.0518 do_readv_writev
3 0.0518 do_sock_read
3 0.0518 dup_mm
3 0.0518 generic_permission
3 0.0518 hrtimer_get_softirq_time
3 0.0518 ing_filter
3 0.0518 lru_cache_add_active
3 0.0518 page_add_new_anon_rmap
3 0.0518 permission
3 0.0518 pfifo_fast_dequeue
3 0.0518 pty_chars_in_buffer
3 0.0518 raise_softirq
3 0.0518 rb_insert_color
3 0.0518 release_pages
3 0.0518 restore_all
3 0.0518 run_timer_softirq
3 0.0518 rw_verify_area
3 0.0518 slab_get_obj
3 0.0518 sock_wfree
3 0.0518 tcp_ack_saw_tstamp
3 0.0518 tcp_build_and_update_options
3 0.0518 tcp_event_data_sent
3 0.0518 tcp_should_expand_sndbuf
3 0.0518 tcp_v4_send_check
3 0.0518 tso_fragment
3 0.0518 unmap_vmas
3 0.0518 update_wall_time
3 0.0518 vsnprintf
2 0.0346 __rcu_pending
2 0.0346 _atomic_dec_and_lock
2 0.0346 account_system_time
2 0.0346 acct_update_integrals
2 0.0346 blk_recount_segments
2 0.0346 cache_flusharray
2 0.0346 cleanup_timers
2 0.0346 copy_pte_range
2 0.0346 cp_new_stat64
2 0.0346 current_fs_time
2 0.0346 d_instantiate
2 0.0346 default_wake_function
2 0.0346 dequeue_task
2 0.0346 dnotify_parent
2 0.0346 do_anonymous_page
2 0.0346 do_gettimeofday
2 0.0346 do_path_lookup
2 0.0346 do_setitimer
2 0.0346 do_sys_poll
2 0.0346 drain_array
2 0.0346 effective_prio
2 0.0346 find_next_zero_bit
2 0.0346 inode_init_once
2 0.0346 input_event
2 0.0346 max_select_fd
2 0.0346 memcpy
2 0.0346 memmove
2 0.0346 need_resched
2 0.0346 neigh_resolve_output
2 0.0346 no_singlestep
2 0.0346 notifier_call_chain
2 0.0346 page_remove_rmap
2 0.0346 poll_freewait
2 0.0346 prepare_to_wait
2 0.0346 recalc_task_prio
2 0.0346 rmqueue_bulk
2 0.0346 schedule_timeout
2 0.0346 scheduler_tick
2 0.0346 skb_queue_tail
2 0.0346 sock_aio_read
2 0.0346 sock_aio_write
2 0.0346 sock_from_file
2 0.0346 sock_sendmsg
2 0.0346 strncpy_from_user
2 0.0346 sys_gettimeofday
2 0.0346 sys_sendto
2 0.0346 tcp_cwnd_down
2 0.0346 tcp_data_queue
2 0.0346 tcp_fastretrans_alert
2 0.0346 tcp_parse_options
2 0.0346 tcp_push_one
2 0.0346 tcp_select_window
2 0.0346 tcp_snd_test
2 0.0346 transfer_objects
2 0.0346 try_to_wake_up
2 0.0346 tty_write
2 0.0346 unmap_page_range
2 0.0346 vfs_ioctl
2 0.0346 vma_adjust
1 0.0173 __bread
1 0.0173 __brelse
1 0.0173 __dentry_open
1 0.0173 __dequeue_signal
1 0.0173 __exit_signal
1 0.0173 __find_get_block_slow
1 0.0173 __group_complete_signal
1 0.0173 __insert_inode_hash
1 0.0173 __lookup_mnt
1 0.0173 __netif_schedule
1 0.0173 __pskb_pull_tail
1 0.0173 __pte_alloc
1 0.0173 __tasklet_schedule
1 0.0173 __wake_up_bit
1 0.0173 acct_process
1 0.0173 ack_edge_ioapic_irq
1 0.0173 acquire_console_sem
1 0.0173 activate_task
1 0.0173 alarm_setitimer
1 0.0173 alloc_new_pmd
1 0.0173 as_dispatch_request
1 0.0173 as_merged_request
1 0.0173 autoremove_wake_function
1 0.0173 bh_lru_install
1 0.0173 bit_waitqueue
1 0.0173 block_read_full_page
1 0.0173 cache_reap
1 0.0173 check_itimerval
1 0.0173 clear_user
1 0.0173 con_chars_in_buffer
1 0.0173 convert_fxsr_to_user
1 0.0173 copy_semundo
1 0.0173 copy_strings
1 0.0173 copy_to_user
1 0.0173 d_rehash
1 0.0173 deactivate_task
1 0.0173 dev_gso_segment
1 0.0173 do_fcntl
1 0.0173 do_generic_mapping_read
1 0.0173 do_lookup
1 0.0173 do_poll
1 0.0173 do_sigaction
1 0.0173 do_sync_read
1 0.0173 do_sys_open
1 0.0173 dummy_vm_enough_memory
1 0.0173 dup_fd
1 0.0173 enqueue_task
1 0.0173 exec_permission_lite
1 0.0173 file_ra_state_init
1 0.0173 file_update_time
1 0.0173 filp_close
1 0.0173 find_task_by_pid_type
1 0.0173 finish_wait
1 0.0173 flush_old_exec
1 0.0173 free_one_page
1 0.0173 free_page_and_swap_cache
1 0.0173 free_poll_entry
1 0.0173 free_uid
1 0.0173 get_empty_filp
1 0.0173 get_index
1 0.0173 get_signal_to_deliver
1 0.0173 get_task_mm
1 0.0173 get_vmalloc_info
1 0.0173 getname
1 0.0173 group_send_sig_info
1 0.0173 groups_search
1 0.0173 handle_signal
1 0.0173 hrtimer_try_to_cancel
1 0.0173 inode_setattr
1 0.0173 inode_sub_bytes
1 0.0173 inotify_dentry_parent_queue_event
1 0.0173 inotify_inode_queue_event
1 0.0173 kbd_keycode
1 0.0173 kmem_cache_zalloc
1 0.0173 kthread_should_stop
1 0.0173 locks_remove_flock
1 0.0173 lookup_create
1 0.0173 make_ahead_window
1 0.0173 mark_page_accessed
1 0.0173 math_state_restore
1 0.0173 may_expand_vm
1 0.0173 n_tty_receive_buf
1 0.0173 nameidata_to_filp
1 0.0173 opost
1 0.0173 page_waitqueue
1 0.0173 prio_tree_remove
1 0.0173 proc_file_read
1 0.0173 profile_munmap
1 0.0173 profile_tick
1 0.0173 put_io_context
1 0.0173 rb_next
1 0.0173 rcu_do_batch
1 0.0173 rcu_pending
1 0.0173 recalc_sigpending_tsk
1 0.0173 run_local_timers
1 0.0173 run_posix_cpu_timers
1 0.0173 save_i387
1 0.0173 sched_clock
1 0.0173 setup_frame
1 0.0173 signal_wake_up
1 0.0173 sk_stream_wait_memory
1 0.0173 skb_checksum_help
1 0.0173 slab_destroy
1 0.0173 smp_send_timer_broadcast_ipi
1 0.0173 sock_def_readable
1 0.0173 sock_ioctl
1 0.0173 sys_getpid
1 0.0173 sys_munmap
1 0.0173 syscall_call
1 0.0173 tcp_ack_update_window
1 0.0173 tcp_check_sack_reneging
1 0.0173 tcp_fast_parse_options
1 0.0173 tcp_fragment
1 0.0173 tcp_mtu_to_mss
1 0.0173 tcp_window_allows
1 0.0173 timespec_trunc
1 0.0173 tty_hung_up_p
1 0.0173 tty_ldisc_deref
1 0.0173 tty_poll
1 0.0173 unlink_file_vma
1 0.0173 vfs_getattr
1 0.0173 vfs_read
1 0.0173 vfs_write
1 0.0173 vm_normal_page
1 0.0173 vm_stat_account
1 0.0173 vma_prio_tree_add
1 0.0173 zone_statistics
* [1/5] [NET]: Merge TSO/UFO fields in sk_buff
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
@ 2006-06-22 8:12 ` Herbert Xu
2006-06-22 8:13 ` [2/5] [NET]: Add generic segmentation offload Herbert Xu
` (6 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:12 UTC
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 1331 bytes --]
Hi:
[NET]: Merge TSO/UFO fields in sk_buff
Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is not
going to scale if we add any more segmentation methods (e.g., DCCP). So
let's merge them.
They were also used to tell the protocol of a packet. That function has
been subsumed by the new gso_type field, which is essentially a set of
netdev feature bits (shifted down by 16 bits) that are required to process
a specific skb. As such it's easy to tell whether a given device can
process a GSO skb: you simply AND the gso_type field, shifted back up by
NETIF_F_GSO_SHIFT, with the netdev's features field.
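Concretely, that test is the netif_needs_gso() helper this patch adds to
include/linux/netdevice.h (quoted from the diff below):

static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
{
	int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;
	return skb_shinfo(skb)->gso_size &&
	       (dev->features & feature) != feature;
}

A packet needs software segmentation exactly when gso_size is non-zero and
its required feature bits are not a subset of the device's features.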
I've made gso_type a conjunction: every bit set in it must be matched by
the device's features. The idea is that you have a base type (e.g.,
SKB_GSO_TCPV4) that can be modified further to support new features. For
example, if we add a hardware TSO type that supports ECN, such hardware
would declare NETIF_F_TSO | NETIF_F_TSO_ECN. All TSO packets with CWR set
would have a gso_type of SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN, while all
other TSO packets would be SKB_GSO_TCPV4. This means that only the CWR
packets need to be emulated in software.
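A sketch of how those hypothetical bits would slot into the scheme
(SKB_GSO_TCPV4_ECN and NETIF_F_TSO_ECN do not exist in this series;
SKB_GSO_TCPV4, NETIF_F_TSO and NETIF_F_GSO_SHIFT are the real definitions
from the patch below):

/* Hypothetical ECN-capable TSO, following the pattern of this patch: */
#define SKB_GSO_TCPV4_ECN	(1 << 2)	/* hypothetical skb bit */
#define NETIF_F_TSO_ECN		(SKB_GSO_TCPV4_ECN << NETIF_F_GSO_SHIFT)

/* A NIC declaring NETIF_F_TSO | NETIF_F_TSO_ECN handles both kinds of
 * packet in hardware.  A NIC declaring only NETIF_F_TSO fails the
 * netif_needs_gso() subset test for packets whose gso_type is
 * SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN, so only those (CWR-marked) packets
 * fall back to software segmentation. */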
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: p1.patch --]
[-- Type: text/plain, Size: 25944 bytes --]
diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -792,7 +792,7 @@ static int cp_start_xmit (struct sk_buff
entry = cp->tx_head;
eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0;
if (dev->features & NETIF_F_TSO)
- mss = skb_shinfo(skb)->tso_size;
+ mss = skb_shinfo(skb)->gso_size;
if (skb_shinfo(skb)->nr_frags == 0) {
struct cp_desc *txd = &cp->tx_ring[entry];
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -1640,7 +1640,7 @@ bnx2_tx_int(struct bnx2 *bp)
skb = tx_buf->skb;
#ifdef BCM_TSO
/* partial BD completions possible with TSO packets */
- if (skb_shinfo(skb)->tso_size) {
+ if (skb_shinfo(skb)->gso_size) {
u16 last_idx, last_ring_idx;
last_idx = sw_cons +
@@ -4428,7 +4428,7 @@ bnx2_start_xmit(struct sk_buff *skb, str
(TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16));
}
#ifdef BCM_TSO
- if ((mss = skb_shinfo(skb)->tso_size) &&
+ if ((mss = skb_shinfo(skb)->gso_size) &&
(skb->len > (bp->dev->mtu + ETH_HLEN))) {
u32 tcp_opt_len, ip_tcp_len;
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, s
struct cpl_tx_pkt *cpl;
#ifdef NETIF_F_TSO
- if (skb_shinfo(skb)->tso_size) {
+ if (skb_shinfo(skb)->gso_size) {
int eth_type;
struct cpl_tx_pkt_lso *hdr;
@@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, s
hdr->ip_hdr_words = skb->nh.iph->ihl;
hdr->tcp_hdr_words = skb->h.th->doff;
hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type,
- skb_shinfo(skb)->tso_size));
+ skb_shinfo(skb)->gso_size));
hdr->len = htonl(skb->len - sizeof(*hdr));
cpl = (struct cpl_tx_pkt *)hdr;
sge->stats.tx_lso_pkts++;
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter,
uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
int err;
- if (skb_shinfo(skb)->tso_size) {
+ if (skb_shinfo(skb)->gso_size) {
if (skb_header_cloned(skb)) {
err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
if (err)
@@ -2402,7 +2402,7 @@ e1000_tso(struct e1000_adapter *adapter,
}
hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
- mss = skb_shinfo(skb)->tso_size;
+ mss = skb_shinfo(skb)->gso_size;
if (skb->protocol == htons(ETH_P_IP)) {
skb->nh.iph->tot_len = 0;
skb->nh.iph->check = 0;
@@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapt
* tso gets written back prematurely before the data is fully
* DMA'd to the controller */
if (!skb->data_len && tx_ring->last_tx_tso &&
- !skb_shinfo(skb)->tso_size) {
+ !skb_shinfo(skb)->gso_size) {
tx_ring->last_tx_tso = 0;
size -= 4;
}
@@ -2757,7 +2757,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
}
#ifdef NETIF_F_TSO
- mss = skb_shinfo(skb)->tso_size;
+ mss = skb_shinfo(skb)->gso_size;
/* The controller does a simple calculation to
* make sure there is enough room in the FIFO before
* initiating the DMA for each buffer. The calc is:
@@ -2807,7 +2807,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
#ifdef NETIF_F_TSO
/* Controller Erratum workaround */
if (!skb->data_len && tx_ring->last_tx_tso &&
- !skb_shinfo(skb)->tso_size)
+ !skb_shinfo(skb)->gso_size)
count++;
#endif
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -1495,8 +1495,8 @@ static int nv_start_xmit(struct sk_buff
np->tx_skbuff[nr] = skb;
#ifdef NETIF_F_TSO
- if (skb_shinfo(skb)->tso_size)
- tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT);
+ if (skb_shinfo(skb)->gso_size)
+ tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
else
#endif
tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0);
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, s
uint16_t ipcse, tucse, mss;
int err;
- if(likely(skb_shinfo(skb)->tso_size)) {
+ if(likely(skb_shinfo(skb)->gso_size)) {
if (skb_header_cloned(skb)) {
err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
if (err)
@@ -1181,7 +1181,7 @@ ixgb_tso(struct ixgb_adapter *adapter, s
}
hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
- mss = skb_shinfo(skb)->tso_size;
+ mss = skb_shinfo(skb)->gso_size;
skb->nh.iph->tot_len = 0;
skb->nh.iph->check = 0;
skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr,
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -74,7 +74,7 @@ static void emulate_large_send_offload(s
struct iphdr *iph = skb->nh.iph;
struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4));
unsigned int doffset = (iph->ihl + th->doff) * 4;
- unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
+ unsigned int mtu = skb_shinfo(skb)->gso_size + doffset;
unsigned int offset = 0;
u32 seq = ntohl(th->seq);
u16 id = ntohs(iph->id);
@@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff
#endif
#ifdef LOOPBACK_TSO
- if (skb_shinfo(skb)->tso_size) {
+ if (skb_shinfo(skb)->gso_size) {
BUG_ON(skb->protocol != htons(ETH_P_IP));
BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP);
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1879,7 +1879,7 @@ again:
#ifdef NETIF_F_TSO
if (skb->len > (dev->mtu + ETH_HLEN)) {
- mss = skb_shinfo(skb)->tso_size;
+ mss = skb_shinfo(skb)->gso_size;
if (mss != 0)
max_segments = MYRI10GE_MAX_SEND_DESC_TSO;
}
@@ -2113,7 +2113,7 @@ abort_linearize:
}
idx = (idx + 1) & tx->mask;
} while (idx != last_idx);
- if (skb_shinfo(skb)->tso_size) {
+ if (skb_shinfo(skb)->gso_size) {
printk(KERN_ERR
"myri10ge: %s: TSO but wanted to linearize?!?!?\n",
mgp->dev->name);
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2172,7 +2172,7 @@ static int rtl8169_xmit_frags(struct rtl
static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
{
if (dev->features & NETIF_F_TSO) {
- u32 mss = skb_shinfo(skb)->tso_size;
+ u32 mss = skb_shinfo(skb)->gso_size;
if (mss)
return LargeSend | ((mss & MSSMask) << MSSShift);
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -3915,8 +3915,8 @@ static int s2io_xmit(struct sk_buff *skb
txdp->Control_1 = 0;
txdp->Control_2 = 0;
#ifdef NETIF_F_TSO
- mss = skb_shinfo(skb)->tso_size;
- if (mss) {
+ mss = skb_shinfo(skb)->gso_size;
+ if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) {
txdp->Control_1 |= TXD_TCP_LSO_EN;
txdp->Control_1 |= TXD_TCP_LSO_MSS(mss);
}
@@ -3936,10 +3936,10 @@ static int s2io_xmit(struct sk_buff *skb
}
frg_len = skb->len - skb->data_len;
- if (skb_shinfo(skb)->ufo_size) {
+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) {
int ufo_size;
- ufo_size = skb_shinfo(skb)->ufo_size;
+ ufo_size = skb_shinfo(skb)->gso_size;
ufo_size &= ~7;
txdp->Control_1 |= TXD_UFO_EN;
txdp->Control_1 |= TXD_UFO_MSS(ufo_size);
@@ -3965,7 +3965,7 @@ static int s2io_xmit(struct sk_buff *skb
txdp->Host_Control = (unsigned long) skb;
txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len);
- if (skb_shinfo(skb)->ufo_size)
+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
txdp->Control_1 |= TXD_UFO_EN;
frg_cnt = skb_shinfo(skb)->nr_frags;
@@ -3980,12 +3980,12 @@ static int s2io_xmit(struct sk_buff *skb
(sp->pdev, frag->page, frag->page_offset,
frag->size, PCI_DMA_TODEVICE);
txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size);
- if (skb_shinfo(skb)->ufo_size)
+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
txdp->Control_1 |= TXD_UFO_EN;
}
txdp->Control_1 |= TXD_GATHER_CODE_LAST;
- if (skb_shinfo(skb)->ufo_size)
+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
frg_cnt++; /* as Txd0 was used for inband header */
tx_fifo = mac_control->tx_FIFO_start[queue];
@@ -3999,7 +3999,7 @@ static int s2io_xmit(struct sk_buff *skb
if (mss)
val64 |= TX_FIFO_SPECIAL_FUNC;
#endif
- if (skb_shinfo(skb)->ufo_size)
+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
val64 |= TX_FIFO_SPECIAL_FUNC;
writeq(val64, &tx_fifo->List_Control);
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct s
count = sizeof(dma_addr_t) / sizeof(u32);
count += skb_shinfo(skb)->nr_frags * count;
- if (skb_shinfo(skb)->tso_size)
+ if (skb_shinfo(skb)->gso_size)
++count;
if (skb->ip_summed == CHECKSUM_HW)
@@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buf
}
/* Check for TCP Segmentation Offload */
- mss = skb_shinfo(skb)->tso_size;
+ mss = skb_shinfo(skb)->gso_size;
if (mss != 0) {
/* just drop the packet if non-linear expansion fails */
if (skb_header_cloned(skb) &&
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3780,7 +3780,7 @@ static int tg3_start_xmit(struct sk_buff
#if TG3_TSO_SUPPORT != 0
mss = 0;
if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
- (mss = skb_shinfo(skb)->tso_size) != 0) {
+ (mss = skb_shinfo(skb)->gso_size) != 0) {
int tcp_opt_len, ip_tcp_len;
if (skb_header_cloned(skb) &&
@@ -3905,7 +3905,7 @@ static int tg3_start_xmit_dma_bug(struct
#if TG3_TSO_SUPPORT != 0
mss = 0;
if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
- (mss = skb_shinfo(skb)->tso_size) != 0) {
+ (mss = skb_shinfo(skb)->gso_size) != 0) {
int tcp_opt_len, ip_tcp_len;
if (skb_header_cloned(skb) &&
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -340,7 +340,7 @@ enum state_values {
#endif
#if defined(NETIF_F_TSO)
-#define skb_tso_size(x) (skb_shinfo(x)->tso_size)
+#define skb_tso_size(x) (skb_shinfo(x)->gso_size)
#define TSO_NUM_DESCRIPTORS 2
#define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT
#else
diff --git a/drivers/s390/net/qeth_eddp.c b/drivers/s390/net/qeth_eddp.c
--- a/drivers/s390/net/qeth_eddp.c
+++ b/drivers/s390/net/qeth_eddp.c
@@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth
}
tcph = eddp->skb->h.th;
while (eddp->skb_offset < eddp->skb->len) {
- data_len = min((int)skb_shinfo(eddp->skb)->tso_size,
+ data_len = min((int)skb_shinfo(eddp->skb)->gso_size,
(int)(eddp->skb->len - eddp->skb_offset));
/* prepare qdio hdr */
if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){
@@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_edd
QETH_DBF_TEXT(trace, 5, "eddpcanp");
/* can we put multiple skbs in one page? */
- skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len);
+ skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len);
if (skbs_per_page > 1){
- ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) /
+ ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) /
skbs_per_page + 1;
ctx->elements_per_skb = 1;
} else {
/* no -> how many elements per skb? */
- ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len +
+ ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len +
PAGE_SIZE) >> PAGE_SHIFT;
ctx->num_pages = ctx->elements_per_skb *
- (skb_shinfo(skb)->tso_segs + 1);
+ (skb_shinfo(skb)->gso_segs + 1);
}
ctx->num_elements = ctx->elements_per_skb *
- (skb_shinfo(skb)->tso_segs + 1);
+ (skb_shinfo(skb)->gso_segs + 1);
}
static inline struct qeth_eddp_context *
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -4417,7 +4417,7 @@ qeth_send_packet(struct qeth_card *card,
struct qeth_eddp_context *ctx = NULL;
int tx_bytes = skb->len;
unsigned short nr_frags = skb_shinfo(skb)->nr_frags;
- unsigned short tso_size = skb_shinfo(skb)->tso_size;
+ unsigned short tso_size = skb_shinfo(skb)->gso_size;
int rc;
QETH_DBF_TEXT(trace, 6, "sendpkt");
@@ -4453,7 +4453,7 @@ qeth_send_packet(struct qeth_card *card,
queue = card->qdio.out_qs
[qeth_get_priority_queue(card, skb, ipv, cast_type)];
- if (skb_shinfo(skb)->tso_size)
+ if (skb_shinfo(skb)->gso_size)
large_send = card->options.large_send;
/*are we able to do TSO ? If so ,prepare and send it from here */
diff --git a/drivers/s390/net/qeth_tso.h b/drivers/s390/net/qeth_tso.h
--- a/drivers/s390/net/qeth_tso.h
+++ b/drivers/s390/net/qeth_tso.h
@@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *c
hdr->ext.hdr_version = 1;
hdr->ext.hdr_len = 28;
/*insert non-fix values */
- hdr->ext.mss = skb_shinfo(skb)->tso_size;
+ hdr->ext.mss = skb_shinfo(skb)->gso_size;
hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4);
hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len -
sizeof(struct qeth_hdr_tso));
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -309,9 +309,12 @@ struct net_device
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
-#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
#define NETIF_F_LLTX 4096 /* LockLess TX */
-#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/
+
+ /* Segmentation offload features */
+#define NETIF_F_GSO_SHIFT 16
+#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
+#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
@@ -980,6 +983,13 @@ extern void dev_seq_stop(struct seq_file
extern void linkwatch_run_queue(void);
+static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
+{
+ int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;
+ return skb_shinfo(skb)->gso_size &&
+ (dev->features & feature) != feature;
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_DEV_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -135,9 +135,10 @@ struct skb_frag_struct {
struct skb_shared_info {
atomic_t dataref;
unsigned short nr_frags;
- unsigned short tso_size;
- unsigned short tso_segs;
- unsigned short ufo_size;
+ unsigned short gso_size;
+ /* Warning: this field is not always filled in (UFO)! */
+ unsigned short gso_segs;
+ unsigned short gso_type;
unsigned int ip6_frag_id;
struct sk_buff *frag_list;
skb_frag_t frags[MAX_SKB_FRAGS];
@@ -169,6 +170,11 @@ enum {
SKB_FCLONE_CLONE,
};
+enum {
+ SKB_GSO_TCPV4 = 1 << 0,
+ SKB_GSO_UDPV4 = 1 << 1,
+};
+
/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -570,13 +570,13 @@ struct tcp_skb_cb {
*/
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
- return skb_shinfo(skb)->tso_segs;
+ return skb_shinfo(skb)->gso_segs;
}
/* This is valid iff tcp_skb_pcount() > 1. */
static inline int tcp_skb_mss(const struct sk_buff *skb)
{
- return skb_shinfo(skb)->tso_size;
+ return skb_shinfo(skb)->gso_size;
}
static inline void tcp_dec_pcount_approx(__u32 *count,
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -34,8 +34,8 @@ static inline unsigned packet_length(con
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
- /* drop mtu oversized packets except tso */
- if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
+ /* drop mtu oversized packets except gso */
+ if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->gso_size)
kfree_skb(skb);
else {
#ifdef CONFIG_BRIDGE_NETFILTER
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -761,7 +761,7 @@ static int br_nf_dev_queue_xmit(struct s
{
if (skb->protocol == htons(ETH_P_IP) &&
skb->len > skb->dev->mtu &&
- !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
+ !skb_shinfo(skb)->gso_size)
return ip_fragment(skb, br_dev_queue_push_xmit);
else
return br_dev_queue_push_xmit(skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
shinfo->nr_frags = 0;
- shinfo->tso_size = 0;
- shinfo->tso_segs = 0;
- shinfo->ufo_size = 0;
+ shinfo->gso_size = 0;
+ shinfo->gso_segs = 0;
+ shinfo->gso_type = 0;
shinfo->ip6_frag_id = 0;
shinfo->frag_list = NULL;
@@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kme
atomic_set(&(skb_shinfo(skb)->dataref), 1);
skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->tso_size = 0;
- skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_segs = 0;
+ skb_shinfo(skb)->gso_type = 0;
skb_shinfo(skb)->frag_list = NULL;
out:
return skb;
@@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_bu
#endif
skb_copy_secmark(new, old);
atomic_set(&new->users, 1);
- skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
- skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
/**
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -210,8 +210,7 @@ static inline int ip_finish_output(struc
return dst_output(skb);
}
#endif
- if (skb->len > dst_mtu(skb->dst) &&
- !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
+ if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size)
return ip_fragment(skb, ip_finish_output2);
else
return ip_finish_output2(skb);
@@ -362,7 +361,7 @@ packet_routed:
}
ip_select_ident_more(iph, &rt->u.dst, sk,
- (skb_shinfo(skb)->tso_segs ?: 1) - 1);
+ (skb_shinfo(skb)->gso_segs ?: 1) - 1);
/* Add an IP checksum. */
ip_send_check(iph);
@@ -744,7 +743,8 @@ static inline int ip_ufo_append_data(str
(length - transhdrlen));
if (!err) {
/* specify the length of each IP datagram fragment*/
- skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
__skb_queue_tail(&sk->sk_write_queue, skb);
return 0;
@@ -1087,14 +1087,16 @@ ssize_t ip_append_page(struct sock *sk,
inet->cork.length += size;
if ((sk->sk_protocol == IPPROTO_UDP) &&
- (rt->u.dst.dev->features & NETIF_F_UFO))
- skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+ (rt->u.dst.dev->features & NETIF_F_UFO)) {
+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
+ }
while (size > 0) {
int i;
- if (skb_shinfo(skb)->ufo_size)
+ if (skb_shinfo(skb)->gso_size)
len = size;
else {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -571,7 +571,7 @@ new_segment:
skb->ip_summed = CHECKSUM_HW;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->gso_segs = 0;
if (!copied)
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
@@ -818,7 +818,7 @@ new_segment:
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1073,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk,
else
pkt_len = (end_seq -
TCP_SKB_CB(skb)->seq);
- if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+ if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
break;
pcount = tcp_skb_pcount(skb);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -515,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
} else {
unsigned int factor;
factor = skb->len + (mss_now - 1);
factor /= mss_now;
- skb_shinfo(skb)->tso_segs = factor;
- skb_shinfo(skb)->tso_size = mss_now;
+ skb_shinfo(skb)->gso_segs = factor;
+ skb_shinfo(skb)->gso_size = mss_now;
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
}
}
@@ -914,7 +916,7 @@ static int tcp_init_tso_segs(struct sock
if (!tso_segs ||
(tso_segs > 1 &&
- skb_shinfo(skb)->tso_size != mss_now)) {
+ tcp_skb_mss(skb) != mss_now)) {
tcp_set_skb_tso_segs(sk, skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
@@ -1724,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk,
tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
if (!pskb_trim(skb, 0)) {
TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
skb->ip_summed = CHECKSUM_NONE;
skb->csum = 0;
}
@@ -1930,8 +1933,9 @@ void tcp_send_fin(struct sock *sk)
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
TCP_SKB_CB(skb)->seq = tp->write_seq;
@@ -1963,8 +1967,9 @@ void tcp_send_active_reset(struct sock *
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
/* Send it off. */
TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
@@ -2047,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct
TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
th->seq = htonl(TCP_SKB_CB(skb)->seq);
th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
@@ -2152,8 +2158,9 @@ int tcp_connect(struct sock *sk)
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
TCP_ECN_send_syn(sk, tp, buff);
TCP_SKB_CB(buff)->sacked = 0;
- skb_shinfo(buff)->tso_segs = 1;
- skb_shinfo(buff)->tso_size = 0;
+ skb_shinfo(buff)->gso_segs = 1;
+ skb_shinfo(buff)->gso_size = 0;
+ skb_shinfo(buff)->gso_type = 0;
buff->csum = 0;
TCP_SKB_CB(buff)->seq = tp->write_seq++;
TCP_SKB_CB(buff)->end_seq = tp->write_seq;
@@ -2257,8 +2264,9 @@ void tcp_send_ack(struct sock *sk)
buff->csum = 0;
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(buff)->sacked = 0;
- skb_shinfo(buff)->tso_segs = 1;
- skb_shinfo(buff)->tso_size = 0;
+ skb_shinfo(buff)->gso_segs = 1;
+ skb_shinfo(buff)->gso_size = 0;
+ skb_shinfo(buff)->gso_type = 0;
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
@@ -2293,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct soc
skb->csum = 0;
TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(skb)->sacked = urgent;
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -148,7 +148,7 @@ static int ip6_output2(struct sk_buff *s
int ip6_output(struct sk_buff *skb)
{
- if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
+ if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) ||
dst_allfrag(skb->dst))
return ip6_fragment(skb, ip6_output2);
else
@@ -833,8 +833,9 @@ static inline int ip6_ufo_append_data(st
struct frag_hdr fhdr;
/* specify the length of each IP datagram fragment*/
- skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
- sizeof(struct frag_hdr);
+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
+ sizeof(struct frag_hdr);
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
ipv6_select_ident(skb, &fhdr);
skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
__skb_queue_tail(&sk->sk_write_queue, skb);
* [2/5] [NET]: Add generic segmentation offload
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
2006-06-22 8:12 ` [1/5] [NET]: Merge TSO/UFO fields in sk_buff Herbert Xu
@ 2006-06-22 8:13 ` Herbert Xu
2006-06-22 8:14 ` [3/5] [NET]: Add software TSOv4 Herbert Xu
` (5 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:13 UTC (permalink / raw)
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 716 bytes --]
Hi:
[NET]: Add generic segmentation offload
This patch adds the infrastructure for generic segmentation offload.
The idea is to tap into the potential savings of TSO without hardware
support by postponing the allocation of segmented skb's until just
before the entry point into the NIC driver.
The same structure can be used to support software IPv6 TSO, as well as
UFO and segmentation offload for other relevant protocols, e.g., DCCP.
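As a rough illustration (not part of the patch itself), the deferred path
at the driver boundary looks like the sketch below; only the gso_size
field and the skb_gso_segment() helper from this series are assumed:

	/* Sketch: one oversized skb travels the whole stack and is cut
	 * into MSS-sized pieces only at the last moment, so the qdisc
	 * and IP layers run once per super-packet instead of once per
	 * segment.  Error handling is elided.
	 */
	static int xmit_deferred(struct sk_buff *skb, struct net_device *dev)
	{
		struct sk_buff *segs;

		if (!skb_shinfo(skb)->gso_size)
			return dev->hard_start_xmit(skb, dev);

		segs = skb_gso_segment(skb, dev->features & NETIF_F_SG);
		if (IS_ERR(segs))
			return PTR_ERR(segs);
		kfree_skb(skb);

		while (segs) {
			struct sk_buff *nskb = segs;

			segs = segs->next;
			nskb->next = NULL;
			dev->hard_start_xmit(nskb, dev);
		}
		return 0;
	}

The real dev_hard_start_xmit() in the patch keeps the segment list
hanging off skb->next instead, so that a driver rejection can requeue
the remaining segments rather than dropping them.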
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: p2.patch --]
[-- Type: text/plain, Size: 7530 bytes --]
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -406,6 +406,9 @@ struct net_device
struct list_head qdisc_list;
unsigned long tx_queue_len; /* Max frames per queue allowed */
+ /* Partially transmitted GSO packet. */
+ struct sk_buff *gso_skb;
+
/* ingress path synchronizer */
spinlock_t ingress_lock;
struct Qdisc *qdisc_ingress;
@@ -540,6 +543,7 @@ struct packet_type {
struct net_device *,
struct packet_type *,
struct net_device *);
+ struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg);
void *af_packet_priv;
struct list_head list;
};
@@ -690,7 +694,8 @@ extern int dev_change_name(struct net_d
extern int dev_set_mtu(struct net_device *, int);
extern int dev_set_mac_address(struct net_device *,
struct sockaddr *);
-extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+extern int dev_hard_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
extern void dev_init(void);
@@ -964,6 +969,7 @@ extern int netdev_max_backlog;
extern int weight_p;
extern int netdev_set_master(struct net_device *dev, struct net_device *master);
extern int skb_checksum_help(struct sk_buff *skb, int inward);
+extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg);
#ifdef CONFIG_BUG
extern void netdev_rx_csum_fault(struct net_device *dev);
#else
diff --git a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -116,6 +116,7 @@
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
+#include <linux/err.h>
/*
* The list of packet types we will receive (as opposed to discard)
@@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct
* taps currently in use.
*/
-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
@@ -1186,6 +1187,40 @@ out:
return ret;
}
+/**
+ * skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @sg: whether scatter-gather is supported on the target.
+ *
+ * This function segments the given skb and returns a list of segments.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_type *ptype;
+ int type = skb->protocol;
+
+ BUG_ON(skb_shinfo(skb)->frag_list);
+ BUG_ON(skb->ip_summed != CHECKSUM_HW);
+
+ skb->mac.raw = skb->data;
+ skb->mac_len = skb->nh.raw - skb->data;
+ __skb_pull(skb, skb->mac_len);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
+ if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+ segs = ptype->gso_segment(skb, sg);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return segs;
+}
+
+EXPORT_SYMBOL(skb_gso_segment);
+
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
@@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct
#define illegal_highdma(dev, skb) (0)
#endif
+struct dev_gso_cb {
+ void (*destructor)(struct sk_buff *skb);
+};
+
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+ struct dev_gso_cb *cb;
+
+ do {
+ struct sk_buff *nskb = skb->next;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+ kfree_skb(nskb);
+ } while (skb->next);
+
+ cb = DEV_GSO_CB(skb);
+ if (cb->destructor)
+ cb->destructor(skb);
+}
+
+/**
+ * dev_gso_segment - Perform emulated hardware segmentation on skb.
+ * @skb: buffer to segment
+ *
+ * This function segments the given skb and stores the list of segments
+ * in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, dev->features & NETIF_F_SG &&
+ !illegal_highdma(dev, skb));
+ if (unlikely(IS_ERR(segs)))
+ return PTR_ERR(segs);
+
+ skb->next = segs;
+ DEV_GSO_CB(skb)->destructor = skb->destructor;
+ skb->destructor = dev_gso_skb_destructor;
+
+ return 0;
+}
+
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ if (likely(!skb->next)) {
+ if (netdev_nit)
+ dev_queue_xmit_nit(skb, dev);
+
+ if (!netif_needs_gso(dev, skb))
+ return dev->hard_start_xmit(skb, dev);
+
+ if (unlikely(dev_gso_segment(skb)))
+ goto out_kfree_skb;
+ }
+
+ do {
+ struct sk_buff *nskb = skb->next;
+ int rc;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+ rc = dev->hard_start_xmit(nskb, dev);
+ if (unlikely(rc)) {
+ skb->next = nskb;
+ return rc;
+ }
+ } while (skb->next);
+
+ skb->destructor = DEV_GSO_CB(skb)->destructor;
+
+out_kfree_skb:
+ kfree_skb(skb);
+ return 0;
+}
+
#define HARD_TX_LOCK(dev, cpu) { \
if ((dev->features & NETIF_F_LLTX) == 0) { \
netif_tx_lock(dev); \
@@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb)
struct Qdisc *q;
int rc = -ENOMEM;
+ /* GSO will handle the following emulations directly. */
+ if (netif_needs_gso(dev, skb))
+ goto gso;
+
if (skb_shinfo(skb)->frag_list &&
!(dev->features & NETIF_F_FRAGLIST) &&
__skb_linearize(skb))
@@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb)
if (skb_checksum_help(skb, 0))
goto out_kfree_skb;
+gso:
spin_lock_prefetch(&dev->queue_lock);
/* Disable soft irqs for various locks below. Also
@@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb)
HARD_TX_LOCK(dev, cpu);
if (!netif_queue_stopped(dev)) {
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
-
rc = 0;
- if (!dev->hard_start_xmit(skb, dev)) {
+ if (!dev_hard_start_xmit(skb, dev)) {
HARD_TX_UNLOCK(dev);
goto out;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -96,8 +96,11 @@ static inline int qdisc_restart(struct n
struct sk_buff *skb;
/* Dequeue packet */
- if ((skb = q->dequeue(q)) != NULL) {
+ if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
unsigned nolock = (dev->features & NETIF_F_LLTX);
+
+ dev->gso_skb = NULL;
+
/*
* When the driver has LLTX set it does its own locking
* in start_xmit. No need to add additional overhead by
@@ -134,10 +137,8 @@ static inline int qdisc_restart(struct n
if (!netif_queue_stopped(dev)) {
int ret;
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
- ret = dev->hard_start_xmit(skb, dev);
+ ret = dev_hard_start_xmit(skb, dev);
if (ret == NETDEV_TX_OK) {
if (!nolock) {
netif_tx_unlock(dev);
@@ -171,7 +172,10 @@ static inline int qdisc_restart(struct n
*/
requeue:
- q->ops->requeue(skb, q);
+ if (skb->next)
+ dev->gso_skb = skb;
+ else
+ q->ops->requeue(skb, q);
netif_schedule(dev);
return 1;
}
@@ -576,6 +580,7 @@ void dev_activate(struct net_device *dev
void dev_deactivate(struct net_device *dev)
{
struct Qdisc *qdisc;
+ struct sk_buff *skb;
spin_lock_bh(&dev->queue_lock);
qdisc = dev->qdisc;
@@ -593,6 +598,11 @@ void dev_deactivate(struct net_device *d
/* Wait for outstanding qdisc_run calls. */
while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
yield();
+
+ if (dev->gso_skb) {
+ kfree_skb(dev->gso_skb);
+ dev->gso_skb = NULL;
+ }
}
void dev_init_scheduler(struct net_device *dev)
* [3/5] [NET]: Add software TSOv4
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
2006-06-22 8:12 ` [1/5] [NET]: Merge TSO/UFO fields in sk_buff Herbert Xu
2006-06-22 8:13 ` [2/5] [NET]: Add generic segmentation offload Herbert Xu
@ 2006-06-22 8:14 ` Herbert Xu
2006-06-22 8:23 ` Herbert Xu
` (2 more replies)
2006-06-22 8:14 ` [4/5] [NET]: Added GSO toggle Herbert Xu
` (4 subsequent siblings)
7 siblings, 3 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:14 UTC (permalink / raw)
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 361 bytes --]
Hi:
[NET]: Add software TSOv4
This patch adds the GSO implementation for IPv4 TCP.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: p3.patch --]
[-- Type: text/plain, Size: 7982 bytes --]
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1299,6 +1299,7 @@ extern void skb_split(struct sk_b
struct sk_buff *skb1, const u32 len);
extern void skb_release_data(struct sk_buff *skb);
+extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg);
static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
int len, void *buffer)
diff --git a/include/net/protocol.h b/include/net/protocol.h
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,6 +37,7 @@
struct net_protocol {
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
+ struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg);
int no_policy;
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1087,6 +1087,8 @@ extern struct request_sock_ops tcp_reque
extern int tcp_v4_destroy_sock(struct sock *sk);
+extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg);
+
#ifdef CONFIG_PROC_FS
extern int tcp4_proc_init(void);
extern void tcp4_proc_exit(void);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1826,6 +1826,132 @@ unsigned char *skb_pull_rcsum(struct sk_
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+/**
+ * skb_segment - Perform protocol segmentation on skb.
+ * @skb: buffer to segment
+ * @sg: whether scatter-gather can be used for generated segments
+ *
+ * This function performs segmentation on the given skb. It returns
+ * a pointer to the first in a list of new skbs for the segments.
+ * In case of error it returns ERR_PTR(err).
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = NULL;
+ struct sk_buff *tail = NULL;
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+ unsigned int doffset = skb->data - skb->mac.raw;
+ unsigned int offset = doffset;
+ unsigned int headroom;
+ unsigned int len;
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ int err = -ENOMEM;
+ int i = 0;
+ int pos;
+
+ __skb_push(skb, doffset);
+ headroom = skb_headroom(skb);
+ pos = skb_headlen(skb);
+
+ do {
+ struct sk_buff *nskb;
+ skb_frag_t *frag;
+ int hsize, nsize;
+ int k;
+ int size;
+
+ len = skb->len - offset;
+ if (len > mss)
+ len = mss;
+
+ hsize = skb_headlen(skb) - offset;
+ if (hsize < 0)
+ hsize = 0;
+ nsize = hsize + doffset;
+ if (nsize > len + doffset || !sg)
+ nsize = len + doffset;
+
+ nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
+ if (unlikely(!nskb))
+ goto err;
+
+ if (segs)
+ tail->next = nskb;
+ else
+ segs = nskb;
+ tail = nskb;
+
+ nskb->dev = skb->dev;
+ nskb->priority = skb->priority;
+ nskb->protocol = skb->protocol;
+ nskb->dst = dst_clone(skb->dst);
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ nskb->pkt_type = skb->pkt_type;
+ nskb->mac_len = skb->mac_len;
+
+ skb_reserve(nskb, headroom);
+ nskb->mac.raw = nskb->data;
+ nskb->nh.raw = nskb->data + skb->mac_len;
+ nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
+ memcpy(skb_put(nskb, doffset), skb->data, doffset);
+
+ if (!sg) {
+ nskb->csum = skb_copy_and_csum_bits(skb, offset,
+ skb_put(nskb, len),
+ len, 0);
+ continue;
+ }
+
+ frag = skb_shinfo(nskb)->frags;
+ k = 0;
+
+ nskb->ip_summed = CHECKSUM_HW;
+ nskb->csum = skb->csum;
+ memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ *frag = skb_shinfo(skb)->frags[i];
+ get_page(frag->page);
+ size = frag->size;
+
+ if (pos < offset) {
+ frag->page_offset += offset - pos;
+ frag->size -= offset - pos;
+ }
+
+ k++;
+
+ if (pos + size <= offset + len) {
+ i++;
+ pos += size;
+ } else {
+ frag->size -= pos + size - (offset + len);
+ break;
+ }
+
+ frag++;
+ }
+
+ skb_shinfo(nskb)->nr_frags = k;
+ nskb->data_len = len - hsize;
+ nskb->len += nskb->data_len;
+ nskb->truesize += nskb->data_len;
+ } while ((offset += len) < skb->len);
+
+ return segs;
+
+err:
+ while ((skb = segs)) {
+ segs = skb->next;
+ kfree_skb(skb);
+ }
+ return ERR_PTR(err);
+}
+
+EXPORT_SYMBOL_GPL(skb_segment);
+
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -68,6 +68,7 @@
*/
#include <linux/config.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *
EXPORT_SYMBOL(inet_sk_rebuild_header);
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct iphdr *iph;
+ struct net_protocol *ops;
+ int proto;
+ int ihl;
+ int id;
+
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ goto out;
+
+ iph = skb->nh.iph;
+ ihl = iph->ihl * 4;
+ if (ihl < sizeof(*iph))
+ goto out;
+
+ if (!pskb_may_pull(skb, ihl))
+ goto out;
+
+ skb->h.raw = __skb_pull(skb, ihl);
+ iph = skb->nh.iph;
+ id = ntohs(iph->id);
+ proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_protos[proto]);
+ if (ops && ops->gso_segment)
+ segs = ops->gso_segment(skb, sg);
+ rcu_read_unlock();
+
+ if (IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ do {
+ iph = skb->nh.iph;
+ iph->id = htons(id++);
+ iph->tot_len = htons(skb->len - skb->mac_len);
+ iph->check = 0;
+ iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
+ } while ((skb = skb->next));
+
+out:
+ return segs;
+}
+
#ifdef CONFIG_IP_MULTICAST
static struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol
static struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
+ .gso_segment = tcp_tso_segment,
.no_policy = 1,
};
@@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void);
static struct packet_type ip_packet_type = {
.type = __constant_htons(ETH_P_IP),
.func = ip_rcv,
+ .gso_segment = inet_gso_segment,
};
static int __init inet_init(void)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -258,6 +258,7 @@
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/cache.h>
+#include <linux/err.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *s
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct tcphdr *th;
+ unsigned thlen;
+ unsigned int seq;
+ unsigned int delta;
+ unsigned int oldlen;
+ unsigned int len;
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ goto out;
+
+ th = skb->h.th;
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ oldlen = ~htonl(skb->len);
+ __skb_pull(skb, thlen);
+
+ segs = skb_segment(skb, sg);
+ if (IS_ERR(segs))
+ goto out;
+
+ len = skb_shinfo(skb)->gso_size;
+ delta = csum_add(oldlen, htonl(thlen + len));
+
+ skb = segs;
+ th = skb->h.th;
+ seq = ntohl(th->seq);
+
+ do {
+ th->fin = th->psh = 0;
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ th->check = csum_fold(csum_partial(
+ skb->h.raw, thlen, csum_add(skb->csum, delta)));
+ }
+
+ seq += len;
+ skb = skb->next;
+ th = skb->h.th;
+
+ th->seq = htonl(seq);
+ th->cwr = 0;
+ } while (skb->next);
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
+ th->check = csum_fold(csum_partial(
+ skb->h.raw, thlen, csum_add(skb->csum, delta)));
+ }
+
+out:
+ return segs;
+}
+
extern void __skb_cb_too_small_for_tcp(int, int);
extern struct tcp_congestion_ops tcp_reno;
* [4/5] [NET]: Added GSO toggle
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
` (2 preceding siblings ...)
2006-06-22 8:14 ` [3/5] [NET]: Add software TSOv4 Herbert Xu
@ 2006-06-22 8:14 ` Herbert Xu
2006-06-22 8:14 ` [5/5] [IPSEC]: Handle GSO packets Herbert Xu
` (3 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:14 UTC (permalink / raw)
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 443 bytes --]
Hi:
[NET]: Added GSO toggle
This patch adds a generic segmentation offload toggle that can be turned
on/off for each net device. For now only TCPv4 is supported.
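As an illustration only (not part of the patch), a user-space program
could flip the toggle through the new ETHTOOL_SGSO command like the
sketch below; it assumes the usual SIOCETHTOOL ioctl plumbing and
elides most error handling:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	/* Set or clear NETIF_F_GSO on an interface via SIOCETHTOOL. */
	static int set_gso(const char *ifname, __u32 on)
	{
		struct ethtool_value eval = { .cmd = ETHTOOL_SGSO, .data = on };
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return -1;
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ifr.ifr_data = (char *)&eval;
		return ioctl(fd, SIOCETHTOOL, &ifr);
	}

This mirrors the existing UFO get/set commands, which is why the kernel
side below is nearly identical to ethtool_set_ufo().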
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: p4.patch --]
[-- Type: text/plain, Size: 3869 bytes --]
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -408,6 +408,8 @@ struct ethtool_ops {
#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */
#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */
#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */
+#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */
+#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */
/* compatibility with older code */
#define SPARC_ETH_GSET ETHTOOL_GSET
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -309,6 +309,7 @@ struct net_device
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
+#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
/* Segmentation offload features */
diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1031,9 +1031,13 @@ static inline void sk_setup_caps(struct
{
__sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
+ if (sk->sk_route_caps & NETIF_F_GSO)
+ sk->sk_route_caps |= NETIF_F_TSO;
if (sk->sk_route_caps & NETIF_F_TSO) {
if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
sk->sk_route_caps &= ~NETIF_F_TSO;
+ else
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
}
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -376,15 +376,20 @@ void br_features_recompute(struct net_br
features = br->feature_mask & ~NETIF_F_ALL_CSUM;
list_for_each_entry(p, &br->port_list, list) {
- if (checksum & NETIF_F_NO_CSUM &&
- !(p->dev->features & NETIF_F_NO_CSUM))
+ unsigned long feature = p->dev->features;
+
+ if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
- if (checksum & NETIF_F_HW_CSUM &&
- !(p->dev->features & NETIF_F_HW_CSUM))
+ if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
- if (!(p->dev->features & NETIF_F_IP_CSUM))
+ if (!(feature & NETIF_F_IP_CSUM))
checksum = 0;
- features &= p->dev->features;
+
+ if (feature & NETIF_F_GSO)
+ feature |= NETIF_F_TSO;
+ feature |= NETIF_F_GSO;
+
+ features &= feature;
}
br->dev->features = features | checksum | NETIF_F_LLTX;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -614,6 +614,29 @@ static int ethtool_set_ufo(struct net_de
return dev->ethtool_ops->set_ufo(dev, edata.data);
}
+static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata = { ETHTOOL_GGSO };
+
+ edata.data = dev->features & NETIF_F_GSO;
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+ if (edata.data)
+ dev->features |= NETIF_F_GSO;
+ else
+ dev->features &= ~NETIF_F_GSO;
+ return 0;
+}
+
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
{
struct ethtool_test test;
@@ -905,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr)
case ETHTOOL_SUFO:
rc = ethtool_set_ufo(dev, useraddr);
break;
+ case ETHTOOL_GGSO:
+ rc = ethtool_get_gso(dev, useraddr);
+ break;
+ case ETHTOOL_SGSO:
+ rc = ethtool_set_gso(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
* [5/5] [IPSEC]: Handle GSO packets
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
` (3 preceding siblings ...)
2006-06-22 8:14 ` [4/5] [NET]: Added GSO toggle Herbert Xu
@ 2006-06-22 8:14 ` Herbert Xu
2006-06-22 8:15 ` [0/5] GSO: Generic Segmentation Offload Herbert Xu
` (2 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:14 UTC (permalink / raw)
To: David S. Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 767 bytes --]
Hi:
[IPSEC]: Handle GSO packets
This patch segments GSO packets received by the IPsec stack. This can
happen when a NIC driver injects GSO packets into the stack which are
then forwarded to another host.
The primary application of this is going to be Xen where its backend
driver may inject GSO packets into dom0.
Of course this also can be used by other virtualisation schemes such as
VMWare or UML since the tap device could be modified to inject GSO packets
received through splice.
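For illustration, an injecting driver would in essence mark the frame
like the following sketch before handing it up; this assumes the
conventions established earlier in this series, and mss is a
hypothetical variable holding the sender's segment size:

	/* Mark an injected super-packet so the stack can re-segment it
	 * later, e.g. in the IPsec output path patched below.  The
	 * checksum must still be pending: skb_gso_segment() insists on
	 * CHECKSUM_HW.
	 */
	skb_shinfo(skb)->gso_size = mss;	/* sender's MSS (illustrative) */
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
	skb->ip_summed = CHECKSUM_HW;
	netif_rx(skb);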
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: p5.patch --]
[-- Type: text/plain, Size: 3351 bytes --]
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -9,6 +9,8 @@
*/
#include <linux/compiler.h>
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/netfilter_ipv4.h>
@@ -97,16 +99,10 @@ error_nolock:
goto out_exit;
}
-static int xfrm4_output_finish(struct sk_buff *skb)
+static int xfrm4_output_finish2(struct sk_buff *skb)
{
int err;
-#ifdef CONFIG_NETFILTER
- if (!skb->dst->xfrm) {
- IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(skb);
- }
-#endif
while (likely((err = xfrm4_output_one(skb)) == 0)) {
nf_reset(skb);
@@ -119,7 +115,7 @@ static int xfrm4_output_finish(struct sk
return dst_output(skb);
err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
- skb->dst->dev, xfrm4_output_finish);
+ skb->dst->dev, xfrm4_output_finish2);
if (unlikely(err != 1))
break;
}
@@ -127,6 +123,48 @@ static int xfrm4_output_finish(struct sk
return err;
}
+static int xfrm4_output_finish(struct sk_buff *skb)
+{
+ struct sk_buff *segs;
+
+#ifdef CONFIG_NETFILTER
+ if (!skb->dst->xfrm) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output(skb);
+ }
+#endif
+
+ if (!skb_shinfo(skb)->gso_size)
+ return xfrm4_output_finish2(skb);
+
+ skb->protocol = htons(ETH_P_IP);
+ segs = skb_gso_segment(skb, 0);
+ kfree_skb(skb);
+ if (unlikely(IS_ERR(segs)))
+ return PTR_ERR(segs);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = xfrm4_output_finish2(segs);
+
+ if (unlikely(err)) {
+ while ((segs = nskb)) {
+ nskb = segs->next;
+ segs->next = NULL;
+ kfree_skb(segs);
+ }
+ return err;
+ }
+
+ segs = nskb;
+ } while (segs);
+
+ return 0;
+}
+
int xfrm4_output(struct sk_buff *skb)
{
return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -94,7 +94,7 @@ error_nolock:
goto out_exit;
}
-static int xfrm6_output_finish(struct sk_buff *skb)
+static int xfrm6_output_finish2(struct sk_buff *skb)
{
int err;
@@ -110,7 +110,7 @@ static int xfrm6_output_finish(struct sk
return dst_output(skb);
err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL,
- skb->dst->dev, xfrm6_output_finish);
+ skb->dst->dev, xfrm6_output_finish2);
if (unlikely(err != 1))
break;
}
@@ -118,6 +118,41 @@ static int xfrm6_output_finish(struct sk
return err;
}
+static int xfrm6_output_finish(struct sk_buff *skb)
+{
+ struct sk_buff *segs;
+
+ if (!skb_shinfo(skb)->gso_size)
+ return xfrm6_output_finish2(skb);
+
+ skb->protocol = htons(ETH_P_IP);
+ segs = skb_gso_segment(skb, 0);
+ kfree_skb(skb);
+ if (unlikely(IS_ERR(segs)))
+ return PTR_ERR(segs);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = xfrm6_output_finish2(segs);
+
+ if (unlikely(err)) {
+ while ((segs = nskb)) {
+ nskb = segs->next;
+ segs->next = NULL;
+ kfree_skb(segs);
+ }
+ return err;
+ }
+
+ segs = nskb;
+ } while (segs);
+
+ return 0;
+}
+
int xfrm6_output(struct sk_buff *skb)
{
return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev,
* Re: [0/5] GSO: Generic Segmentation Offload
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
` (4 preceding siblings ...)
2006-06-22 8:14 ` [5/5] [IPSEC]: Handle GSO packets Herbert Xu
@ 2006-06-22 8:15 ` Herbert Xu
2006-06-22 10:08 ` David Miller
2006-06-22 14:28 ` YOSHIFUJI Hideaki / 吉藤英明
7 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:15 UTC (permalink / raw)
To: David S. Miller, netdev
Hi:
If anyone is interested, here is the incremental patch against the previous
series.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c68ab8..d293e0f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1415,7 +1415,7 @@ gso:
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
- local_bh_disable();
+ rcu_read_lock_bh();
/* Updates of qdisc are serialized by queue_lock.
* The struct Qdisc which is pointed to by qdisc is now a
@@ -1486,13 +1486,13 @@ #endif
}
rc = -ENETDOWN;
- local_bh_enable();
+ rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
return rc;
out:
- local_bh_enable();
+ rcu_read_unlock_bh();
return rc;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 472cb5a..4cdd6ca 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -185,9 +185,13 @@ requeue:
void __qdisc_run(struct net_device *dev)
{
+ if (unlikely(dev->qdisc == &noop_qdisc))
+ goto out;
+
while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
/* NOTHING */;
+out:
clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
@@ -581,20 +585,24 @@ void dev_deactivate(struct net_device *d
spin_lock_bh(&dev->queue_lock);
qdisc = dev->qdisc;
dev->qdisc = &noop_qdisc;
- skb = dev->gso_skb;
- dev->gso_skb = NULL;
qdisc_reset(qdisc);
spin_unlock_bh(&dev->queue_lock);
- kfree_skb(skb);
dev_watchdog_down(dev);
- while (test_bit(__LINK_STATE_SCHED, &dev->state))
+ /* Wait for outstanding dev_queue_xmit calls. */
+ synchronize_rcu();
+
+ /* Wait for outstanding qdisc_run calls. */
+ while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
yield();
- spin_unlock_wait(&dev->_xmit_lock);
+ if (dev->gso_skb) {
+ kfree_skb(dev->gso_skb);
+ dev->gso_skb = NULL;
+ }
}
void dev_init_scheduler(struct net_device *dev)
* Re: [3/5] [NET]: Add software TSOv4
2006-06-22 8:14 ` [3/5] [NET]: Add software TSOv4 Herbert Xu
@ 2006-06-22 8:23 ` Herbert Xu
2006-06-22 15:04 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-23 19:33 ` Michael Chan
2 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-22 8:23 UTC (permalink / raw)
To: David S. Miller, netdev
On Thu, Jun 22, 2006 at 06:14:00PM +1000, herbert wrote:
>
> [NET]: Add software TSOv4
Doh, forgot to remove an unused declaration. Here is an updated version.
[NET]: Add software TSOv4
This patch adds the GSO implementation for IPv4 TCP.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -406,6 +406,9 @@ struct net_device
struct list_head qdisc_list;
unsigned long tx_queue_len; /* Max frames per queue allowed */
+ /* Partially transmitted GSO packet. */
+ struct sk_buff *gso_skb;
+
/* ingress path synchronizer */
spinlock_t ingress_lock;
struct Qdisc *qdisc_ingress;
@@ -540,6 +543,7 @@ struct packet_type {
struct net_device *,
struct packet_type *,
struct net_device *);
+ struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg);
void *af_packet_priv;
struct list_head list;
};
@@ -690,7 +694,8 @@ extern int dev_change_name(struct net_d
extern int dev_set_mtu(struct net_device *, int);
extern int dev_set_mac_address(struct net_device *,
struct sockaddr *);
-extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+extern int dev_hard_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
extern void dev_init(void);
@@ -964,6 +969,7 @@ extern int netdev_max_backlog;
extern int weight_p;
extern int netdev_set_master(struct net_device *dev, struct net_device *master);
extern int skb_checksum_help(struct sk_buff *skb, int inward);
+extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg);
#ifdef CONFIG_BUG
extern void netdev_rx_csum_fault(struct net_device *dev);
#else
diff --git a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -116,6 +116,7 @@
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
+#include <linux/err.h>
/*
* The list of packet types we will receive (as opposed to discard)
@@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct
* taps currently in use.
*/
-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
@@ -1186,6 +1187,40 @@ out:
return ret;
}
+/**
+ * skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @sg: whether scatter-gather is supported on the target.
+ *
+ * This function segments the given skb and returns a list of segments.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_type *ptype;
+ int type = skb->protocol;
+
+ BUG_ON(skb_shinfo(skb)->frag_list);
+ BUG_ON(skb->ip_summed != CHECKSUM_HW);
+
+ skb->mac.raw = skb->data;
+ skb->mac_len = skb->nh.raw - skb->data;
+ __skb_pull(skb, skb->mac_len);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
+ if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+ segs = ptype->gso_segment(skb, sg);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return segs;
+}
+
+EXPORT_SYMBOL(skb_gso_segment);
+
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
@@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct
#define illegal_highdma(dev, skb) (0)
#endif
+struct dev_gso_cb {
+ void (*destructor)(struct sk_buff *skb);
+};
+
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+ struct dev_gso_cb *cb;
+
+ do {
+ struct sk_buff *nskb = skb->next;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+ kfree_skb(nskb);
+ } while (skb->next);
+
+ cb = DEV_GSO_CB(skb);
+ if (cb->destructor)
+ cb->destructor(skb);
+}
+
+/**
+ * dev_gso_segment - Perform emulated hardware segmentation on skb.
+ * @skb: buffer to segment
+ *
+ * This function segments the given skb and stores the list of segments
+ * in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, dev->features & NETIF_F_SG &&
+ !illegal_highdma(dev, skb));
+ if (unlikely(IS_ERR(segs)))
+ return PTR_ERR(segs);
+
+ skb->next = segs;
+ DEV_GSO_CB(skb)->destructor = skb->destructor;
+ skb->destructor = dev_gso_skb_destructor;
+
+ return 0;
+}
+
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ if (likely(!skb->next)) {
+ if (netdev_nit)
+ dev_queue_xmit_nit(skb, dev);
+
+ if (!netif_needs_gso(dev, skb))
+ return dev->hard_start_xmit(skb, dev);
+
+ if (unlikely(dev_gso_segment(skb)))
+ goto out_kfree_skb;
+ }
+
+ do {
+ struct sk_buff *nskb = skb->next;
+ int rc;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+ rc = dev->hard_start_xmit(nskb, dev);
+ if (unlikely(rc)) {
+ skb->next = nskb;
+ return rc;
+ }
+ } while (skb->next);
+
+ skb->destructor = DEV_GSO_CB(skb)->destructor;
+
+out_kfree_skb:
+ kfree_skb(skb);
+ return 0;
+}
+
#define HARD_TX_LOCK(dev, cpu) { \
if ((dev->features & NETIF_F_LLTX) == 0) { \
netif_tx_lock(dev); \
@@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb)
struct Qdisc *q;
int rc = -ENOMEM;
+ /* GSO will handle the following emulations directly. */
+ if (netif_needs_gso(dev, skb))
+ goto gso;
+
if (skb_shinfo(skb)->frag_list &&
!(dev->features & NETIF_F_FRAGLIST) &&
__skb_linearize(skb))
@@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb)
if (skb_checksum_help(skb, 0))
goto out_kfree_skb;
+gso:
spin_lock_prefetch(&dev->queue_lock);
/* Disable soft irqs for various locks below. Also
@@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb)
HARD_TX_LOCK(dev, cpu);
if (!netif_queue_stopped(dev)) {
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
-
rc = 0;
- if (!dev->hard_start_xmit(skb, dev)) {
+ if (!dev_hard_start_xmit(skb, dev)) {
HARD_TX_UNLOCK(dev);
goto out;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -96,8 +96,11 @@ static inline int qdisc_restart(struct n
struct sk_buff *skb;
/* Dequeue packet */
- if ((skb = q->dequeue(q)) != NULL) {
+ if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
unsigned nolock = (dev->features & NETIF_F_LLTX);
+
+ dev->gso_skb = NULL;
+
/*
* When the driver has LLTX set it does its own locking
* in start_xmit. No need to add additional overhead by
@@ -134,10 +137,8 @@ static inline int qdisc_restart(struct n
if (!netif_queue_stopped(dev)) {
int ret;
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
- ret = dev->hard_start_xmit(skb, dev);
+ ret = dev_hard_start_xmit(skb, dev);
if (ret == NETDEV_TX_OK) {
if (!nolock) {
netif_tx_unlock(dev);
@@ -171,7 +172,10 @@ static inline int qdisc_restart(struct n
*/
requeue:
- q->ops->requeue(skb, q);
+ if (skb->next)
+ dev->gso_skb = skb;
+ else
+ q->ops->requeue(skb, q);
netif_schedule(dev);
return 1;
}
@@ -593,6 +597,11 @@ void dev_deactivate(struct net_device *d
/* Wait for outstanding qdisc_run calls. */
while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
yield();
+
+ if (dev->gso_skb) {
+ kfree_skb(dev->gso_skb);
+ dev->gso_skb = NULL;
+ }
}
void dev_init_scheduler(struct net_device *dev)
* Re: [0/5] GSO: Generic Segmentation Offload
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
` (5 preceding siblings ...)
2006-06-22 8:15 ` [0/5] GSO: Generic Segmentation Offload Herbert Xu
@ 2006-06-22 10:08 ` David Miller
2006-06-22 14:28 ` YOSHIFUJI Hideaki / 吉藤英明
7 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2006-06-22 10:08 UTC (permalink / raw)
To: herbert; +Cc: netdev
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 18:12:11 +1000
> This is a repost of the GSO patches. The main change is the fix to a bug
> in the way dev->gso_skb is freed. This series requires the dev_deactivate
> patch that I just posted.
Applied, thanks a lot Herbert.
* Re: [0/5] GSO: Generic Segmentation Offload
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
` (6 preceding siblings ...)
2006-06-22 10:08 ` David Miller
@ 2006-06-22 14:28 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-24 5:36 ` Herbert Xu
7 siblings, 1 reply; 23+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2006-06-22 14:28 UTC (permalink / raw)
To: herbert; +Cc: davem, netdev, yoshfuji
Hello.
Yes, I generally like this idea.
In article <20060622081211.GA22505@gondor.apana.org.au> (at Thu, 22 Jun 2006 18:12:11 +1000), Herbert Xu <herbert@gondor.apana.org.au> says:
> GSO like TSO is only effective if the MTU is significantly less than the
> maximum value of 64K. So only the case where the MTU was set to 1500 is
> of interest. There we can see that the throughput improved by 17.5%
> (3061.05Mb/s => 3598.17Mb/s). The actual saving in transmission cost is
> in fact a lot more than that as the majority of the time here is spent on
> the RX side which still has to deal with 1500-byte packets.
Can you measure with some other sizes,
e.g. 4kByte, 8kByte, 9000Byte?
--yoshfuji
* Re: [3/5] [NET]: Add software TSOv4
2006-06-22 8:14 ` [3/5] [NET]: Add software TSOv4 Herbert Xu
2006-06-22 8:23 ` Herbert Xu
@ 2006-06-22 15:04 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-22 21:32 ` David Miller
2006-06-23 19:33 ` Michael Chan
2 siblings, 1 reply; 23+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2006-06-22 15:04 UTC (permalink / raw)
To: herbert; +Cc: davem, netdev, yoshfuji
In article <20060622081400.GC22671@gondor.apana.org.au> (at Thu, 22 Jun 2006 18:14:00 +1000), Herbert Xu <herbert@gondor.apana.org.au> says:
> [NET]: Add software TSOv4
>
> This patch adds the GSO implementation for IPv4 TCP.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
I'd appreciate it if you could code up IPv6 TCP as well. :-)
Regards,
--yoshfuji
* Re: [3/5] [NET]: Add software TSOv4
2006-06-22 15:04 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2006-06-22 21:32 ` David Miller
2006-06-24 0:28 ` Ravinandan Arakali
0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2006-06-22 21:32 UTC (permalink / raw)
To: yoshfuji; +Cc: herbert, netdev
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 23 Jun 2006 00:04:03 +0900 (JST)
> In article <20060622081400.GC22671@gondor.apana.org.au> (at Thu, 22 Jun 2006 18:14:00 +1000), Herbert Xu <herbert@gondor.apana.org.au> says:
>
> > [NET]: Add software TSOv4
> >
> > This patch adds the GSO implementation for IPv4 TCP.
> >
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> I'd appreciate it if you could code up IPv6 TCP as well. :-)
To my understanding doing IPV6 TCP TSO is a non-trivial task, even in
software.
The header editing is a lot more complicated because things like
routing and other extension headers can sit between IPV6 and TCP
header.
It is probably why IPV6 TSO hardware does not exist yet :)
Do not take this to mean I think it should not be implemented, I think
it should.
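For illustration, just finding the TCP header already requires a walk
like the sketch below (illustrative only; it uses the standard
NEXTHDR_* values and the ipv6_optlen() macro, and a real version would
also have to bound the walk and cope with fragment headers):

	#include <net/ipv6.h>	/* NEXTHDR_*, ipv6_optlen() */

	/* Return the offset of the TCP header behind any hop-by-hop,
	 * routing and destination-option extension headers, or -1.
	 */
	static int tcp_hdr_offset_v6(const struct ipv6hdr *ip6h,
				     const unsigned char *pkt)
	{
		int off = sizeof(*ip6h);
		u8 nexthdr = ip6h->nexthdr;

		while (nexthdr == NEXTHDR_HOP || nexthdr == NEXTHDR_ROUTING ||
		       nexthdr == NEXTHDR_DEST) {
			const struct ipv6_opt_hdr *oh =
				(const struct ipv6_opt_hdr *)(pkt + off);

			nexthdr = oh->nexthdr;
			off += ipv6_optlen(oh);	/* (hdrlen + 1) << 3 */
		}
		return nexthdr == NEXTHDR_TCP ? off : -1;
	}

On top of that, each generated segment needs its own copy of the
extension headers, some of which may have to be rewritten per segment.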
* Re: [3/5] [NET]: Add software TSOv4
2006-06-22 8:14 ` [3/5] [NET]: Add software TSOv4 Herbert Xu
2006-06-22 8:23 ` Herbert Xu
2006-06-22 15:04 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2006-06-23 19:33 ` Michael Chan
2006-06-23 21:26 ` Michael Chan
2 siblings, 1 reply; 23+ messages in thread
From: Michael Chan @ 2006-06-23 19:33 UTC (permalink / raw)
To: Herbert Xu; +Cc: David S. Miller, netdev
On Thu, 2006-06-22 at 18:14 +1000, Herbert Xu wrote:
> [NET]: Add software TSOv4
>
> This patch adds the GSO implementation for IPv4 TCP.
Herbert, Looks like there were some problems in the CHECKSUM_HW case.
This patch should fix it. Please double-check my checksum math.
[NET]: Fix CHECKSUM_HW GSO problems.
Fix the following 2 problems in the GSO code path for CHECKSUM_HW
packets:
1. Adjust ipv4 TCP pseudo header checksum.
2. Initialize skb->tail.
Signed-off-by: Michael Chan <mchan@broadcom.com>
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8e5044b..3f19b3d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1954,6 +1954,7 @@ struct sk_buff *skb_segment(struct sk_bu
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
+ nskb->tail += nskb->data_len;
} while ((offset += len) < skb->len);
return segs;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e029c4..3399110 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2186,7 +2186,8 @@ struct sk_buff *tcp_tso_segment(struct s
if (skb->ip_summed == CHECKSUM_NONE) {
th->check = csum_fold(csum_partial(
skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ } else if (skb->ip_summed == CHECKSUM_HW)
+ th->check = ~csum_fold(csum_add(th->check, delta));
seq += len;
skb = skb->next;
@@ -2196,11 +2197,12 @@ struct sk_buff *tcp_tso_segment(struct s
th->cwr = 0;
} while (skb->next);
+ delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
if (skb->ip_summed == CHECKSUM_NONE) {
- delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
th->check = csum_fold(csum_partial(
skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ } else if (skb->ip_summed == CHECKSUM_HW)
+ th->check = ~csum_fold(csum_add(th->check, delta));
out:
return segs;
* Re: [3/5] [NET]: Add software TSOv4
2006-06-23 19:33 ` Michael Chan
@ 2006-06-23 21:26 ` Michael Chan
2006-06-23 23:38 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Michael Chan @ 2006-06-23 21:26 UTC (permalink / raw)
To: Herbert Xu; +Cc: David S. Miller, netdev
On Fri, 2006-06-23 at 12:33 -0700, Michael Chan wrote:
> On Thu, 2006-06-22 at 18:14 +1000, Herbert Xu wrote:
> > [NET]: Add software TSOv4
> >
> > This patch adds the GSO implementation for IPv4 TCP.
>
> Herbert, Looks like there were some problems in the CHECKSUM_HW case.
> This patch should fix it. Please double-check my checksum math.
This patch is more correct. Please ignore the previous one.
[NET]: Fix CHECKSUM_HW GSO problems.
Fix checksum problems in the GSO code path for CHECKSUM_HW packets.
The ipv4 TCP pseudo header checksum has to be adjusted for GSO
segmented packets.
Signed-off-by: Michael Chan <mchan@broadcom.com>
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e029c4..b9c37f1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2186,7 +2186,8 @@ struct sk_buff *tcp_tso_segment(struct s
if (skb->ip_summed == CHECKSUM_NONE) {
th->check = csum_fold(csum_partial(
skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ } else if (skb->ip_summed == CHECKSUM_HW)
+ th->check = ~csum_fold(csum_add(th->check, delta));
seq += len;
skb = skb->next;
@@ -2200,6 +2201,10 @@ struct sk_buff *tcp_tso_segment(struct s
delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
th->check = csum_fold(csum_partial(
skb->h.raw, thlen, csum_add(skb->csum, delta)));
+ } else if (skb->ip_summed == CHECKSUM_HW) {
+ delta = csum_add(oldlen, htonl(skb->len -
+ (skb->h.raw - skb->data)));
+ th->check = ~csum_fold(csum_add(th->check, delta));
}
out:
* Re: [3/5] [NET]: Add software TSOv4
2006-06-23 21:26 ` Michael Chan
@ 2006-06-23 23:38 ` Herbert Xu
2006-06-23 23:53 ` Herbert Xu
0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2006-06-23 23:38 UTC (permalink / raw)
To: Michael Chan; +Cc: David S. Miller, netdev
On Fri, Jun 23, 2006 at 02:26:16PM -0700, Michael Chan wrote:
>
> This patch is more correct. Please ignore the previous one.
>
> [NET]: Fix CHECKSUM_HW GSO problems.
>
> Fix checksum problems in the GSO code path for CHECKSUM_HW packets.
>
> The ipv4 TCP pseudo header checksum has to be adjusted for GSO
> segmented packets.
>
> Signed-off-by: Michael Chan <mchan@broadcom.com>
Good catch. Obviously the only CHECKSUM_HW I tested was loop :)
Looking at this again it seems that we can optimise it further so
how about this?
[NET]: Fix CHECKSUM_HW GSO problems.
Fix checksum problems in the GSO code path for CHECKSUM_HW packets.
The ipv4 TCP pseudo header checksum has to be adjusted for GSO
segmented packets.
The adjustment is needed because the length field in the pseudo-header
changes. However, because we have the inequality oldlen > newlen, we
know that delta = (u16)~oldlen + newlen is still a 16-bit quantity.
This also means that htonl(delta) + th->check still fits in 32 bits.
Therefore we don't have to use csum_add for these operations.
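As a concrete check with made-up numbers: for a 30000-byte super-packet
with thlen = 20 and an MSS of 1448, (u16)~30000 = 35535 and the new
pseudo-header length is newlen = 20 + 1448 = 1468, so delta = 35535 +
1468 = 37003, which still fits in 16 bits. In general newlen < oldlen
is exactly what guarantees (u16)~oldlen + newlen < 0xFFFF.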
This is based on a patch by Michael Chan <mchan@broadcom.com>.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e029c4..10f1a8c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2166,7 +2166,7 @@ struct sk_buff *tcp_tso_segment(struct s
if (!pskb_may_pull(skb, thlen))
goto out;
- oldlen = ~htonl(skb->len);
+ oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
segs = skb_segment(skb, sg);
@@ -2174,7 +2174,7 @@ struct sk_buff *tcp_tso_segment(struct s
goto out;
len = skb_shinfo(skb)->gso_size;
- delta = csum_add(oldlen, htonl(thlen + len));
+ delta = htonl(oldlen + (thlen + len));
skb = segs;
th = skb->h.th;
@@ -2183,10 +2183,10 @@ struct sk_buff *tcp_tso_segment(struct s
do {
th->fin = th->psh = 0;
- if (skb->ip_summed == CHECKSUM_NONE) {
- th->check = csum_fold(csum_partial(
- skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ th->check = ~csum_fold(th->check + delta);
+ if (skb->ip_summed != CHECKSUM_HW)
+ th->check = csum_fold(csum_partial(skb->h.raw, thlen,
+ skb->csum));
seq += len;
skb = skb->next;
@@ -2196,11 +2196,11 @@ struct sk_buff *tcp_tso_segment(struct s
th->cwr = 0;
} while (skb->next);
- if (skb->ip_summed == CHECKSUM_NONE) {
- delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
- th->check = csum_fold(csum_partial(
- skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ delta = htonl(oldlen + (skb->tail - skb->h.raw));
+ th->check = ~csum_fold(th->check + delta);
+ if (skb->ip_summed != CHECKSUM_HW)
+ th->check = csum_fold(csum_partial(skb->h.raw, thlen,
+ skb->csum));
out:
return segs;
* Re: [3/5] [NET]: Add software TSOv4
2006-06-23 23:38 ` Herbert Xu
@ 2006-06-23 23:53 ` Herbert Xu
2006-06-24 3:08 ` Michael Chan
0 siblings, 1 reply; 23+ messages in thread
From: Herbert Xu @ 2006-06-23 23:53 UTC (permalink / raw)
To: Michael Chan; +Cc: David S. Miller, netdev
On Sat, Jun 24, 2006 at 09:38:40AM +1000, herbert wrote:
>
> Good catch. Obviously the only CHECKSUM_HW I tested was loop :)
> Looking at this again it seems that we can optimise it further so
> how about this?
Never mind, I obviously completely ignored your other fix to the length of
the last segment :) Here is a fixed version.
[NET]: Fix CHECKSUM_HW GSO problems.
Fix checksum problems in the GSO code path for CHECKSUM_HW packets.
The ipv4 TCP pseudo header checksum has to be adjusted for GSO
segmented packets.
The adjustment is needed because the length field in the pseudo-header
changes. However, because we have the inequality oldlen > newlen, we
know that delta = (u16)~oldlen + newlen is still a 16-bit quantity.
This also means that htonl(delta) + th->check still fits in 32 bits.
Therefore we don't have to use csum_add for these operations.
This is based on a patch by Michael Chan <mchan@broadcom.com>.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
cfecbf18c32a6dca8954538b5d5fb7186ed336d1
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e029c4..c04176b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2166,7 +2166,7 @@ struct sk_buff *tcp_tso_segment(struct s
if (!pskb_may_pull(skb, thlen))
goto out;
- oldlen = ~htonl(skb->len);
+ oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
segs = skb_segment(skb, sg);
@@ -2174,7 +2174,7 @@ struct sk_buff *tcp_tso_segment(struct s
goto out;
len = skb_shinfo(skb)->gso_size;
- delta = csum_add(oldlen, htonl(thlen + len));
+ delta = htonl(oldlen + (thlen + len));
skb = segs;
th = skb->h.th;
@@ -2183,10 +2183,10 @@ struct sk_buff *tcp_tso_segment(struct s
do {
th->fin = th->psh = 0;
- if (skb->ip_summed == CHECKSUM_NONE) {
- th->check = csum_fold(csum_partial(
- skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ th->check = ~csum_fold(th->check + delta);
+ if (skb->ip_summed != CHECKSUM_HW)
+ th->check = csum_fold(csum_partial(skb->h.raw, thlen,
+ skb->csum));
seq += len;
skb = skb->next;
@@ -2196,11 +2196,11 @@ struct sk_buff *tcp_tso_segment(struct s
th->cwr = 0;
} while (skb->next);
- if (skb->ip_summed == CHECKSUM_NONE) {
- delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
- th->check = csum_fold(csum_partial(
- skb->h.raw, thlen, csum_add(skb->csum, delta)));
- }
+ delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
+ th->check = ~csum_fold(th->check + delta);
+ if (skb->ip_summed != CHECKSUM_HW)
+ th->check = csum_fold(csum_partial(skb->h.raw, thlen,
+ skb->csum));
out:
return segs;
* RE: [3/5] [NET]: Add software TSOv4
2006-06-22 21:32 ` David Miller
@ 2006-06-24 0:28 ` Ravinandan Arakali
2006-06-24 1:32 ` YOSHIFUJI Hideaki / 吉藤英明
0 siblings, 1 reply; 23+ messages in thread
From: Ravinandan Arakali @ 2006-06-24 0:28 UTC (permalink / raw)
To: 'David Miller', yoshfuji; +Cc: herbert, netdev
Neterion's Xframe adapter supports TSO over IPv6.
Ravi
-----Original Message-----
From: netdev-owner@vger.kernel.org
[mailto:netdev-owner@vger.kernel.org]On Behalf Of David Miller
Sent: Thursday, June 22, 2006 2:32 PM
To: yoshfuji@linux-ipv6.org
Cc: herbert@gondor.apana.org.au; netdev@vger.kernel.org
Subject: Re: [3/5] [NET]: Add software TSOv4
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 23 Jun 2006 00:04:03 +0900 (JST)
> In article <20060622081400.GC22671@gondor.apana.org.au> (at Thu, 22 Jun
2006 18:14:00 +1000), Herbert Xu <herbert@gondor.apana.org.au> says:
>
> > [NET]: Add software TSOv4
> >
> > This patch adds the GSO implementation for IPv4 TCP.
> >
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> I'd appreciate it if you could code up IPv6 TCP as well. :-)
To my understanding doing IPV6 TCP TSO is a non-trivial task, even in
software.
The header editing is a lot more complicated because things like
routing and other extension headers can sit between IPV6 and TCP
header.
It is probably why IPV6 TSO hardware does not exist yet :)
Do not take this to mean I think it should not be implemented, I think
it should.
* Re: [3/5] [NET]: Add software TSOv4
2006-06-24 0:28 ` Ravinandan Arakali
@ 2006-06-24 1:32 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-26 18:33 ` Ravinandan Arakali
0 siblings, 1 reply; 23+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2006-06-24 1:32 UTC (permalink / raw)
To: ravinandan.arakali; +Cc: davem, herbert, netdev, yoshfuji
In article <004201c69725$14123700$4110100a@pc.s2io.com> (at Fri, 23 Jun 2006 17:28:12 -0700), "Ravinandan Arakali" <ravinandan.arakali@neterion.com> says:
> Neterion's Xframe adapter supports TSO over IPv6.
I remember you posted some patches.
Would you post a revised version reflecting Stephen's comment, please?
--yoshfuji
* Re: [3/5] [NET]: Add software TSOv4
2006-06-23 23:53 ` Herbert Xu
@ 2006-06-24 3:08 ` Michael Chan
2006-06-26 6:55 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Michael Chan @ 2006-06-24 3:08 UTC (permalink / raw)
To: Herbert Xu; +Cc: David S. Miller, netdev
On Sat, 2006-06-24 at 09:53 +1000, Herbert Xu wrote:
> Never mind, I obviously completely ignored your other fix to the length of
> the last segment :) Here is a fixed version.
>
> [NET]: Fix CHECKSUM_HW GSO problems.
>
> Fix checksum problems in the GSO code path for CHECKSUM_HW packets.
>
> The ipv4 TCP pseudo header checksum has to be adjusted for GSO
> segmented packets.
>
> The adjustment is needed because the length field in the pseudo-header
> changes. However, because we have the inequality oldlen > newlen, we
> know that delta = (u16)~oldlen + newlen is still a 16-bit quantity.
> This also means that htonl(delta) + th->check still fits in 32 bits.
> Therefore we don't have to use csum_add for these operations.
>
> This is based on a patch by Michael Chan <mchan@broadcom.com>.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
Yes, this should work. ACK.
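A hedged, host-byte-order sketch of that adjustment follows (illustrative names, not the patch itself; the kernel does this on network-order values, and the check field here is assumed to hold the non-inverted pseudo-header sum used to seed hardware checksumming in the CHECKSUM_HW path):

#include <stdint.h>

/* Fold a 32-bit accumulator to 16 bits; no final complement here. */
static uint16_t fold16(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Patch a pseudo-header sum when the TCP length field drops from
 * oldlen to newlen, as when a GSO superpacket is cut into segments. */
static uint16_t patch_pseudo_sum(uint16_t check, uint16_t oldlen,
				 uint16_t newlen)
{
	/* oldlen > newlen, so delta = (u16)~oldlen + newlen stays below
	 * 0x10000; check + delta then cannot overflow 32 bits, and one
	 * plain add plus a fold suffices -- no carry-aware csum_add(). */
	uint32_t delta = (uint32_t)(uint16_t)~oldlen + newlen;

	return fold16((uint32_t)check + delta);
}

In ones-complement arithmetic this is the standard incremental checksum update of RFC 1624, specialized to the case where the inequality guarantees no carry.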
* Re: [0/5] GSO: Generic Segmentation Offload
2006-06-22 14:28 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2006-06-24 5:36 ` Herbert Xu
0 siblings, 0 replies; 23+ messages in thread
From: Herbert Xu @ 2006-06-24 5:36 UTC (permalink / raw)
To: YOSHIFUJI Hideaki / 吉藤英明; +Cc: davem, netdev
On Thu, Jun 22, 2006 at 11:28:01PM +0900, YOSHIFUJI Hideaki / 吉藤英明 wrote:
>
> Can you measure with some other sizes,
> e.g. 4kByte, 8kByte, 9000Byte?
GSO, like TSO, is less effective when the MTU is larger. However, NICs
supporting larger MTUs also support SG, so the figures I included for
lo should apply. In those scenarios, GSO is basically on par with the
default segmentation.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: [3/5] [NET]: Add software TSOv4
2006-06-24 3:08 ` Michael Chan
@ 2006-06-26 6:55 ` David Miller
0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2006-06-26 6:55 UTC (permalink / raw)
To: mchan; +Cc: herbert, netdev
From: "Michael Chan" <mchan@broadcom.com>
Date: Fri, 23 Jun 2006 20:08:41 -0700
> On Sat, 2006-06-24 at 09:53 +1000, Herbert Xu wrote:
>
> > Never mind, I obviously completely ignored your other fix to the length of
> > the last segment :) Here is a fixed version.
> >
> > [NET]: Fix CHECKSUM_HW GSO problems.
> >
> > Fix checksum problems in the GSO code path for CHECKSUM_HW packets.
> >
> > The ipv4 TCP pseudo header checksum has to be adjusted for GSO
> > segmented packets.
> >
> > The adjustment is needed because the length field in the pseudo-header
> > changes. However, because we have the inequality oldlen > newlen, we
> > know that delta = (u16)~oldlen + newlen is still a 16-bit quantity.
> > This also means that htonl(delta) + th->check still fits in 32 bits.
> > Therefore we don't have to use csum_add for this operation.
> >
> > This is based on a patch by Michael Chan <mchan@broadcom.com>.
> >
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> >
> Yes, this should work. ACK.
Applied, thanks a lot guys.
* RE: [3/5] [NET]: Add software TSOv4
2006-06-24 1:32 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2006-06-26 18:33 ` Ravinandan Arakali
0 siblings, 0 replies; 23+ messages in thread
From: Ravinandan Arakali @ 2006-06-26 18:33 UTC (permalink / raw)
To: 'YOSHIFUJI Hideaki / 吉藤英明'; +Cc: davem, herbert, netdev
We are working on it.
Ravi
-----Original Message-----
From: YOSHIFUJI Hideaki / 吉藤英明 [mailto:yoshfuji@linux-ipv6.org]
Sent: Friday, June 23, 2006 6:33 PM
To: ravinandan.arakali@neterion.com
Cc: davem@davemloft.net; herbert@gondor.apana.org.au;
netdev@vger.kernel.org; yoshfuji@linux-ipv6.org
Subject: Re: [3/5] [NET]: Add software TSOv4
In article <004201c69725$14123700$4110100a@pc.s2io.com> (at Fri, 23 Jun 2006 17:28:12 -0700), "Ravinandan Arakali" <ravinandan.arakali@neterion.com> says:
> Neterion's Xframe adapter supports TSO over IPv6.
I remember you posted some patches.
Would you post a revised version reflecting Stephen's comment, please?
--yoshfuji
Thread overview: 23+ messages
2006-06-22 8:12 [0/5] GSO: Generic Segmentation Offload Herbert Xu
2006-06-22 8:12 ` [1/5] [NET]: Merge TSO/UFO fields in sk_buff Herbert Xu
2006-06-22 8:13 ` [2/5] [NET]: Add generic segmentation offload Herbert Xu
2006-06-22 8:14 ` [3/5] [NET]: Add software TSOv4 Herbert Xu
2006-06-22 8:23 ` Herbert Xu
2006-06-22 15:04 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-22 21:32 ` David Miller
2006-06-24 0:28 ` Ravinandan Arakali
2006-06-24 1:32 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-26 18:33 ` Ravinandan Arakali
2006-06-23 19:33 ` Michael Chan
2006-06-23 21:26 ` Michael Chan
2006-06-23 23:38 ` Herbert Xu
2006-06-23 23:53 ` Herbert Xu
2006-06-24 3:08 ` Michael Chan
2006-06-26 6:55 ` David Miller
2006-06-22 8:14 ` [4/5] [NET]: Added GSO toggle Herbert Xu
2006-06-22 8:14 ` [5/5] [IPSEC]: Handle GSO packets Herbert Xu
2006-06-22 8:15 ` [0/5] GSO: Generic Segmentation Offload Herbert Xu
2006-06-22 10:08 ` David Miller
2006-06-22 14:28 ` YOSHIFUJI Hideaki / 吉藤英明
2006-06-24 5:36 ` Herbert Xu
-- strict thread matches above, loose matches on Subject: below --
2006-06-20 9:09 Herbert Xu
2006-06-20 9:29 ` [3/5] [NET]: Add software TSOv4 Herbert Xu