* [PATCH] TSO Reloaded
@ 2005-05-05 6:07 David S. Miller
From: David S. Miller @ 2005-05-05 6:07 UTC (permalink / raw)
To: netdev
Ok, here it is, first draft of the new TSO handling
I promised so long ago :-) I was lazy and waited until
today to implement the entire thing.
It works with basic testing over tg3.
I'll discuss the changes in more detail tomorrow, but
the only potentially sore spot right now is the tcp_push_one()
avoidance done in tcp_sendmsg() and tcp_sendpages(). It may
need to be changed to do something like "if not TSO then
tcp_push_one(), else wait for N packets to accumulate" where
N is configurable or dynamically measured in some way.
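A minimal sketch of that idea, assuming a made-up tso_defer_threshold
tunable and helper name (neither is part of the patch below), might look
like:

	static inline void tcp_push_one_or_defer(struct sock *sk, unsigned int mss_now)
	{
		struct tcp_sock *tp = tcp_sk(sk);

		/* No TSO possible: push this single frame immediately. */
		if (!(sk->sk_route_caps & NETIF_F_TSO) || tp->urg_mode) {
			tcp_push_one(sk, mss_now);
			return;
		}

		/* Otherwise defer until enough frames have accumulated to
		 * be worth coalescing into one TSO frame.
		 */
		if (skb_queue_len(&sk->sk_write_queue) >= tso_defer_threshold)
			__tcp_push_pending_frames(sk, tp, mss_now, tp->nonagle);
	}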
It would be nice to know that this thing works with e1000 and
other TSO-capable cards; also, I did not do much sendfile() testing
at all.
Thanks.
--- ./include/linux/tcp.h.~1~ 2005-04-20 10:18:11.000000000 -0700
+++ ./include/linux/tcp.h 2005-05-04 15:21:50.000000000 -0700
@@ -280,8 +280,7 @@ struct tcp_sock {
__u32 snd_wnd; /* The window we expect to receive */
__u32 max_window; /* Maximal window ever seen from peer */
__u32 pmtu_cookie; /* Last pmtu seen by socket */
- __u32 mss_cache; /* Cached effective mss, not including SACKS */
- __u16 mss_cache_std; /* Like mss_cache, but without TSO */
+ __u16 mss_cache; /* Cached effective mss, not including SACKS */
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
__u8 ca_state; /* State of fast-retransmit machine */
__u8 retransmits; /* Number of unrecovered RTO timeouts. */
--- ./include/net/tcp.h.~1~ 2005-05-03 14:32:02.000000000 -0700
+++ ./include/net/tcp.h 2005-05-04 22:42:49.000000000 -0700
@@ -943,7 +943,6 @@ extern int tcp_write_xmit(struct sock *,
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);
-extern int tcp_trim_head(struct sock *, struct sk_buff *, u32);
extern void tcp_send_probe0(struct sock *);
extern void tcp_send_partial(struct sock *);
@@ -962,7 +961,7 @@ extern void tcp_clear_xmit_timers(struct
extern void tcp_delete_keepalive_timer(struct sock *);
extern void tcp_reset_keepalive_timer(struct sock *, unsigned long);
extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
-extern unsigned int tcp_current_mss(struct sock *sk, int large);
+extern unsigned int tcp_current_mss(struct sock *sk);
#ifdef TCP_DEBUG
extern const char tcp_timer_bug_msg[];
@@ -1054,7 +1053,7 @@ static inline void tcp_reset_xmit_timer(
static inline void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned int hint = min(tp->advmss, tp->mss_cache_std);
+ unsigned int hint = min(tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd/2);
hint = min(hint, TCP_MIN_RCVMSS);
@@ -1163,45 +1162,16 @@ struct tcp_skb_cb {
#include <net/tcp_ecn.h>
-/* Due to TSO, an SKB can be composed of multiple actual
- * packets. To keep these tracked properly, we use this.
- */
-static inline int tcp_skb_pcount(const struct sk_buff *skb)
-{
- return skb_shinfo(skb)->tso_segs;
-}
-
-/* This is valid iff tcp_skb_pcount() > 1. */
-static inline int tcp_skb_mss(const struct sk_buff *skb)
-{
- return skb_shinfo(skb)->tso_size;
-}
-
-static inline void tcp_dec_pcount_approx(__u32 *count,
- const struct sk_buff *skb)
-{
- if (*count) {
- *count -= tcp_skb_pcount(skb);
- if ((int)*count < 0)
- *count = 0;
- }
-}
-
static inline void tcp_packets_out_inc(struct sock *sk,
- struct tcp_sock *tp,
- const struct sk_buff *skb)
+ struct tcp_sock *tp)
{
- int orig = tp->packets_out;
-
- tp->packets_out += tcp_skb_pcount(skb);
- if (!orig)
+ if (!tp->packets_out++)
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
-static inline void tcp_packets_out_dec(struct tcp_sock *tp,
- const struct sk_buff *skb)
+static inline void tcp_packets_out_dec(struct tcp_sock *tp)
{
- tp->packets_out -= tcp_skb_pcount(skb);
+ tp->packets_out--;
}
/* This determines how many packets are "in the network" to the best
@@ -1397,57 +1367,39 @@ static __inline__ void tcp_minshall_upda
tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
-/* Return 0, if packet can be sent now without violation Nagle's rules:
- 1. It is full sized.
- 2. Or it contains FIN.
- 3. Or TCP_NODELAY was set.
- 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- With Minshall's modification: all sent small packets are ACKed.
- */
-
-static __inline__ int
-tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb,
- unsigned mss_now, int nonagle)
-{
- return (skb->len < mss_now &&
- !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
- ((nonagle&TCP_NAGLE_CORK) ||
- (!nonagle &&
- tp->packets_out &&
- tcp_minshall_check(tp))));
-}
-
-extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *);
-
-/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
- * should be put on the wire right now.
+/* This determines how many packets, starting with skb,
+ * should be put on the wire right now. It is guaranteed
+ * that this many valid packets are in the socket write
+ * queue, and all of which are in-window.
*/
-static __inline__ int tcp_snd_test(struct sock *sk,
- struct sk_buff *skb,
- unsigned cur_mss, int nonagle)
+static __inline__ unsigned int tcp_snd_test(struct sock *sk,
+ struct sk_buff *skb,
+ unsigned cur_mss, int nonagle)
{
struct tcp_sock *tp = tcp_sk(sk);
- int pkts = tcp_skb_pcount(skb);
-
- if (!pkts) {
- tcp_set_skb_tso_segs(sk, skb);
- pkts = tcp_skb_pcount(skb);
- }
+ unsigned int in_flight, cwnd;
+ int nagle_check, nagle_allows;
/* RFC 1122 - section 4.2.3.4
*
- * We must queue if
- *
- * a) The right edge of this frame exceeds the window
- * b) There are packets in flight and we have a small segment
- * [SWS avoidance and Nagle algorithm]
- * (part of SWS is done on packetization)
- * Minshall version sounds: there are no _small_
- * segments in flight. (tcp_nagle_check)
- * c) We have too many packets 'in flight'
+ * We must queue if the right edge of this frame exceeds
+ * the window.
+ */
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd))
+ return 0;
+
+ /* If we're looking at the final FIN frame, just send it
+ * out now.
+ */
+ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ return 1;
+
+ /* We must queue if there are packets in flight and we have
+ * a small segment (SWS avoidance and Nagle algorithm, part
+ * of SWS is done on packetization). Minshall version sounds:
+ * there are no _small_ segments in flight.
*
- * Don't use the nagle rule for urgent data (or
- * for the final FIN -DaveM).
+ * Don't use the nagle rule for urgent data.
*
* Also, Nagle rule does not apply to frames, which
* sit in the middle of queue (they have no chances
@@ -1455,14 +1407,36 @@ static __inline__ int tcp_snd_test(struc
* not enough to save something seriously (<32 for now).
*/
- /* Don't be strict about the congestion window for the
- * final FIN frame. -DaveM
- */
- return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
- || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
- (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) ||
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
- !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
+ nagle_check = (skb->len < cur_mss &&
+ ((nonagle&TCP_NAGLE_CORK) ||
+ (!nonagle &&
+ tp->packets_out &&
+ tcp_minshall_check(tp))));
+ nagle_allows = ((nonagle & TCP_NAGLE_PUSH) ||
+ tp->urg_mode ||
+ !nagle_check);
+ if (!nagle_allows)
+ return 0;
+
+ /* We must queue if we have too many packets 'in flight'. */
+ in_flight = tcp_packets_in_flight(tp);
+ cwnd = tp->snd_cwnd;
+ if (in_flight < cwnd) {
+ unsigned int ret = 0;
+
+ cwnd -= in_flight;
+ while (cwnd--) {
+ ret++;
+ skb = skb->next;
+ if (skb == (struct sk_buff *)&sk->sk_write_queue ||
+ after(TCP_SKB_CB(skb)->end_seq,
+ tp->snd_una+tp->snd_wnd))
+ break;
+ }
+ return ret;
+ }
+
+ return 0;
}
static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
@@ -1501,7 +1475,7 @@ static __inline__ void __tcp_push_pendin
static __inline__ void tcp_push_pending_frames(struct sock *sk,
struct tcp_sock *tp)
{
- __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
+ __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk), tp->nonagle);
}
static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
@@ -1509,7 +1483,7 @@ static __inline__ int tcp_may_send_now(s
struct sk_buff *skb = sk->sk_send_head;
return (skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle));
}
@@ -1986,7 +1960,7 @@ static inline void tcp_westwood_update_r
static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
{
return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
- (__u32) (tp->mss_cache_std),
+ (__u32) (tp->mss_cache),
2U);
}
--- ./include/net/sock.h.~1~ 2005-05-03 11:34:28.000000000 -0700
+++ ./include/net/sock.h 2005-05-04 16:35:59.000000000 -0700
@@ -1128,13 +1128,16 @@ static inline void sk_stream_moderate_sn
static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
int size, int mem, int gfp)
{
- struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp);
+ struct sk_buff *skb;
+ int hdr_len;
+ hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
+ skb = alloc_skb(size + hdr_len, gfp);
if (skb) {
skb->truesize += mem;
if (sk->sk_forward_alloc >= (int)skb->truesize ||
sk_stream_mem_schedule(sk, skb->truesize, 0)) {
- skb_reserve(skb, sk->sk_prot->max_header);
+ skb_reserve(skb, hdr_len);
return skb;
}
__kfree_skb(skb);
--- ./net/ipv4/tcp_output.c.~1~ 2005-04-24 19:06:29.000000000 -0700
+++ ./net/ipv4/tcp_output.c 2005-05-04 22:55:15.000000000 -0700
@@ -41,6 +41,7 @@
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
+#include <linux/kallsyms.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
@@ -58,7 +59,7 @@ static inline void update_send_head(stru
if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
sk->sk_send_head = NULL;
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_packets_out_inc(sk, tp, skb);
+ tcp_packets_out_inc(sk, tp);
}
/* SND.NXT, if window was not shrunk.
@@ -274,12 +275,13 @@ static int tcp_transmit_skb(struct sock
int sysctl_flags;
int err;
- BUG_ON(!tcp_skb_pcount(skb));
-
#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4
+ /* Callers must make sure this is set to 1 or greater. */
+ BUG_ON(!skb_shinfo(skb)->tso_segs);
+
sysctl_flags = 0;
if (tcb->flags & TCPCB_FLAG_SYN) {
tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -409,58 +411,32 @@ static void tcp_queue_skb(struct sock *s
sk->sk_send_head = skb;
}
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
- /* Force push to be on for any TSO frames to workaround
- * problems with busted implementations like Mac OS-X that
- * hold off socket receive wakeups until push is seen.
- */
- if (tcp_skb_pcount(skb) > 1)
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
+ * true push pending frames to setup probe timer etc. Since we are
+ * sending only one frame, at most, there is no reason to try to
+ * cons up a TSO frame here.
*/
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = sk->sk_send_head;
- if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
+ if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH) != 0) {
/* Send it out now. */
+#if 0
+ printk("TCP: tcp_push_one() PACKETS_OUT(%d) CWND(%d) WRITE_QLEN(%d)\n",
+ tp->packets_out, tp->snd_cwnd, skb_queue_len(&sk->sk_write_queue));
+#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
sk->sk_send_head = NULL;
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_packets_out_inc(sk, tp, skb);
+ tcp_packets_out_inc(sk, tp);
return;
}
}
}
-void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (skb->len <= tp->mss_cache_std ||
- !(sk->sk_route_caps & NETIF_F_TSO)) {
- /* Avoid the costly divide in the normal
- * non-TSO case.
- */
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
- } else {
- unsigned int factor;
-
- factor = skb->len + (tp->mss_cache_std - 1);
- factor /= tp->mss_cache_std;
- skb_shinfo(skb)->tso_segs = factor;
- skb_shinfo(skb)->tso_size = tp->mss_cache_std;
- }
-}
-
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -468,7 +444,6 @@ void tcp_set_skb_tso_segs(struct sock *s
*/
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int nsize;
u16 flags;
@@ -488,6 +463,10 @@ static int tcp_fragment(struct sock *sk,
return -ENOMEM; /* We'll just try again later. */
sk_charge_skb(sk, buff);
+ /* Init TSO state. */
+ skb_shinfo(buff)->tso_segs = 1;
+ skb_shinfo(buff)->tso_size = 0;
+
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -522,93 +501,12 @@ static int tcp_fragment(struct sock *sk,
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->left_out -= tcp_skb_pcount(skb);
- }
-
- /* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb);
- tcp_set_skb_tso_segs(sk, buff);
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(skb);
- tp->left_out += tcp_skb_pcount(skb);
- }
-
- if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(buff);
- tp->left_out += tcp_skb_pcount(buff);
- }
-
/* Link BUFF into the send queue. */
__skb_append(skb, buff);
return 0;
}
-/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
- * eventually). The difference is that pulled data not copied, but
- * immediately discarded.
- */
-static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
-{
- int i, k, eat;
-
- eat = len;
- k = 0;
- for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
- } else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
- if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
- eat = 0;
- }
- k++;
- }
- }
- skb_shinfo(skb)->nr_frags = k;
-
- skb->tail = skb->data;
- skb->data_len -= len;
- skb->len = skb->data_len;
- return skb->tail;
-}
-
-int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
-{
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
- return -ENOMEM;
-
- if (len <= skb_headlen(skb)) {
- __skb_pull(skb, len);
- } else {
- if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
- return -ENOMEM;
- }
-
- TCP_SKB_CB(skb)->seq += len;
- skb->ip_summed = CHECKSUM_HW;
-
- skb->truesize -= len;
- sk->sk_wmem_queued -= len;
- sk->sk_forward_alloc += len;
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
-
- /* Any change of skb->len requires recalculation of tso
- * factor and mss.
- */
- if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb);
-
- return 0;
-}
-
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -662,7 +560,7 @@ unsigned int tcp_sync_mss(struct sock *s
/* And store cached results */
tp->pmtu_cookie = pmtu;
- tp->mss_cache = tp->mss_cache_std = mss_now;
+ tp->mss_cache = mss_now;
return mss_now;
}
@@ -675,56 +573,306 @@ unsigned int tcp_sync_mss(struct sock *s
* is not a big flaw.
*/
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
- unsigned int do_large, mss_now;
+ unsigned int mss_now;
- mss_now = tp->mss_cache_std;
+ mss_now = tp->mss_cache;
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != tp->pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
- do_large = (large &&
- (sk->sk_route_caps & NETIF_F_TSO) &&
- !tp->urg_mode);
-
- if (do_large) {
- unsigned int large_mss, factor, limit;
-
- large_mss = 65535 - tp->af_specific->net_header_len -
- tp->ext_header_len - tp->tcp_header_len;
-
- if (tp->max_window && large_mss > (tp->max_window>>1))
- large_mss = max((tp->max_window>>1),
- 68U - tp->tcp_header_len);
-
- factor = large_mss / mss_now;
-
- /* Always keep large mss multiple of real mss, but
- * do not exceed 1/tso_win_divisor of the congestion window
- * so we can keep the ACK clock ticking and minimize
- * bursting.
+ if (tp->rx_opt.eff_sacks)
+ mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+ return mss_now;
+}
+
+static inline int tcp_skb_data_all_paged(struct sk_buff *skb)
+{
+ return (skb->len == skb->data_len);
+}
+
+/* If possible, append paged data of SRC_SKB onto the
+ * tail of DST_SKB.
+ *
+ * The only truly complicated part about this is cleanly
+ * unwinding the state when we hit MAX_SKB_FRAGS. We defer
+ * updating nr_frags and data_len until all frags are appended
+ * successfully.
+ */
+static int skb_append_pages(struct sk_buff *dst_skb, struct sk_buff *src_skb)
+{
+ int i, dst_nr_frags, dst_new_data_len, err;
+ int first_new_frag = -1;
+ int orig_tail_frag_size = -1;
+
+ if (!tcp_skb_data_all_paged(src_skb)) {
+#if 0
+ printk("skb_append_data: SRC skb not all paged, len(%d) data_len(%d)\n",
+ src_skb->len, src_skb->data_len);
+#endif
+ return -EINVAL;
+ }
+
+ dst_nr_frags = skb_shinfo(dst_skb)->nr_frags;
+ dst_new_data_len = 0;
+ if (dst_nr_frags != 0) {
+ skb_frag_t *frag = &skb_shinfo(dst_skb)->frags[dst_nr_frags-1];
+
+ orig_tail_frag_size = frag->size;
+ }
+ for (i = 0; i < skb_shinfo(src_skb)->nr_frags; i++) {
+ skb_frag_t *src_frag = &skb_shinfo(src_skb)->frags[i];
+ skb_frag_t *dst_frag;
+ int dst_frag_idx;
+
+ dst_frag_idx = dst_nr_frags;
+
+ if (skb_can_coalesce(dst_skb, dst_frag_idx,
+ src_frag->page, src_frag->page_offset)) {
+ dst_frag = &skb_shinfo(dst_skb)->frags[dst_frag_idx-1];
+ dst_frag->size += src_frag->size;
+ } else {
+ err = -EMSGSIZE;
+ if (dst_frag_idx >= MAX_SKB_FRAGS) {
+#if 0
+ printk("skb_append_data: Hit MAX_SKB_FRAGS, unwinding.\n");
+#endif
+ goto unwind_state;
+ }
+
+ if (first_new_frag == -1)
+ first_new_frag = dst_frag_idx;
+ dst_frag = &skb_shinfo(dst_skb)->frags[dst_frag_idx];
+ dst_nr_frags = dst_frag_idx + 1;
+
+ dst_frag->page = src_frag->page;
+ get_page(src_frag->page);
+
+ dst_frag->page_offset = src_frag->page_offset;
+ dst_frag->size = src_frag->size;
+ }
+ dst_new_data_len += src_frag->size;
+ }
+ skb_shinfo(dst_skb)->nr_frags = dst_nr_frags;
+ dst_skb->len += dst_new_data_len;
+ dst_skb->data_len += dst_new_data_len;
+ dst_skb->truesize += dst_new_data_len;
+ TCP_SKB_CB(dst_skb)->end_seq += dst_new_data_len;
+ TCP_SKB_CB(dst_skb)->flags |=
+ (TCP_SKB_CB(src_skb)->flags & (TCPCB_FLAG_FIN|TCPCB_FLAG_PSH));
+
+ return 0;
+
+unwind_state:
+ /* Release any coalesced data. */
+ if (orig_tail_frag_size != -1) {
+ int nr_frags = skb_shinfo(dst_skb)->nr_frags;
+ skb_frag_t *frag = &skb_shinfo(dst_skb)->frags[nr_frags-1];
+
+ frag->size = orig_tail_frag_size;
+ }
+
+ /* Release any pages we added. */
+ if (first_new_frag != -1) {
+ for (i = first_new_frag; i < dst_nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(dst_skb)->frags[i];
+
+ BUG_ON(!frag->page);
+ put_page(frag->page);
+ frag->page = NULL;
+ }
+ }
+
+ return err;
+}
+
+static inline struct sk_buff *alloc_tso_skb(struct sock *sk,
+ struct tcp_sock *tp,
+ struct sk_buff *cur,
+ struct sk_buff *head)
+{
+ int hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
+
+ if (cur)
+ return cur;
+
+ if (!(sk->sk_route_caps & NETIF_F_TSO) ||
+ tp->urg_mode)
+ return NULL;
+
+ cur = alloc_skb(hdr_len, GFP_ATOMIC);
+ if (cur) {
+ skb_reserve(cur, hdr_len);
+ skb_shinfo(cur)->tso_segs = 1;
+ skb_shinfo(cur)->tso_size = 0;
+ TCP_SKB_CB(cur)->seq = TCP_SKB_CB(head)->seq;
+ TCP_SKB_CB(cur)->end_seq = TCP_SKB_CB(head)->seq;
+ TCP_SKB_CB(cur)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(cur)->sacked = 0;
+ cur->ip_summed = head->ip_summed;
+ }
+ return cur;
+}
+
+static inline int tcp_transmit_tso_skb(struct sock *sk, unsigned int mss_now, unsigned int tso_count, struct sk_buff *tso_skb, struct sk_buff *last)
+{
+ int err;
+
+ BUG_ON(!last);
+ if (tso_skb->len > mss_now) {
+ unsigned int len = tso_skb->len;
+
+ len += (mss_now - 1);
+ skb_shinfo(tso_skb)->tso_segs = len / mss_now;
+ skb_shinfo(tso_skb)->tso_size = mss_now;
+ }
+
+ err = tcp_transmit_skb(sk, tso_skb);
+ if (!err) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ int orig;
+
+ sk->sk_send_head = last->next;
+ if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
+ sk->sk_send_head = NULL;
+ tp->snd_nxt = TCP_SKB_CB(tso_skb)->end_seq;
+
+ orig = tp->packets_out;
+ tp->packets_out += tso_count;
+ if (!orig)
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+
+ tcp_minshall_update(tp, mss_now, tso_skb);
+ }
+
+ return err;
+}
+
+/* Transmit N packets starting at SKB. If possible, coalesce packets
+ * into TSO frames, else just send them one-by-one. Return the number
+ * of frames actually sent.
+ */
+static unsigned int segment_and_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, unsigned int n)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *tso_skb, *last;
+ unsigned int sent = 0, tso_count;
+
+#if 0
+ printk("TCP: segment_and_xmit() N(%d) PACKETS_OUT(%d) CWND(%d) WRITE_QLEN(%d) ",
+ n, tp->packets_out, tp->snd_cwnd, skb_queue_len(&sk->sk_write_queue));
+ print_symbol("from(%s)\n", (unsigned long)__builtin_return_address(0));
+#endif
+ tso_skb = alloc_tso_skb(sk, tp, NULL, skb);
+ last = NULL;
+ tso_count = 0;
+ while (n--) {
+ struct sk_buff *next;
+
+ /* tcp_snd_test() guarantees this for us. */
+ BUG_ON(skb == (struct sk_buff *)&sk->sk_write_queue ||
+ after(TCP_SKB_CB(skb)->end_seq,
+ tp->snd_una+tp->snd_wnd));
+
+ if (skb->len > mss_now &&
+ tcp_fragment(sk, skb, mss_now))
+ break;
+
+ /* This must be after the tcp_fragment() call. */
+ next = skb->next;
+
+ /* Always update transmit stamp, even when doing TSO
+ * gathering.
*/
- limit = tp->snd_cwnd;
- if (sysctl_tcp_tso_win_divisor)
- limit /= sysctl_tcp_tso_win_divisor;
- limit = max(1U, limit);
- if (factor > limit)
- factor = limit;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+ if (tso_skb) {
+ int err;
+
+ do_append:
+ err = skb_append_pages(tso_skb, skb);
+ if (!err) {
+ last = skb;
+ tso_count++;
+ goto next_packet;
+ }
+
+ /* Either we hit the MAX_SKB_FRAGS limit, or
+ * we hit a packet that has non-paged data.
+ * Regardless, we first send off the existing
+ * TSO frame we've been building if it contains
+ * any data.
+ */
+ if (tso_skb->len) {
+ TCP_SKB_CB(tso_skb)->when = tcp_time_stamp;
+ if (tcp_transmit_tso_skb(sk, mss_now, tso_count,
+ tso_skb, last))
+ break;
+ tso_skb = NULL;
+ }
+ tso_skb = alloc_tso_skb(sk, tp, tso_skb, skb);
+ last = NULL;
+ tso_count = 0;
+ if (tso_skb && err == -EMSGSIZE)
+ goto do_append;
- tp->mss_cache = mss_now * factor;
+ /* Fallthrough to send the non-paged SKB. */
+ }
+ if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+ break;
+
+ update_send_head(sk, tp, skb);
- mss_now = tp->mss_cache;
+ tcp_minshall_update(tp, mss_now, skb);
+
+ next_packet:
+ sent++;
+
+ skb = next;
+ }
+ if (tso_skb) {
+ if (tso_skb->len) {
+ TCP_SKB_CB(tso_skb)->when = tcp_time_stamp;
+ tcp_transmit_tso_skb(sk, mss_now, tso_count,
+ tso_skb, last);
+ } else
+ kfree_skb(tso_skb);
}
- if (tp->rx_opt.eff_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
- return mss_now;
+ return sent;
+}
+
+/* Send as much of the send queue as possible, possibly coalescing
+ * single frames into TSO frames. Return whether any packets were
+ * sent successfully.
+ */
+static inline unsigned int emit_send_queue(struct sock *sk, int nonagle)
+{
+ struct sk_buff *skb = sk->sk_send_head;
+ unsigned int mss_now = tcp_current_mss(sk);
+ unsigned int packets_allowed;
+ unsigned int sent_pkts = 0;
+
+ if (!skb)
+ goto out;
+
+ packets_allowed = tcp_snd_test(sk, skb, mss_now,
+ tcp_skb_is_last(sk, skb) ?
+ nonagle :
+ TCP_NAGLE_PUSH);
+ if (!packets_allowed)
+ goto out;
+
+ sent_pkts = segment_and_xmit(sk, skb, mss_now, packets_allowed);
+
+out:
+ return sent_pkts;
}
/* This routine writes packets to the network. It advances the
@@ -736,48 +884,14 @@ unsigned int tcp_current_mss(struct sock
*/
int tcp_write_xmit(struct sock *sk, int nonagle)
{
- struct tcp_sock *tp = tcp_sk(sk);
- unsigned int mss_now;
-
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and all
* will be happy.
*/
if (sk->sk_state != TCP_CLOSE) {
- struct sk_buff *skb;
- int sent_pkts = 0;
-
- /* Account for SACKS, we may need to fragment due to this.
- * It is just like the real MSS changing on us midstream.
- * We also handle things correctly when the user adds some
- * IP options mid-stream. Silly to do, but cover it.
- */
- mss_now = tcp_current_mss(sk, 1);
-
- while ((skb = sk->sk_send_head) &&
- tcp_snd_test(sk, skb, mss_now,
- tcp_skb_is_last(sk, skb) ? nonagle :
- TCP_NAGLE_PUSH)) {
- if (skb->len > mss_now) {
- if (tcp_fragment(sk, skb, mss_now))
- break;
- }
-
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
- if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
- break;
-
- /* Advance the send_head. This one is sent out.
- * This call will increment packets_out.
- */
- update_send_head(sk, tp, skb);
-
- tcp_minshall_update(tp, mss_now, skb);
- sent_pkts = 1;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
- if (sent_pkts) {
+ if (emit_send_queue(sk, nonagle)) {
tcp_cwnd_validate(sk, tp);
return 0;
}
@@ -928,9 +1042,6 @@ static void tcp_retrans_try_collapse(str
((skb_size + next_skb_size) > mss_now))
return;
- BUG_ON(tcp_skb_pcount(skb) != 1 ||
- tcp_skb_pcount(next_skb) != 1);
-
/* Ok. We will be able to collapse the packet. */
__skb_unlink(next_skb, next_skb->list);
@@ -954,22 +1065,23 @@ static void tcp_retrans_try_collapse(str
*/
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(next_skb);
+ tp->retrans_out--;
if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
- tp->lost_out -= tcp_skb_pcount(next_skb);
- tp->left_out -= tcp_skb_pcount(next_skb);
+ tp->lost_out--;
+ tp->left_out--;
}
/* Reno case is special. Sigh... */
if (!tp->rx_opt.sack_ok && tp->sacked_out) {
- tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
- tp->left_out -= tcp_skb_pcount(next_skb);
+ tp->sacked_out--;
+ tp->left_out--;
}
/* Not quite right: it can be > snd.fack, but
* it is better to underestimate fackets.
*/
- tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
- tcp_packets_out_dec(tp, next_skb);
+ if (tp->fackets_out)
+ tp->fackets_out--;
+ tcp_packets_out_dec(tp);
sk_stream_free_skb(sk, next_skb);
}
}
@@ -982,7 +1094,7 @@ void tcp_simple_retransmit(struct sock *
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
int lost = 0;
sk_stream_for_retrans_queue(skb, sk) {
@@ -990,11 +1102,11 @@ void tcp_simple_retransmit(struct sock *
!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retrans_out--;
}
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ tp->lost_out++;
lost = 1;
}
}
@@ -1027,7 +1139,7 @@ void tcp_simple_retransmit(struct sock *
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned int cur_mss = tcp_current_mss(sk, 0);
+ unsigned int cur_mss = tcp_current_mss(sk);
int err;
/* Do not sent more than we queued. 1/4 is reserved for possible
@@ -1037,20 +1149,6 @@ int tcp_retransmit_skb(struct sock *sk,
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
return -EAGAIN;
- if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
- if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- BUG();
-
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache_std;
- }
-
- if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
- return -ENOMEM;
- }
-
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
@@ -1061,16 +1159,11 @@ int tcp_retransmit_skb(struct sock *sk,
return -EAGAIN;
if (skb->len > cur_mss) {
- int old_factor = tcp_skb_pcount(skb);
- int new_factor;
-
if (tcp_fragment(sk, skb, cur_mss))
return -ENOMEM; /* We'll try again later. */
/* New SKB created, account for it. */
- new_factor = tcp_skb_pcount(skb);
- tp->packets_out -= old_factor - new_factor;
- tp->packets_out += tcp_skb_pcount(skb->next);
+ tp->packets_out++;
}
/* Collapse two adjacent packets if worthwhile and we can. */
@@ -1079,7 +1172,6 @@ int tcp_retransmit_skb(struct sock *sk,
(skb->next != sk->sk_send_head) &&
(skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
(skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
- (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
(sysctl_tcp_retrans_collapse != 0))
tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -1095,8 +1187,6 @@ int tcp_retransmit_skb(struct sock *sk,
tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
if (!pskb_trim(skb, 0)) {
TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
- skb_shinfo(skb)->tso_segs = 1;
- skb_shinfo(skb)->tso_size = 0;
skb->ip_summed = CHECKSUM_NONE;
skb->csum = 0;
}
@@ -1106,7 +1196,6 @@ int tcp_retransmit_skb(struct sock *sk,
* is still in somebody's hands, else make a clone.
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
pskb_copy(skb, GFP_ATOMIC):
@@ -1125,7 +1214,7 @@ int tcp_retransmit_skb(struct sock *sk,
}
#endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
- tp->retrans_out += tcp_skb_pcount(skb);
+ tp->retrans_out++;
/* Save stamp of the first retransmit. */
if (!tp->retrans_stamp)
@@ -1184,8 +1273,7 @@ void tcp_xmit_retransmit_queue(struct so
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
- packet_cnt -= tcp_skb_pcount(skb);
- if (packet_cnt <= 0)
+ if (--packet_cnt <= 0)
break;
}
}
@@ -1254,7 +1342,7 @@ void tcp_send_fin(struct sock *sk)
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
- mss_now = tcp_current_mss(sk, 1);
+ mss_now = tcp_current_mss(sk);
if (sk->sk_send_head != NULL) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -1510,7 +1598,7 @@ int tcp_connect(struct sock *sk)
skb_header_release(buff);
__skb_queue_tail(&sk->sk_write_queue, buff);
sk_charge_skb(sk, buff);
- tp->packets_out += tcp_skb_pcount(buff);
+ tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
@@ -1655,7 +1743,7 @@ int tcp_write_wakeup(struct sock *sk)
if ((skb = sk->sk_send_head) != NULL &&
before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
int err;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
@@ -1671,19 +1759,10 @@ int tcp_write_wakeup(struct sock *sk)
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
if (tcp_fragment(sk, skb, seg_size))
return -1;
- /* SWS override triggered forced fragmentation.
- * Disable TSO, the connection is too sick. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- sk->sk_route_caps &= ~NETIF_F_TSO;
- tp->mss_cache = tp->mss_cache_std;
- }
- } else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb);
+ }
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
if (!err) {
update_send_head(sk, tp, skb);
--- ./net/ipv4/tcp_input.c.~1~ 2005-04-25 21:39:24.000000000 -0700
+++ ./net/ipv4/tcp_input.c 2005-05-04 22:42:23.000000000 -0700
@@ -805,10 +805,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp,
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd) {
- if (tp->mss_cache_std > 1460)
+ if (tp->mss_cache > 1460)
cwnd = 2;
else
- cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+ cwnd = (tp->mss_cache > 1095) ? 3 : 4;
}
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -974,14 +974,6 @@ tcp_sacktag_write_queue(struct sock *sk,
int flag = 0;
int i;
- /* So, SACKs for already sent large segments will be lost.
- * Not good, but alternative is to resegment the queue. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache_std;
- }
-
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
@@ -1038,7 +1030,7 @@ tcp_sacktag_write_queue(struct sock *sk,
if(!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- fack_count += tcp_skb_pcount(skb);
+ fack_count++;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
@@ -1082,8 +1074,8 @@ tcp_sacktag_write_queue(struct sock *sk,
*/
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->lost_out--;
+ tp->retrans_out--;
}
} else {
/* New sack for not retransmitted frame,
@@ -1095,13 +1087,13 @@ tcp_sacktag_write_queue(struct sock *sk,
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- tp->lost_out -= tcp_skb_pcount(skb);
+ tp->lost_out--;
}
}
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
flag |= FLAG_DATA_SACKED;
- tp->sacked_out += tcp_skb_pcount(skb);
+ tp->sacked_out++;
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
@@ -1118,7 +1110,7 @@ tcp_sacktag_write_queue(struct sock *sk,
if (dup_sack &&
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retrans_out--;
}
}
}
@@ -1142,12 +1134,12 @@ tcp_sacktag_write_queue(struct sock *sk,
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
- tp->mss_cache_std))) {
+ tp->mss_cache))) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retrans_out--;
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
- tp->lost_out += tcp_skb_pcount(skb);
+ tp->lost_out++;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
flag |= FLAG_DATA_SACKED;
NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
@@ -1222,7 +1214,7 @@ static void tcp_enter_frto_loss(struct s
tp->fackets_out = 0;
sk_stream_for_retrans_queue(skb, sk) {
- cnt += tcp_skb_pcount(skb);
+ cnt++;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
@@ -1232,10 +1224,10 @@ static void tcp_enter_frto_loss(struct s
if (!after(TCP_SKB_CB(skb)->end_seq,
tp->frto_highmark)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ tp->lost_out++;
}
} else {
- tp->sacked_out += tcp_skb_pcount(skb);
+ tp->sacked_out++;
tp->fackets_out = cnt;
}
}
@@ -1297,16 +1289,16 @@ void tcp_enter_loss(struct sock *sk, int
tp->undo_marker = tp->snd_una;
sk_stream_for_retrans_queue(skb, sk) {
- cnt += tcp_skb_pcount(skb);
+ cnt++;
if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ tp->lost_out++;
} else {
- tp->sacked_out += tcp_skb_pcount(skb);
+ tp->sacked_out++;
tp->fackets_out = cnt;
}
}
@@ -1542,12 +1534,12 @@ static void tcp_mark_head_lost(struct so
BUG_TRAP(cnt <= tp->packets_out);
sk_stream_for_retrans_queue(skb, sk) {
- cnt -= tcp_skb_pcount(skb);
+ cnt--;
if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ tp->lost_out++;
}
}
tcp_sync_left_out(tp);
@@ -1578,7 +1570,7 @@ static void tcp_update_scoreboard(struct
if (tcp_skb_timedout(tp, skb) &&
!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ tp->lost_out++;
}
}
tcp_sync_left_out(tp);
@@ -2170,7 +2162,7 @@ static void vegas_cong_avoid(struct tcp_
* is the cwnd during the previous RTT.
*/
old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
- tp->mss_cache_std;
+ tp->mss_cache;
old_snd_cwnd = tp->vegas.beg_snd_cwnd;
/* Save the extent of the current window so we can use this
@@ -2348,72 +2340,6 @@ static inline void tcp_ack_packets_out(s
}
}
-/* There is one downside to this scheme. Although we keep the
- * ACK clock ticking, adjusting packet counters and advancing
- * congestion window, we do not liberate socket send buffer
- * space.
- *
- * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
- * then making a write space wakeup callback is a possible
- * future enhancement. WARNING: it is not trivial to make.
- */
-static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
- __u32 now, __s32 *seq_rtt)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- __u32 seq = tp->snd_una;
- __u32 packets_acked;
- int acked = 0;
-
- /* If we get here, the whole TSO packet has not been
- * acked.
- */
- BUG_ON(!after(scb->end_seq, seq));
-
- packets_acked = tcp_skb_pcount(skb);
- if (tcp_trim_head(sk, skb, seq - scb->seq))
- return 0;
- packets_acked -= tcp_skb_pcount(skb);
-
- if (packets_acked) {
- __u8 sacked = scb->sacked;
-
- acked |= FLAG_DATA_ACKED;
- if (sacked) {
- if (sacked & TCPCB_RETRANS) {
- if (sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= packets_acked;
- acked |= FLAG_RETRANS_DATA_ACKED;
- *seq_rtt = -1;
- } else if (*seq_rtt < 0)
- *seq_rtt = now - scb->when;
- if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= packets_acked;
- if (sacked & TCPCB_LOST)
- tp->lost_out -= packets_acked;
- if (sacked & TCPCB_URG) {
- if (tp->urg_mode &&
- !before(seq, tp->snd_up))
- tp->urg_mode = 0;
- }
- } else if (*seq_rtt < 0)
- *seq_rtt = now - scb->when;
-
- if (tp->fackets_out) {
- __u32 dval = min(tp->fackets_out, packets_acked);
- tp->fackets_out -= dval;
- }
- tp->packets_out -= packets_acked;
-
- BUG_ON(tcp_skb_pcount(skb) == 0);
- BUG_ON(!before(scb->seq, scb->end_seq));
- }
-
- return acked;
-}
-
-
/* Remove acknowledged frames from the retransmission queue. */
static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
@@ -2432,12 +2358,8 @@ static int tcp_clean_rtx_queue(struct so
* discard it as it's confirmed to have arrived at
* the other end.
*/
- if (after(scb->end_seq, tp->snd_una)) {
- if (tcp_skb_pcount(skb) > 1)
- acked |= tcp_tso_acked(sk, skb,
- now, &seq_rtt);
+ if (after(scb->end_seq, tp->snd_una))
break;
- }
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -2456,15 +2378,15 @@ static int tcp_clean_rtx_queue(struct so
if (sacked) {
if (sacked & TCPCB_RETRANS) {
if(sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retrans_out--;
acked |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1;
} else if (seq_rtt < 0)
seq_rtt = now - scb->when;
if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= tcp_skb_pcount(skb);
+ tp->sacked_out--;
if (sacked & TCPCB_LOST)
- tp->lost_out -= tcp_skb_pcount(skb);
+ tp->lost_out--;
if (sacked & TCPCB_URG) {
if (tp->urg_mode &&
!before(scb->end_seq, tp->snd_up))
@@ -2472,8 +2394,9 @@ static int tcp_clean_rtx_queue(struct so
}
} else if (seq_rtt < 0)
seq_rtt = now - scb->when;
- tcp_dec_pcount_approx(&tp->fackets_out, skb);
- tcp_packets_out_dec(tp, skb);
+ if (tp->fackets_out)
+ tp->fackets_out--;
+ tcp_packets_out_dec(tp);
__skb_unlink(skb, skb->list);
sk_stream_free_skb(sk, skb);
}
@@ -2799,19 +2722,19 @@ static void westwood_dupack_update(struc
{
struct tcp_sock *tp = tcp_sk(sk);
- tp->westwood.accounted += tp->mss_cache_std;
- tp->westwood.cumul_ack = tp->mss_cache_std;
+ tp->westwood.accounted += tp->mss_cache;
+ tp->westwood.cumul_ack = tp->mss_cache;
}
static inline int westwood_may_change_cumul(struct tcp_sock *tp)
{
- return (tp->westwood.cumul_ack > tp->mss_cache_std);
+ return (tp->westwood.cumul_ack > tp->mss_cache);
}
static inline void westwood_partial_update(struct tcp_sock *tp)
{
tp->westwood.accounted -= tp->westwood.cumul_ack;
- tp->westwood.cumul_ack = tp->mss_cache_std;
+ tp->westwood.cumul_ack = tp->mss_cache;
}
static inline void westwood_complete_update(struct tcp_sock *tp)
@@ -3952,7 +3875,7 @@ static void tcp_new_space(struct sock *s
!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
!tcp_memory_pressure &&
atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
- int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+ int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
demanded = max_t(unsigned int, tp->snd_cwnd,
tp->reordering + 1);
--- ./net/ipv4/tcp.c.~1~ 2005-04-20 10:18:18.000000000 -0700
+++ ./net/ipv4/tcp.c 2005-05-04 22:23:45.000000000 -0700
@@ -646,7 +646,7 @@ static ssize_t do_tcp_sendpages(struct s
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ mss_now = tcp_current_mss(sk);
copied = 0;
err = -EPIPE;
@@ -702,7 +702,8 @@ new_segment:
skb->ip_summed = CHECKSUM_HW;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
if (!copied)
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
@@ -718,8 +719,15 @@ new_segment:
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
- } else if (skb == sk->sk_send_head)
- tcp_push_one(sk, mss_now);
+ } else if (skb == sk->sk_send_head) {
+ /* If we can potentially do TSO, it is better to queue
+ * things up and accumulate than to push the frame right
+ * now.
+ */
+ if (!(sk->sk_route_caps & NETIF_F_TSO) ||
+ tp->urg_mode)
+ tcp_push_one(sk, mss_now);
+ }
continue;
wait_for_sndbuf:
@@ -731,7 +739,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ mss_now = tcp_current_mss(sk);
}
out:
@@ -773,15 +781,11 @@ ssize_t tcp_sendpage(struct socket *sock
static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
- int tmp = tp->mss_cache_std;
+ int tmp = tp->mss_cache;
- if (sk->sk_route_caps & NETIF_F_SG) {
- int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+ if (sk->sk_route_caps & NETIF_F_SG)
+ tmp = 0;
- if (tmp >= pgbreak &&
- tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
- tmp = pgbreak;
- }
return tmp;
}
@@ -810,7 +814,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ mss_now = tcp_current_mss(sk);
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
@@ -949,7 +953,8 @@ new_segment:
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->tso_segs = 1;
+ skb_shinfo(skb)->tso_size = 0;
from += copy;
copied += copy;
@@ -962,8 +967,15 @@ new_segment:
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
- } else if (skb == sk->sk_send_head)
- tcp_push_one(sk, mss_now);
+ } else if (skb == sk->sk_send_head) {
+ /* If we can potentially do TSO, it is better to queue
+ * things up and accumulate than to push the frame right
+ * now.
+ */
+ if (!(sk->sk_route_caps & NETIF_F_TSO) ||
+ tp->urg_mode)
+ tcp_push_one(sk, mss_now);
+ }
continue;
wait_for_sndbuf:
@@ -975,7 +987,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ mss_now = tcp_current_mss(sk);
}
}
@@ -2135,7 +2147,7 @@ void tcp_get_info(struct sock *sk, struc
info->tcpi_rto = jiffies_to_usecs(tp->rto);
info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
- info->tcpi_snd_mss = tp->mss_cache_std;
+ info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = tp->ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
@@ -2185,7 +2197,7 @@ int tcp_getsockopt(struct sock *sk, int
switch (optname) {
case TCP_MAXSEG:
- val = tp->mss_cache_std;
+ val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
break;
--- ./net/ipv4/tcp_ipv4.c.~1~ 2005-05-03 14:36:08.000000000 -0700
+++ ./net/ipv4/tcp_ipv4.c 2005-05-04 15:17:16.000000000 -0700
@@ -2060,7 +2060,7 @@ static int tcp_v4_init_sock(struct sock
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache_std = tp->mss_cache = 536;
+ tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
--- ./net/ipv6/tcp_ipv6.c.~1~ 2005-05-03 14:36:44.000000000 -0700
+++ ./net/ipv6/tcp_ipv6.c 2005-05-04 15:17:09.000000000 -0700
@@ -2021,7 +2021,7 @@ static int tcp_v6_init_sock(struct sock
*/
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache_std = tp->mss_cache = 536;
+ tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
* Re: [PATCH] TSO Reloaded
From: David S. Miller @ 2005-05-05 22:35 UTC (permalink / raw)
To: netdev
Is anyone testing this or is everyone too scared? :-)
A lot of people wanted this TSO stuff fixed, so I spent
time implementing the fix. Yet nobody seems interested
enough to even say "it works" or "it's busted" :-/
* Re: [PATCH] TSO Reloaded
From: Jon Mason @ 2005-05-05 22:56 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
On Thursday 05 May 2005 05:35 pm, David S. Miller wrote:
>
> Is anyone testing this or is everyone too scared? :-)
>
> A lot of people wanted this TSO stuff fixed, so I spent
> time implementing the fix. Yet nobody seems interested
> enough to even say "it works" or "it's busted" :-/
I can give it a quick sniff on e1000 and r8169 (on amd64), though only to
verify that it works (i.e., no real perf testing). Is that acceptable?
Thanks,
Jon
* Re: [PATCH] TSO Reloaded
From: Anton Blanchard @ 2005-05-05 23:11 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
> Is anyone testing this or is everyone too scared? :-)
>
> A lot of people wanted this TSO stuff fixed, so I spent
> time implementing the fix. Yet nobody seems interested
> enough to even say "it works" or "it's busted" :-/
When I can get my machines booting to userspace, I'm definitely going to
try it out :)
Anton
* Re: [PATCH] TSO Reloaded
From: David S. Miller @ 2005-05-05 23:14 UTC (permalink / raw)
To: Anton Blanchard; +Cc: netdev
On Fri, 6 May 2005 09:11:41 +1000
Anton Blanchard <anton@samba.org> wrote:
>
> > Is anyone testing this or is everyone too scared? :-)
> >
> > A lot of people wanted this TSO stuff fixed, so I spent
> > time implementing the fix. Yet nobody seems interested
> > enough to even say "it works" or "it's busted" :-/
>
> When I can get my machines booting to userspace Im definitely going to
> try it out :)
Do you need anything besides that early-packet fix from
Herbert?
* Re: [PATCH] TSO Reloaded
From: Nivedita Singhvi @ 2005-05-05 23:18 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
David S. Miller wrote:
> Is anyone testing this or is everyone too scared? :-)
>
> A lot of people wanted this TSO stuff fixed, so I spent
> time implementing the fix. Yet nobody seems interested
> enough to even say "it works" or "it's busted" :-/
>
It's in the queue, Dave :).
Let you know if there are problems...
thanks,
Nivedita
* Re: [PATCH] TSO Reloaded
From: Stephen Hemminger @ 2005-05-05 23:24 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
On Thu, 5 May 2005 15:35:32 -0700
"David S. Miller" <davem@davemloft.net> wrote:
>
> Is anyone testing this or is everyone too scared? :-)
>
> A lot of people wanted this TSO stuff fixed, so I spent
> time implementing the fix. Yet nobody seems interested
> enough to even say "it works" or "it's busted" :-/
It works on a tg3 that is too stupid to do TSO, at least. More tests coming.
* Re: [PATCH] TSO Reloaded
From: David S. Miller @ 2005-05-05 23:16 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
On Thu, 5 May 2005 16:24:48 -0700
Stephen Hemminger <shemminger@osdl.org> wrote:
> On Thu, 5 May 2005 15:35:32 -0700
> "David S. Miller" <davem@davemloft.net> wrote:
>
> >
> > Is anyone testing this or is everyone too scared? :-)
> >
> > A lot of people wanted this TSO stuff fixed, so I spent
> > time implementing the fix. Yet nobody seems interested
> > enough to even say "it works" or "it's busted" :-/
>
> It works on tg3 that is too stupid to do TSO at least. More tests coming
It's off by default; "ethtool -K eth0 tso on" is necessary.
I'll change that soon.
Or do you have a chip revision that really can't do TSO?
* RE: [PATCH] TSO Reloaded
From: Leonid Grossman @ 2005-05-06 3:20 UTC (permalink / raw)
To: 'David S. Miller', netdev
We will be testing on 10GbE NICs in the next couple of weeks; we will let you
know.
BTW - any plans for IPv6 support?
> -----Original Message-----
> From: netdev-bounce@oss.sgi.com [mailto:netdev-bounce@oss.sgi.com] On
> Behalf Of David S. Miller
> Sent: Thursday, May 05, 2005 3:36 PM
> To: netdev@oss.sgi.com
> Subject: Re: [PATCH] TSO Reloaded
>
>
> Is anyone testing this or is everyone too scared? :-)
>
> A lot of people wanted this TSO stuff fixed, so I spent
> time implementing the fix. Yet nobody seems interested
> enough to even say "it works" or "it's busted" :-/
* Re: [PATCH] TSO Reloaded
From: David S. Miller @ 2005-05-06 3:30 UTC (permalink / raw)
To: Leonid Grossman; +Cc: netdev
On Thu, 5 May 2005 20:20:56 -0700
"Leonid Grossman" <leonid.grossman@neterion.com> wrote:
> We will be testing on 10GbE NICs in the next couple weeks; will let you
> know.
> BTW - any plans for IPv6 support?
What exactly does your NIC expect? Do you use the
NETIF_F_HW_CSUM flag to indicate generic checksumming support?
Otherwise, there is no other way to support ipv6 checksum offload
at the moment, and that is a requirement for ipv6 TSO.
For TSO, the ipv6 header handling seems very non-trivial.
What is supposed to happen in cases where certain optional
extension headers should be present in some of the frames
but not the others?
A specification of how your NIC supports ipv6 TSO is necessary
in order for support to be written, see? Seems like you are
the most qualified person to write the support, therefore :-)
Really, it isn't that hard and you have something on which
to test whatever you write, whereas I don't.
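As a rough sketch of the prerequisite (the function name and where it
would live are assumptions, not any real driver code), a NIC driver
would have to advertise generic checksumming before ipv6 TSO could even
be considered:

	static void example_set_offload_caps(struct net_device *dev)
	{
		/* NETIF_F_IP_CSUM only covers ipv4; generic checksum
		 * offload (NETIF_F_HW_CSUM) is what an ipv6 TSO path
		 * would have to rely on.
		 */
		dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_TSO;
	}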
* RE: [PATCH] TSO Reloaded
From: Leonid Grossman @ 2005-05-06 14:09 UTC (permalink / raw)
To: 'David S. Miller'; +Cc: netdev
> -----Original Message-----
> From: David S. Miller [mailto:davem@davemloft.net]
> Sent: Thursday, May 05, 2005 8:31 PM
> To: Leonid Grossman
> Cc: netdev@oss.sgi.com
> Subject: Re: [PATCH] TSO Reloaded
>
> On Thu, 5 May 2005 20:20:56 -0700
> "Leonid Grossman" <leonid.grossman@neterion.com> wrote:
>
> > We will be testing on 10GbE NICs in the next couple weeks; will let you
> > know.
> > BTW - any plans for IPv6 support?
>
> What exactly does your NIC expect? Do you use the
> NETIF_F_HW_CSUM flag to indicate generic checksumming support?
> Otherwise, there is no other way to support ipv6 checksum offload
> at the moment, and that is a requirement for ipv6 TSO.
>
> For TSO, the ipv6 header handling seems very non-trivial.
> What is supposed to happen in cases where certain optional
> extension headers should be present in some of the frames
> but not the others?
Our ASIC supports ipv6 CSUM and TSO (and header splitting) even if extension
headers are present, but I suspect the majority of ipv6-capable NICs will not
implement this; the stack needs to query NIC header-processing capabilities
(for both CSUM and TSO) and act accordingly.
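A hypothetical sketch of such a query (the NETIF_F_TSO6-style flag shown
here is an assumption; no such flag existed at the time of this thread):

	static inline int tcp_v6_can_tso(const struct sock *sk)
	{
		/* Both generic checksum offload and an ipv6-capable TSO
		 * flag would need to be advertised by the driver.
		 */
		return (sk->sk_route_caps & NETIF_F_HW_CSUM) &&
		       (sk->sk_route_caps & NETIF_F_TSO6);
	}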
>
> A specification of how your NIC support ipv6 TSO is necessary
> in order for support to be written, see?
We are planning to release the ASIC programming manual to the community
fairly soon, this will provide a better view on IPv6 LSO and some other
features.
>Seems like you are
> the most qualified person to write the support, therefore :-)
> Really, it isn't that hard and you have something on which
> to test whatever you write, whereas I don't.
We will probably get to it sometime down the road; at the moment, support for
UDP LSO and receive side offloads is much higher on our list.
Getting a vanilla (no support for extension headers) implementation from
someone who knows the stack better than we do would be a good thing :-); I
suspect this would be useful for more than one NIC vendor.
Leonid
* Re: [PATCH] TSO Reloaded
From: Anton Blanchard @ 2005-05-06 4:36 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
Hi Dave,
> Ok, here it is, first draft of the new TSO handling
> I promised so long ago :-) I was lazy and waited until
> today to implement the entire thing.
I fired off socklib (just a TCP stream test) and things look much
better. I dumped the size of the data+frags for each packet that the
e1000 created. It looks like frags are working in multiples of MTU, but
we also have to split when we hit page boundaries. Is this expected?
BTW the 4 bytes at the end is due to an e1000 workaround for a hw bug.
Thanks for working on this :)
Anton
data: 42 frags:
data: 74 frags:
data: 66 frags: 1448 1444 4
data: 66 frags: 1024 424 1444 4
data: 66 frags: 1448 512 932 4
data: 66 frags: 1448 1444 4
data: 66 frags: 1448
data: 66 frags: 1448 1024 424 1448 1448 512 936 1444 4
data: 66 frags: 1448 1444 4
data: 66 frags: 1448 1024 420 4
data: 66 frags: 1448 1444 4
data: 66 frags: 512 936 1448 1448 1448 1448 1024 424 1448 1448 512 936 1444 4
data: 66 frags: 1448 1448 1444 4
data: 66 frags: 1024 424 1448 1448 512 936 1448 1448 1444 4
data: 66 frags: 1448 1024 424 1448 1448 372 4
data: 66 frags: 128 1320 1448 1152 296 1448 1448 640 808 1444 4
data: 66 frags: 1448 128 1320 1444 4
data: 66 frags: 1152 296 1448 1448 640 804 4
data: 66 frags: 1448 1448 128 1316 4
data: 66 frags: 1448 1152 296 1444 4
data: 66 frags: 1448 640 808 1448 1448 128 1320 1448 1152 296 1448 1448 640 808 1448 1444 4
data: 66 frags: 128 1320 1448 1152 296 1448 1444 4
data: 66 frags: 640 808 1448 1444 4
data: 66 frags: 128 1320 1448 1152 296 1448 1448 372 4
data: 66 frags: 256 1192 1448 1280 168 1448 1448 768 680 1448 1448 256 1192 1448 1280 168 1444 4
data: 66 frags: 1448 768 680 1448 1444 4
data: 66 frags: 256 1192 1448 1280 168 1448 1444 4
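A dump in the format above can be produced with instrumentation along these
lines; this is a sketch rather than Anton's actual debug patch, and
dump_skb_frags() is a made-up name for something called from the driver's
hard_start_xmit routine.

#include <linux/skbuff.h>

static void dump_skb_frags(const struct sk_buff *skb)
{
	int i;

	/* linear ("data") part first, then each page frag's length */
	printk(KERN_DEBUG "%s data: %u frags:",
	       skb->dev->name, skb_headlen(skb));
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		printk(" %u", (unsigned int)skb_shinfo(skb)->frags[i].size);
	printk("\n");
}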
* Re: [PATCH] TSO Reloaded
2005-05-06 4:36 ` Anton Blanchard
@ 2005-05-06 5:45 ` Anton Blanchard
2005-05-06 6:13 ` David S. Miller
0 siblings, 1 reply; 27+ messages in thread
From: Anton Blanchard @ 2005-05-06 5:45 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
Hi,
With a 9000-byte MTU I see page-sized frags as expected. There are still
some pages broken in two, which is interesting.
Anton
data: 66 frags: 3584 4096 1268 2816 4096 2032 4
data: 66 frags: 2048 4096 2804 1280 4096 3568 4
data: 66 frags: 512 4096 4096 244 3840 4096 1008 4
data: 66 frags: 3072 4096 1780 2304 4096 2544 4
data: 66 frags: 1536 4096 3316 768 4096 4080 4
data: 66 frags: 4096 4096 756 3328 4096 1520 4
data: 66 frags: 2560 4096 2292 1792 4096 3056 4
data: 66 frags: 1024 4096 3828 256 4096 4096 496 4
data: 66 frags: 3584 4096 1268 2816 4096 2032 4
data: 66 frags: 2048 4096 2804 1280 4096 3568 4
data: 66 frags: 512 4096 4096 244 3840 4096 1008 4
data: 66 frags: 3072 4096 1780 2304 4096 2544 4
data: 66 frags: 1536 4096 3316 768 4096 4080 4
data: 66 frags: 4096 4096 756 3328 4096 1520 4
data: 66 frags: 2560 4096 2292 1792 4096 3056 4
data: 66 frags: 1024 4096 3828 256 4096 4096 496 4
data: 66 frags: 3584 4096 1268 2816 4096 2032 4
* Re: [PATCH] TSO Reloaded
2005-05-06 5:45 ` Anton Blanchard
@ 2005-05-06 6:13 ` David S. Miller
2005-05-06 7:10 ` Anton Blanchard
0 siblings, 1 reply; 27+ messages in thread
From: David S. Miller @ 2005-05-06 6:13 UTC (permalink / raw)
To: Anton Blanchard; +Cc: netdev
On Fri, 6 May 2005 15:45:16 +1000
Anton Blanchard <anton@samba.org> wrote:
> With a 9000 byte MTU I see page sized frags as expected. There are still
> some pages broken in two which is interesting.
I know what is causing the breaks, it's the "cache line align" crap
in tcp_sendmsg().
Comment out this alignment code in net/ipv4/tcp.c and see how it goes:
/* If page is cached, align
* offset to L1 cache boundary
*/
off = (off + L1_CACHE_BYTES - 1) &
~(L1_CACHE_BYTES - 1);
This alignment causes TSO coalescing to fail at these spots.
You should not see this effect when sendfile() is used for
the transfer.
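The reason the rounding breaks coalescing: merging new data into the last
page frag requires the new offset to line up exactly with the end of that
frag. The check is roughly the following (paraphrased from
skb_can_coalesce() in include/linux/skbuff.h of that era, so treat the exact
body as an approximation):

static inline int skb_can_coalesce(struct sk_buff *skb, int i,
				   struct page *page, int off)
{
	if (i) {
		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1];

		/* Mergeable only if the new chunk starts exactly where the
		 * previous frag ends; rounding 'off' up to a cache line
		 * breaks the equality, so a new frag (and eventually a new
		 * packet) gets started instead. */
		return page == frag->page &&
		       off == frag->page_offset + frag->size;
	}
	return 0;
}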
* Re: [PATCH] TSO Reloaded
2005-05-06 6:13 ` David S. Miller
@ 2005-05-06 7:10 ` Anton Blanchard
2005-05-06 8:19 ` David S. Miller
0 siblings, 1 reply; 27+ messages in thread
From: Anton Blanchard @ 2005-05-06 7:10 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
> I know what is causing the breaks, it's the "cache line align" crap
> in tcp_sendmsg().
>
> Comment out this alignment code in net/ipv4/tcp.c and see how it goes:
>
> /* If page is cached, align
> * offset to L1 cache boundary
> */
> off = (off + L1_CACHE_BYTES - 1) &
> ~(L1_CACHE_BYTES - 1);
>
> This alignment causes TSO coalescing to fail at these spots.
>
> You should not see this effect when sendfile() is used for
> the transfer.
Nice work! It's looking much better now.
Anton
1500 MTU:
eth0 data: 74 frags:
eth0 data: 66 frags: 2892 4
eth0 data: 66 frags: 1200 1692 4
eth0 data: 66 frags: 2400 492 4
eth0 data: 66 frags: 2892 4
eth0 data: 66 frags: 704 744
eth0 data: 66 frags: 3352 4096 1236 4
eth0 data: 66 frags: 2856 36 4
1500 MTU+sendfile:
eth0 data: 74 frags:
eth0 data: 66 frags: 2892 4
eth0 data: 66 frags: 1200 1692 4
eth0 data: 66 frags: 2400 492 4
eth0 data: 66 frags: 2892 4
eth0 data: 66 frags: 704 744
eth0 data: 66 frags: 3348 4
eth0 data: 66 frags: 4092 4
9000 MTU:
eth0 data: 74 frags:
eth0 data: 66 frags: 4096 4096 4096 4096 1508 4
eth0 data: 66 frags: 2584 4096 2268
eth0 data: 66 frags: 1828 4096 3024
eth0 data: 66 frags: 1072 4096 4096 4096 4096 4096 4096 1192 4
eth0 data: 66 frags: 2900 4096 4096 4096 2704 4
eth0 data: 66 frags: 1388 4096 3464
eth0 data: 66 frags: 632 4096 4096 4096 4096 876 4
9000 MTU+sendfile:
eth0 data: 74 frags:
eth0 data: 66 frags: 4096 4096 4092 4
eth0 data: 66 frags: 4096 4096 4096 4096 1508 4
eth0 data: 66 frags: 2584 4096 4092 4
eth0 data: 66 frags: 4096 4096 4096 4096 1508 4
eth0 data: 66 frags: 2584 4096 4096 4096 3020 4
eth0 data: 66 frags: 1072 4096 4096 4096 4096 436 4
eth0 data: 66 frags: 3656 4096 4096 4096 1948 4
* Re: [PATCH] TSO Reloaded
2005-05-06 7:10 ` Anton Blanchard
@ 2005-05-06 8:19 ` David S. Miller
2005-05-06 8:36 ` Anton Blanchard
0 siblings, 1 reply; 27+ messages in thread
From: David S. Miller @ 2005-05-06 8:19 UTC (permalink / raw)
To: Anton Blanchard; +Cc: netdev
On Fri, 6 May 2005 17:10:05 +1000
Anton Blanchard <anton@samba.org> wrote:
> Nice work! It's looking much better now.
Great, thanks for testing.
BTW:
> eth0 data: 66 frags: 2892 4
What does this "66 frags" mean? Is it "tso_segs" from
the SKB? skb_shinfo(skb)->nr_frags can never be that
large (MAX_SKB_FRAGS is (65536/PAGE_SIZE + 2)) so it
can't be that.
* Re: [PATCH] TSO Reloaded
2005-05-06 8:19 ` David S. Miller
@ 2005-05-06 8:36 ` Anton Blanchard
0 siblings, 0 replies; 27+ messages in thread
From: Anton Blanchard @ 2005-05-06 8:36 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
> > eth0 data: 66 frags: 2892 4
>
> What does this "66 frags" mean? Is it "tso_segs" from
> the SKB? skb_shinfo(skb)->nr_frags can never be that
> large (MAX_SKB_FRAGS is (65536/PAGE_SIZE + 2)) so it
> can't be that.
Nah, I just dump the skb mapping, then the subsequent frags. So 66 is the
size of the header. In this case there are only 2 frags.
Anton
* Re: [PATCH] TSO Reloaded
2005-05-05 6:07 [PATCH] TSO Reloaded David S. Miller
2005-05-05 22:35 ` David S. Miller
2005-05-06 4:36 ` Anton Blanchard
@ 2005-05-18 2:26 ` John Heffner
2005-05-18 2:28 ` David S. Miller
2 siblings, 1 reply; 27+ messages in thread
From: John Heffner @ 2005-05-18 2:26 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
Though it may be moot by now, I just ran some tests of my own with this
patch. This was on a dual 2.4 GHz Xeon booted with a UP kernel,
running iperf over an e1000 at a 1500 byte MTU. I measured idle CPU by
running a process which sits in a gettimeofday() loop. Numbers are CPU
utilization, all turned out +/- 1%.
No TSO: 86.6%
Old TSO: 61.0%
New TSO: 88.5%
-John
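The idle-CPU measurement described above can be done with a loop along these
lines; this is an illustrative userspace sketch, not the actual tool used.
Count gettimeofday() iterations per interval under load and compare against
the rate the same loop achieves on an idle machine; utilization is 100%
minus that ratio.

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	struct timeval start, now;
	unsigned long iters = 0;

	gettimeofday(&start, NULL);
	do {
		gettimeofday(&now, NULL);
		iters++;
	} while (now.tv_sec - start.tv_sec < 10);

	/* Divide by the iteration rate of an otherwise idle box to get the
	 * fraction of CPU left over; 100% minus that is the utilization. */
	printf("%lu gettimeofday() calls in ~10s (%.0f/sec)\n",
	       iters, iters / 10.0);
	return 0;
}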
* Re: [PATCH] TSO Reloaded
2005-05-18 2:26 ` John Heffner
@ 2005-05-18 2:28 ` David S. Miller
2005-05-18 2:51 ` John Heffner
0 siblings, 1 reply; 27+ messages in thread
From: David S. Miller @ 2005-05-18 2:28 UTC (permalink / raw)
To: jheffner; +Cc: netdev
From: John Heffner <jheffner@psc.edu>
Date: Tue, 17 May 2005 22:26:09 -0400
> Though it may be moot by now, I just ran some tests of my own with this
> patch. This was on a dual 2.4 GHz Xeon booted with a UP kernel,
> running iperf over an e1000 at a 1500 byte MTU. I measured idle CPU by
> running a process which sits in a gettimeofday() loop. Numbers are CPU
> utilization, all turned out +/- 1%.
>
> No TSO: 86.6%
> Old TSO: 61.0%
> New TSO: 88.5%
Yeah, TSO Reloaded really stinks. :-)
Try the "Super TSO" patch I just posted instead.
* Re: [PATCH] TSO Reloaded
2005-05-18 2:28 ` David S. Miller
@ 2005-05-18 2:51 ` John Heffner
2005-05-18 3:00 ` David S. Miller
0 siblings, 1 reply; 27+ messages in thread
From: John Heffner @ 2005-05-18 2:51 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
On May 17, 2005, at 10:28 PM, David S. Miller wrote:
> From: John Heffner <jheffner@psc.edu>
> Date: Tue, 17 May 2005 22:26:09 -0400
>
>> Though it may be moot by now, I just ran some tests of my own with
>> this
>> patch. This was on a dual 2.4 GHz Xeon booted with a UP kernel,
>> running iperf over an e1000 at a 1500 byte MTU. I measured idle CPU
>> by
>> running a process which sits in a gettimeofday() loop. Numbers are
>> CPU
>> utilization, all turned out +/- 1%.
>>
>> No TSO: 86.6%
>> Old TSO: 61.0%
>> New TSO: 88.5%
>
> Yeah, TSO Reloaded really stinks. :-)
>
> Try the "Super TSO" patch I just posted instead.
Interesting timing. :)
With the new patch I'm getting 78.0%.
-John
* Re: [PATCH] TSO Reloaded
2005-05-18 2:51 ` John Heffner
@ 2005-05-18 3:00 ` David S. Miller
2005-05-18 3:47 ` John Heffner
0 siblings, 1 reply; 27+ messages in thread
From: David S. Miller @ 2005-05-18 3:00 UTC (permalink / raw)
To: jheffner; +Cc: netdev
From: John Heffner <jheffner@psc.edu>
Date: Tue, 17 May 2005 22:51:53 -0400
> On May 17, 2005, at 10:28 PM, David S. Miller wrote:
>
> > From: John Heffner <jheffner@psc.edu>
> > Date: Tue, 17 May 2005 22:26:09 -0400
> >
> >> No TSO: 86.6%
> >> Old TSO: 61.0%
> >> New TSO: 88.5%
> >
> > Yeah, TSO Reloaded really stinks. :-)
> >
> > Try the "Super TSO" patch I just posted instead.
>
> Interesting timing. :)
> With the new patch I'm getting 78.0%.
Thanks for testing.
I actually expected the new code to do better.
Splitting could be a little bit expensive, but
that only occurs at the beginning of the connection
as we ramp up the congestion and send window.
Afterwards, we should be able to release full unsplit
TSO frames onto the wire.
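Schematically, the behaviour described above looks like the sketch below;
the helper names are made up, not the actual patch code: a TSO super-frame
is only carved down when the congestion/send window cannot cover all of it,
which mostly happens while cwnd is still ramping up.

/* Shape of the logic only; split_tso_frame() and transmit_frame() are
 * hypothetical stand-ins for whatever the patch actually calls. */
static void push_one_frame(struct sock *sk, struct sk_buff *skb,
			   unsigned int window_bytes, unsigned int mss_now)
{
	if (skb->len > window_bytes)
		/* early in the connection: carve off only what the
		 * congestion/send window can take right now */
		split_tso_frame(sk, skb, window_bytes, mss_now);

	/* steady state: the full, unsplit TSO frame goes to the driver */
	transmit_frame(sk, skb, mss_now);
}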
* Re: [PATCH] TSO Reloaded
2005-05-18 3:00 ` David S. Miller
@ 2005-05-18 3:47 ` John Heffner
2005-05-18 5:59 ` David S. Miller
0 siblings, 1 reply; 27+ messages in thread
From: John Heffner @ 2005-05-18 3:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
On May 17, 2005, at 11:00 PM, David S. Miller wrote:
> From: John Heffner <jheffner@psc.edu>
> Date: Tue, 17 May 2005 22:51:53 -0400
>
>> With the new patch I'm getting 78.0%.
>
> Thanks for testing.
>
> I actually expected the new code to do better.
> Splitting could be a little bit expensive, but
> that only occurs at the beginning of the connection
> as we ramp up the congestion and send window.
> Afterwards, we should be able to release full unsplit
> TSO frames onto the wire.
With different (larger than default) buffer sizes, I'm getting 63.4%.
Not surprising I guess.
-John
* Re: [PATCH] TSO Reloaded
2005-05-18 3:47 ` John Heffner
@ 2005-05-18 5:59 ` David S. Miller
2005-05-18 13:48 ` John Heffner
0 siblings, 1 reply; 27+ messages in thread
From: David S. Miller @ 2005-05-18 5:59 UTC (permalink / raw)
To: jheffner; +Cc: netdev
From: John Heffner <jheffner@psc.edu>
Date: Tue, 17 May 2005 23:47:48 -0400
> On May 17, 2005, at 11:00 PM, David S. Miller wrote:
>
> > I actually expected the new code to do better.
> > Splitting could be a little bit expensive, but
> > that only occurs at the beginning of the connection
> > as we ramp up the congestion and send window.
> > Afterwards, we should be able to release full unsplit
> > TSO frames onto the wire.
>
> With different (larger than default) buffer sizes, I'm getting 63.4%.
> Not surprising I guess.
Thanks for the data.
How long are your transfers? Just curious.
I think what I need to investigate is some kind of light cwnd
prediction. This, plus some TSO packet coalescing logic when we
undershoot, should do the trick.
But first I'll study the segmenting behavior of the current code. It
could be simply a matter of tweaking when we wake up the user when he
sleeps on the send buffer filling up.
[parent not found: <20050506123411.7073cf15.davem@davemloft.net>]
* RE: [PATCH] TSO Reloaded
[not found] <20050506123411.7073cf15.davem@davemloft.net>
@ 2005-05-07 4:48 ` Leonid Grossman
0 siblings, 0 replies; 27+ messages in thread
From: Leonid Grossman @ 2005-05-07 4:48 UTC (permalink / raw)
To: 'David S. Miller'; +Cc: netdev
> -----Original Message-----
> From: netdev-bounce@oss.sgi.com [mailto:netdev-bounce@oss.sgi.com] On
> Behalf Of David S. Miller
> Sent: Friday, May 06, 2005 12:34 PM
> To: Leonid Grossman
> Cc: netdev@oss.sgi.com
> Subject: Re: [PATCH] TSO Reloaded
>
> On Fri, 6 May 2005 07:09:17 -0700
> "Leonid Grossman" <leonid.grossman@neterion.com> wrote:
>
> > Our ASIC supports ipv6 CSUM and TSO (and header splitting) even if
> extension
> > headers are present, but I suspect the majority ipv6-capable NICs will
> not
> > implement this; the stack needs to query NIC header-processing
> capabilities
> > (for both CSUM and TSO) and act accordingly.
>
> Any particular reason for adding protocol-specific checksumming
> instead of a protocol-agnostic one? That is what NETIF_F_HW_CSUM
> represents, and what we were hoping NIC vendors would do.
I guess some NIC vendors choose to support a single pass/fail flag (rather
than a 16-bit checksum field), because increasing the size of receive
descriptors may not come for free.
Our older card supports NETIF_F_IP_CSUM only; the newer card can support
NETIF_F_HW_CSUM, so we will probably change the driver at some point.
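The two reporting styles being contrasted show up in how a receive path
fills in skb->ip_summed. A sketch with hypothetical descriptor inputs, using
the constant names of 2.6-era kernels:

#include <linux/skbuff.h>

static void rx_report_csum(struct sk_buff *skb, int l4_ok,
			   int have_full_sum, u16 raw_sum)
{
	if (have_full_sum) {
		/* hardware handed back a complete checksum over the packet;
		 * the stack can verify any protocol with it */
		skb->csum = raw_sum;
		skb->ip_summed = CHECKSUM_HW;
	} else if (l4_ok) {
		/* a single pass/fail bit: cheaper descriptor, less flexible */
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		/* no help from hardware: software checksums as usual */
		skb->ip_summed = CHECKSUM_NONE;
	}
}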
>
> If you supported this, then you wouldn't need to keep adding checksum
> support for new protocols; NETIF_F_HW_CSUM works for anything which
> uses a 16-bit one's complement checksum stuffed into an arbitrary 16-bit
> location. The chip merely needs to:
>
> 1) calculate a one's complement sum starting at the offset:
> (skb->h.raw - skb->data)
> into the packet, to the end.
> 2) Place the 16-bit result at offset:
> (skb->h.raw + skb->csum) - skb->data
>
> and that's it. This single checksumming engine works for ipv4 as well.
>
> See drivers/net/sunhme.c:happy_meal_start_xmit() and drivers/net/sungem.c:
> gem_start_xmit() for two example implementations.
Thanks for the pointer!
Leonid
>
> I can't believe Sun understood this years ago and yet they are pretty
> much the only vendor who has made networking cards supporting this
> scheme. Probably Microsoft's NDIS driver specification is to blame :-/
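At transmit time the two-step recipe quoted above looks roughly like the
sketch below; the descriptor layout and TXD_CSUM_* macros are made up for
illustration, and the real examples are the sunhme/sungem xmit routines
mentioned.

#include <linux/skbuff.h>

/* made-up descriptor layout, purely for illustration */
struct my_txd {
	u64 flags;
};
#define TXD_CSUM_ENABLE		(1ULL << 63)
#define TXD_CSUM_START(x)	((u64)(x) << 16)
#define TXD_CSUM_STUFF(x)	((u64)(x) << 0)

static void fill_tx_csum(struct my_txd *txd, const struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		/* 1) hardware sums (one's complement) from here to the end
		 *    of the packet... */
		u32 start = skb->h.raw - skb->data;
		/* 2) ...and stuffs the folded 16-bit result here */
		u32 stuff = (skb->h.raw + skb->csum) - skb->data;

		txd->flags |= TXD_CSUM_ENABLE |
			      TXD_CSUM_START(start) |
			      TXD_CSUM_STUFF(stuff);
	}
}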