# This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2004/09/27 21:50:11-07:00 davem@nuts.davemloft.net # [TCP]: Fix congestion window expansion when using TSO. # # We only do congestion window expansion on full packet # ACKs. We should do it for ACKs of sub-packets of a # TSO frame as well. # # Signed-off-by: David S. Miller # # net/ipv4/tcp_output.c # 2004/09/27 21:48:59-07:00 davem@nuts.davemloft.net +35 -2 # [TCP]: Fix congestion window expansion when using TSO. # # net/ipv4/tcp_input.c # 2004/09/27 21:48:59-07:00 davem@nuts.davemloft.net +85 -1 # [TCP]: Fix congestion window expansion when using TSO. # # include/net/tcp.h # 2004/09/27 21:48:59-07:00 davem@nuts.davemloft.net +2 -1 # [TCP]: Fix congestion window expansion when using TSO. # diff -Nru a/include/net/tcp.h b/include/net/tcp.h --- a/include/net/tcp.h 2004-09-28 14:30:28 -07:00 +++ b/include/net/tcp.h 2004-09-28 14:30:28 -07:00 @@ -1180,7 +1180,8 @@ __u16 urg_ptr; /* Valid w/URG flags is set. */ __u32 ack_seq; /* Sequence number ACK'd */ - __u32 tso_factor; + __u16 tso_factor; /* If > 1, TSO frame */ + __u16 tso_mss; /* MSS that FACTOR's in terms of*/ }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) diff -Nru a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c 2004-09-28 14:30:28 -07:00 +++ b/net/ipv4/tcp_input.c 2004-09-28 14:30:28 -07:00 @@ -2355,6 +2355,86 @@ } } +/* There is one downside to this scheme. Although we keep the + * ACK clock ticking, adjusting packet counters and advancing + * congestion window, we do not liberate socket send buffer + * space. + * + * Mucking with skb->truesize and sk->sk_wmem_alloc et al. + * then making a write space wakeup callback is a possible + * future enhancement. WARNING: it is not trivial to make. 
+ */ +static int tcp_tso_acked(struct tcp_opt *tp, struct sk_buff *skb, + __u32 now, __s32 *seq_rtt) +{ + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u32 mss = scb->tso_mss; + __u32 snd_una = tp->snd_una; + __u32 seq = scb->seq; + __u32 packets_acked = 0; /* whole tso_mss-sized sub-packets below snd_una */ + int acked = 0; /* FLAG_* bits handed back to the caller */ + + /* Only reached while end_seq is still beyond snd_una, so + * the TSO frame is at most partially acked. + */ + BUG_ON(!after(scb->end_seq, snd_una)); + + while (!after(seq + mss, snd_una)) { /* step over each fully acked sub-packet */ + packets_acked++; + seq += mss; + } + + if (packets_acked) { + __u8 sacked = scb->sacked; + + /* We adjust scb->seq but we do not pskb_pull() the + * SKB. We let tcp_retransmit_skb() handle this case + * by checking skb->len against the data sequence span. + * This way, we avoid the pskb_pull() work unless we + * actually need to retransmit the SKB. + */ + scb->seq = seq; + + acked |= FLAG_DATA_ACKED; + if (sacked) { + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tcp_dec_pcount_explicit(&tp->retrans_out, + packets_acked); + acked |= FLAG_RETRANS_DATA_ACKED; + *seq_rtt = -1; + } else if (*seq_rtt < 0) + *seq_rtt = now - scb->when; + if (sacked & TCPCB_SACKED_ACKED) + tcp_dec_pcount_explicit(&tp->sacked_out, + packets_acked); + if (sacked & TCPCB_LOST) + tcp_dec_pcount_explicit(&tp->lost_out, + packets_acked); + if (sacked & TCPCB_URG) { + if (tp->urg_mode && + !before(scb->seq, tp->snd_up)) + tp->urg_mode = 0; + } + } else if (*seq_rtt < 0) + *seq_rtt = now - scb->when; + + if (tcp_get_pcount(&tp->fackets_out)) { + __u32 dval = min(tcp_get_pcount(&tp->fackets_out), + packets_acked); /* never take off more than is counted */ + tcp_dec_pcount_explicit(&tp->fackets_out, dval); + } + tcp_dec_pcount_explicit(&tp->packets_out, packets_acked); + scb->tso_factor -= packets_acked; /* sub-packets still outstanding in this frame */ + + BUG_ON(scb->tso_factor == 0); + BUG_ON(!before(scb->seq, scb->end_seq)); + } + + return acked; /* stays 0 when no whole sub-packet was acked */ +} + + /* Remove acknowledged frames from the retransmission queue. 
*/ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { @@ -2373,8 +2453,12 @@ * discard it as it's confirmed to have arrived at * the other end. */ - if (after(scb->end_seq, tp->snd_una)) + if (after(scb->end_seq, tp->snd_una)) { + if (scb->tso_factor > 1) + acked |= tcp_tso_acked(tp, skb, + now, &seq_rtt); break; + } /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not diff -Nru a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c --- a/net/ipv4/tcp_output.c 2004-09-28 14:30:28 -07:00 +++ b/net/ipv4/tcp_output.c 2004-09-28 14:30:28 -07:00 @@ -436,6 +436,7 @@ factor /= mss_std; TCP_SKB_CB(skb)->tso_factor = factor; } + TCP_SKB_CB(skb)->tso_mss = mss_std; } /* Function to create two new TCP segments. Shrinks the given segment @@ -552,7 +553,7 @@ return skb->tail; } -static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +static int __tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -565,11 +566,20 @@ return -ENOMEM; } - TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_HW; return 0; } +static inline int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +{ + int err = __tcp_trim_head(sk, skb, len); + + if (!err) + TCP_SKB_CB(skb)->seq += len; + + return err; +} + /* This function synchronize snd mss to current pmtu/exthdr set. tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -949,6 +959,7 @@ { struct tcp_opt *tp = tcp_sk(sk); unsigned int cur_mss = tcp_current_mss(sk, 0); + __u32 data_seq, data_end_seq; int err; /* Do not sent more than we queued. 1/4 is reserved for possible @@ -958,6 +969,22 @@ min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; + /* What is going on here? When TSO packets are partially ACK'd, + * we adjust the TCP_SKB_CB(skb)->seq value forward but we do + * not adjust the data area of the SKB. 
We defer that to here + * so that we can avoid the work unless we really retransmit + * the packet. + */ + data_seq = TCP_SKB_CB(skb)->seq; + data_end_seq = TCP_SKB_CB(skb)->end_seq; + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + data_end_seq--; + + if (skb->len > (data_end_seq - data_seq)) { /* SKB still holds already-acked head bytes */ + if (__tcp_trim_head(sk, skb, skb->len - (data_end_seq - data_seq))) /* drop only the surplus, not the live span */ + return -ENOMEM; + } + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); @@ -1191,6 +1218,7 @@ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->tso_factor = 1; + TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1223,6 +1251,7 @@ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->tso_factor = 1; + TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -1304,6 +1333,7 @@ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->tso_factor = 1; + TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(req->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -1406,6 +1436,7 @@ TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; TCP_SKB_CB(buff)->tso_factor = 1; + TCP_SKB_CB(buff)->tso_mss = tp->mss_cache_std; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -1506,6 +1537,7 @@ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; TCP_SKB_CB(buff)->tso_factor = 1; + TCP_SKB_CB(buff)->tso_mss = tp->mss_cache_std; /* Send it off, this clears delayed acks for us. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -1541,6 +1573,7 @@ TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; TCP_SKB_CB(skb)->tso_factor = 1; + TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just