* [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh
@ 2008-03-01 20:59 Patrick McManus
2008-03-01 21:01 ` [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack Patrick McManus
2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus
0 siblings, 2 replies; 5+ messages in thread
From: Patrick McManus @ 2008-03-01 20:59 UTC (permalink / raw)
To: netdev@vger.kernel.org
timeout associated with SO_DEFER_ACCEPT wasn't being honored if it was
less than the timeout allowed by the maximum syn-recv queue size
algorithm. Fix by using the SO_DEFER_ACCEPT value if the ack has
arrived.
Signed-off-by: Patrick McManus <mcmanus@ducksong.com>
* Acked-by: Eric Dumazet <dada1@cosmosbay.com>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b189278..03cc323 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -461,8 +461,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (inet_rsk(req)->acked && req->retrans < max_retries))
+ if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
&& !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
unsigned long timeo;
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack 2008-03-01 20:59 [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh Patrick McManus @ 2008-03-01 21:01 ` Patrick McManus 2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus 1 sibling, 0 replies; 5+ messages in thread From: Patrick McManus @ 2008-03-01 21:01 UTC (permalink / raw) To: netdev@vger.kernel.org a socket in LISTEN that had completed its 3 way handshake, but not notified userspace because of SO_DEFER_ACCEPT, would retransmit the already acked syn-ack during the time it was waiting for the first data byte from the peer. Signed-off-by: Patrick McManus <mcmanus@ducksong.com> Acked-by: Eric Dumazet <dada1@cosmosbay.com> ---- diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 03cc323..7216f5e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -461,8 +461,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) - && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) + && (inet_rsk(req)->acked || + !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) { unsigned long timeo; if (req->retrans++ == 0) ^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established 2008-03-01 20:59 [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh Patrick McManus 2008-03-01 21:01 ` [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack Patrick McManus @ 2008-03-01 21:05 ` Patrick McManus 2008-03-01 22:03 ` Ilpo Järvinen 1 sibling, 1 reply; 5+ messages in thread From: Patrick McManus @ 2008-03-01 21:05 UTC (permalink / raw) To: netdev@vger.kernel.org Change TCP_DEFER_ACCEPT implementation so that it transitions a connection to ESTABLISHED after handshake is complete instead of leaving it in SYN-RECV until some data arrives. Place connection in accept queue when first data packet arrives from slow path. Benefits: - established connection is now reset if it never makes it to the accept queue - diagnostic state of established matches with the packet traces showing completed handshake - TCP_DEFER_ACCEPT timeouts are expressed in seconds and can now be enforced with reasonable accuracy instead of rounding up to next exponential back-off of syn-ack retry. 
Signed-off-by: Patrick McManus <mcmanus@ducksong.com> --- diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 08027f1..5843344 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -239,6 +239,13 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +struct tcp_deferred_accept_info +{ + struct sock *listen_sk; + struct request_sock *request; +}; + + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -374,6 +381,8 @@ struct tcp_sock { unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; + struct tcp_deferred_accept_info defer_tcp_accept; + unsigned long last_synq_overflow; u32 tso_deferred; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 70013c5..ed59cb0 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -72,8 +72,7 @@ struct inet_request_sock { tstamp_ok : 1, sack_ok : 1, wscale_ok : 1, - ecn_ok : 1, - acked : 1; + ecn_ok : 1; struct ip_options *opt; }; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index cff4608..389abc4 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -116,8 +116,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; rwlock_t syn_wait_lock; - u8 rskq_defer_accept; - /* 3 bytes hole, try to pack */ + u16 rskq_defer_accept; + /* 2 bytes hole, try to pack */ struct listen_sock *listen_opt; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7de4ea3..dd3df5d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -138,6 +138,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 +#define MAX_TCP_ACCEPT_DEFERRED 65535 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ @@ -955,7 +956,6 @@ static 
inline void tcp_openreq_init(struct request_sock *req, ireq->sack_ok = rx_opt->sack_ok; ireq->snd_wscale = rx_opt->snd_wscale; ireq->wscale_ok = rx_opt->wscale_ok; - ireq->acked = 0; ireq->ecn_ok = 0; ireq->rmt_port = tcp_hdr(skb)->source; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7216f5e..80e03b3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -414,8 +414,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, struct inet_connection_sock *icsk = inet_csk(parent); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; + int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; int i, budget; @@ -451,9 +450,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } } - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; @@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < (inet_rsk(req)->acked ? 
max_retries : thresh)) - && (inet_rsk(req)->acked || - !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) { + if (req->retrans < thresh + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { unsigned long timeo; if (req->retrans++ == 0) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 071e83a..905e39e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,16 +2105,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - icsk->icsk_accept_queue.rskq_defer_accept = 0; - if (val > 0) { - /* Translate value in seconds to number of - * retransmits */ - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && - val > ((TCP_TIMEOUT_INIT / HZ) << - icsk->icsk_accept_queue.rskq_defer_accept)) - icsk->icsk_accept_queue.rskq_defer_accept++; - icsk->icsk_accept_queue.rskq_defer_accept++; + if (val < 0) + err = -EINVAL; + else + { + if (val > MAX_TCP_ACCEPT_DEFERRED) + val = MAX_TCP_ACCEPT_DEFERRED; + icsk->icsk_accept_queue.rskq_defer_accept = val; } + break; case TCP_WINDOW_CLAMP: @@ -2295,8 +2294,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 
0 : - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); + val = icsk->icsk_accept_queue.rskq_defer_accept; break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 19c449f..60a51e7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4807,6 +4807,40 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + + if (tp->defer_tcp_accept.request ) { + + if ((!th->fin) && + tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { + + if (sock_flag(sk, SOCK_KEEPOPEN)) { + inet_csk_reset_keepalive_timer(sk, + keepalive_time_when(tp)); + } + else { + inet_csk_delete_keepalive_timer(sk); + } + + inet_csk_reqsk_queue_add( + tp->defer_tcp_accept.listen_sk, + tp->defer_tcp_accept.request, + sk); + + tp->defer_tcp_accept.listen_sk->sk_data_ready( + tp->defer_tcp_accept.listen_sk, 0); + + sock_put (tp->defer_tcp_accept.listen_sk); + sock_put (sk); + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } + else + { + tcp_reset(sk); + return 1; + } + } + return 0; csum_error: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 00156bf..31beaa5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1921,6 +1921,16 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + if (tp->defer_tcp_accept.request) { + + reqsk_free(tp->defer_tcp_accept.request); + sock_put (tp->defer_tcp_accept.listen_sk); + sock_put (sk); + + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } + atomic_dec(&tcp_sockets_allocated); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b61b768..c307052 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -569,10 +569,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. 
- If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this - bare ACK. Otherwise, we create an established connection. Both - ends (listening sockets) accept the new incoming connection and try - to talk to each other. 8-) + Both ends (listening sockets) accept the new incoming + connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -640,13 +638,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - inet_rsk(req)->acked = 1; - return NULL; - } - /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO @@ -685,12 +676,30 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + + /* the accept queue handling is done is est recv slow path + * so lets make sure to start there + */ + tcp_sk(child)->pred_flags = 0; + + sock_hold (sk); + sock_hold (child); + tcp_sk(child)->defer_tcp_accept.listen_sk = sk; + tcp_sk(child)->defer_tcp_accept.request = req; + + inet_csk_reset_keepalive_timer (child, + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); + } + else { + inet_csk_reqsk_queue_add(sk, req, child); + } + return child; listen_overflow: if (!sysctl_tcp_abort_on_overflow) { - inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 803d758..5faad8b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -481,6 +481,13 @@ 
static void tcp_keepalive_timer (unsigned long data) goto death; } + + if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) + { + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) goto out; ^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established 2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus @ 2008-03-01 22:03 ` Ilpo Järvinen 2008-03-02 21:56 ` [PATCH 3/3 (spin 3)] " Patrick McManus 0 siblings, 1 reply; 5+ messages in thread From: Ilpo Järvinen @ 2008-03-01 22:03 UTC (permalink / raw) To: Patrick McManus; +Cc: netdev@vger.kernel.org On Sat, 1 Mar 2008, Patrick McManus wrote: There are a number of style issues, please see Documentation/CodingStyle for more details (most of them noted below as well)... > @@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, > reqp=&lopt->syn_table[i]; > while ((req = *reqp) != NULL) { > if (time_after_eq(now, req->expires)) { > - if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) > - && (inet_rsk(req)->acked || > - !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) { > + if (req->retrans < thresh > + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { These &&/|| are usually placed into end of the previous line. > unsigned long timeo; > > if (req->retrans++ == 0) > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index 071e83a..905e39e 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -2105,16 +2105,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level, > break; > > case TCP_DEFER_ACCEPT: > - icsk->icsk_accept_queue.rskq_defer_accept = 0; > - if (val > 0) { > - /* Translate value in seconds to number of > - * retransmits */ > - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && > - val > ((TCP_TIMEOUT_INIT / HZ) << > - icsk->icsk_accept_queue.rskq_defer_accept)) > - icsk->icsk_accept_queue.rskq_defer_accept++; > - icsk->icsk_accept_queue.rskq_defer_accept++; > + if (val < 0) > + err = -EINVAL; > + else > + { Put brace to the same line with else and use braces for the both blocks whenever either block needs them. 
> + if (val > MAX_TCP_ACCEPT_DEFERRED) > + val = MAX_TCP_ACCEPT_DEFERRED; > + icsk->icsk_accept_queue.rskq_defer_accept = val; > } > + > break; > > case TCP_WINDOW_CLAMP: > @@ -2295,8 +2294,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, > val = (val ? : sysctl_tcp_fin_timeout) / HZ; > break; > case TCP_DEFER_ACCEPT: > - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : > - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); > + val = icsk->icsk_accept_queue.rskq_defer_accept; > break; > case TCP_WINDOW_CLAMP: > val = tp->window_clamp; > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index 19c449f..60a51e7 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -4807,6 +4807,40 @@ step5: > > tcp_data_snd_check(sk); > tcp_ack_snd_check(sk); > + > + if (tp->defer_tcp_accept.request ) { space-- > + > + if ((!th->fin) && > + tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { > + > + if (sock_flag(sk, SOCK_KEEPOPEN)) { > + inet_csk_reset_keepalive_timer(sk, > + keepalive_time_when(tp)); > + } > + else { braces & elses to the same line please. > + inet_csk_delete_keepalive_timer(sk); > + } > + > + inet_csk_reqsk_queue_add( > + tp->defer_tcp_accept.listen_sk, > + tp->defer_tcp_accept.request, > + sk); > + > + tp->defer_tcp_accept.listen_sk->sk_data_ready( > + tp->defer_tcp_accept.listen_sk, 0); > + > + sock_put (tp->defer_tcp_accept.listen_sk); > + sock_put (sk); spaces--; > + tp->defer_tcp_accept.listen_sk = NULL; > + tp->defer_tcp_accept.request = NULL; > + } > + else > + { both braces on the same line as else please. > + tcp_reset(sk); > + return 1; > + } > + } > + > return 0; Please check for similar things elsewhere as well, and quickly check what the CodingStyle has to say as well. I stopped here and might have missed something above as well. -- i. ^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 3/3 (spin 3)] TCP_DEFER_ACCEPT updates - process as established 2008-03-01 22:03 ` Ilpo Järvinen @ 2008-03-02 21:56 ` Patrick McManus 0 siblings, 0 replies; 5+ messages in thread From: Patrick McManus @ 2008-03-02 21:56 UTC (permalink / raw) To: Ilpo Järvinen, netdev@vger.kernel.org Ilpo, thanks for the feedback! This spin contains corrections for codingstyle issues with help from checkpatch.pl, replacement of a flag used in dccp too, and a fix for cases where the fin piggybacks on real data. ===== Change TCP_DEFER_ACCEPT implementation so that it transitions a connection to ESTABLISHED after handshake is complete instead of leaving it in SYN-RECV until some data arrives. Place connection in accept queue when first data packet arrives from slow path. Benefits: - established connection is now reset if it never makes it to the accept queue - diagnostic state of established matches with the packet traces showing completed handshake - TCP_DEFER_ACCEPT timeouts are expressed in seconds and can now be enforced with reasonable accuracy instead of rounding up to next exponential back-off of syn-ack retry. 
Signed-off-by: Patrick McManus <mcmanus@ducksong.com> --- diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 08027f1..d96d9b1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -239,6 +239,11 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +struct tcp_deferred_accept_info { + struct sock *listen_sk; + struct request_sock *request; +}; + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -374,6 +379,8 @@ struct tcp_sock { unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; + struct tcp_deferred_accept_info defer_tcp_accept; + unsigned long last_synq_overflow; u32 tso_deferred; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index cff4608..389abc4 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -116,8 +116,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; rwlock_t syn_wait_lock; - u8 rskq_defer_accept; - /* 3 bytes hole, try to pack */ + u16 rskq_defer_accept; + /* 2 bytes hole, try to pack */ struct listen_sock *listen_opt; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7de4ea3..5780f62 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -138,6 +138,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 +#define MAX_TCP_ACCEPT_DEFERRED 65535 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7216f5e..9323405 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -414,8 +414,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, struct inet_connection_sock *icsk = inet_csk(parent); struct request_sock_queue 
*queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; + int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; int i, budget; @@ -451,9 +450,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } } - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; @@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) - && (inet_rsk(req)->acked || - !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) { + if (req->retrans < thresh && + !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { unsigned long timeo; if (req->retrans++ == 0) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 071e83a..e0fbc25 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,15 +2105,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - icsk->icsk_accept_queue.rskq_defer_accept = 0; - if (val > 0) { - /* Translate value in seconds to number of - * retransmits */ - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && - val > ((TCP_TIMEOUT_INIT / HZ) << - icsk->icsk_accept_queue.rskq_defer_accept)) - icsk->icsk_accept_queue.rskq_defer_accept++; - icsk->icsk_accept_queue.rskq_defer_accept++; + if (val < 0) { + err = -EINVAL; + } else { + if (val > MAX_TCP_ACCEPT_DEFERRED) + val = MAX_TCP_ACCEPT_DEFERRED; + icsk->icsk_accept_queue.rskq_defer_accept = val; } break; @@ -2295,8 +2292,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 
0 : - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); + val = icsk->icsk_accept_queue.rskq_defer_accept; break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 19c449f..11aea98 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4447,6 +4447,49 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) } } +static int tcp_defer_accept_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->defer_tcp_accept.request) { + int queued_data = tp->rcv_nxt - tp->copied_seq; + int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ? + tcp_hdr((struct sk_buff *) + sk->sk_receive_queue.prev)->fin : 0; + + if (queued_data && hasfin) + queued_data--; + + if (queued_data && + tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { + if (sock_flag(sk, SOCK_KEEPOPEN)) { + inet_csk_reset_keepalive_timer(sk, + keepalive_time_when(tp)); + } else { + inet_csk_delete_keepalive_timer(sk); + } + + inet_csk_reqsk_queue_add( + tp->defer_tcp_accept.listen_sk, + tp->defer_tcp_accept.request, + sk); + + tp->defer_tcp_accept.listen_sk->sk_data_ready( + tp->defer_tcp_accept.listen_sk, 0); + + sock_put(tp->defer_tcp_accept.listen_sk); + sock_put(sk); + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } else if (hasfin || + tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) { + tcp_reset(sk); + return -1; + } + } + return 0; +} + static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); @@ -4807,6 +4850,9 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + + if (tcp_defer_accept_check(sk)) + return -1; return 0; csum_error: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 00156bf..d4c9d03 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1921,6 +1921,14 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + if 
(tp->defer_tcp_accept.request) { + reqsk_free(tp->defer_tcp_accept.request); + sock_put(tp->defer_tcp_accept.listen_sk); + sock_put(sk); + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } + atomic_dec(&tcp_sockets_allocated); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b61b768..ab519d8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -569,10 +569,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this - bare ACK. Otherwise, we create an established connection. Both - ends (listening sockets) accept the new incoming connection and try - to talk to each other. 8-) + Both ends (listening sockets) accept the new incoming + connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -640,13 +638,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - inet_rsk(req)->acked = 1; - return NULL; - } - /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. 
THIS SEGMENT MUST MOVE SOCKET TO @@ -685,7 +676,24 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + + /* the accept queue handling is done is est recv slow + * path so lets make sure to start there + */ + tcp_sk(child)->pred_flags = 0; + sock_hold(sk); + sock_hold(child); + tcp_sk(child)->defer_tcp_accept.listen_sk = sk; + tcp_sk(child)->defer_tcp_accept.request = req; + + inet_csk_reset_keepalive_timer(child, + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); + } else { + inet_csk_reqsk_queue_add(sk, req, child); + } + return child; listen_overflow: diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 803d758..160d16f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } + if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) goto out; ^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2008-03-02 21:55 UTC | newest] Thread overview: 5+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-03-01 20:59 [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh Patrick McManus 2008-03-01 21:01 ` [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack Patrick McManus 2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus 2008-03-01 22:03 ` Ilpo Järvinen 2008-03-02 21:56 ` [PATCH 3/3 (spin 3)] " Patrick McManus
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).