* [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh
@ 2008-03-01 20:59 Patrick McManus
2008-03-01 21:01 ` [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack Patrick McManus
2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus
0 siblings, 2 replies; 5+ messages in thread
From: Patrick McManus @ 2008-03-01 20:59 UTC (permalink / raw)
To: netdev@vger.kernel.org
The timeout associated with SO_DEFER_ACCEPT wasn't being honored if it was
less than the timeout allowed by the maximum syn-recv queue size
algorithm. Fix this by using the SO_DEFER_ACCEPT value if the ack has
arrived.
Signed-off-by: Patrick McManus <mcmanus@ducksong.com>
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b189278..03cc323 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -461,8 +461,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (inet_rsk(req)->acked && req->retrans < max_retries))
+ if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
&& !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
unsigned long timeo;
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack
2008-03-01 20:59 [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh Patrick McManus
@ 2008-03-01 21:01 ` Patrick McManus
2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus
1 sibling, 0 replies; 5+ messages in thread
From: Patrick McManus @ 2008-03-01 21:01 UTC (permalink / raw)
To: netdev@vger.kernel.org
A socket in LISTEN that had completed its 3-way handshake, but had not
notified userspace because of SO_DEFER_ACCEPT, would retransmit the already
acked syn-ack during the time it was waiting for the first data byte
from the peer.
Signed-off-by: Patrick McManus <mcmanus@ducksong.com>
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
----
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 03cc323..7216f5e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -461,8 +461,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
- && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+ if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
+ && (inet_rsk(req)->acked ||
+ !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) {
unsigned long timeo;
if (req->retrans++ == 0)
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established
2008-03-01 20:59 [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh Patrick McManus
2008-03-01 21:01 ` [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack Patrick McManus
@ 2008-03-01 21:05 ` Patrick McManus
2008-03-01 22:03 ` Ilpo Järvinen
1 sibling, 1 reply; 5+ messages in thread
From: Patrick McManus @ 2008-03-01 21:05 UTC (permalink / raw)
To: netdev@vger.kernel.org
Change the TCP_DEFER_ACCEPT implementation so that it transitions a
connection to ESTABLISHED after the handshake is complete instead of
leaving it in SYN-RECV until some data arrives. Place the connection in
the accept queue when the first data packet arrives from the slow path.
Benefits:
- established connection is now reset if it never makes it to the accept
queue
- diagnostic state of established matches with the packet traces
showing completed handshake
- TCP_DEFER_ACCEPT timeouts are expressed in seconds and can now be
enforced with reasonable accuracy instead of rounding up to next
exponential back-off of syn-ack retry.
Signed-off-by: Patrick McManus <mcmanus@ducksong.com>
---
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 08027f1..5843344 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -239,6 +239,13 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
return (struct tcp_request_sock *)req;
}
+struct tcp_deferred_accept_info
+{
+ struct sock *listen_sk;
+ struct request_sock *request;
+};
+
+
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
@@ -374,6 +381,8 @@ struct tcp_sock {
unsigned int keepalive_intvl; /* time interval between keep alive probes */
int linger2;
+ struct tcp_deferred_accept_info defer_tcp_accept;
+
unsigned long last_synq_overflow;
u32 tso_deferred;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 70013c5..ed59cb0 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -72,8 +72,7 @@ struct inet_request_sock {
tstamp_ok : 1,
sack_ok : 1,
wscale_ok : 1,
- ecn_ok : 1,
- acked : 1;
+ ecn_ok : 1;
struct ip_options *opt;
};
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index cff4608..389abc4 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -116,8 +116,8 @@ struct request_sock_queue {
struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail;
rwlock_t syn_wait_lock;
- u8 rskq_defer_accept;
- /* 3 bytes hole, try to pack */
+ u16 rskq_defer_accept;
+ /* 2 bytes hole, try to pack */
struct listen_sock *listen_opt;
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de4ea3..dd3df5d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -138,6 +138,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_KEEPINTVL 32767
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127
+#define MAX_TCP_ACCEPT_DEFERRED 65535
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
@@ -955,7 +956,6 @@ static inline void tcp_openreq_init(struct request_sock *req,
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
- ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->rmt_port = tcp_hdr(skb)->source;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7216f5e..80e03b3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -414,8 +414,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
struct inet_connection_sock *icsk = inet_csk(parent);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct listen_sock *lopt = queue->listen_opt;
- int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
+ int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
unsigned long now = jiffies;
struct request_sock **reqp, *req;
int i, budget;
@@ -451,9 +450,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
}
}
- if (queue->rskq_defer_accept)
- max_retries = queue->rskq_defer_accept;
-
budget = 2 * (lopt->nr_table_entries / (timeout / interval));
i = lopt->clock_hand;
@@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
- && (inet_rsk(req)->acked ||
- !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) {
+ if (req->retrans < thresh
+ && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
unsigned long timeo;
if (req->retrans++ == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 071e83a..905e39e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2105,16 +2105,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
case TCP_DEFER_ACCEPT:
- icsk->icsk_accept_queue.rskq_defer_accept = 0;
- if (val > 0) {
- /* Translate value in seconds to number of
- * retransmits */
- while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
- val > ((TCP_TIMEOUT_INIT / HZ) <<
- icsk->icsk_accept_queue.rskq_defer_accept))
- icsk->icsk_accept_queue.rskq_defer_accept++;
- icsk->icsk_accept_queue.rskq_defer_accept++;
+ if (val < 0)
+ err = -EINVAL;
+ else
+ {
+ if (val > MAX_TCP_ACCEPT_DEFERRED)
+ val = MAX_TCP_ACCEPT_DEFERRED;
+ icsk->icsk_accept_queue.rskq_defer_accept = val;
}
+
break;
case TCP_WINDOW_CLAMP:
@@ -2295,8 +2294,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
- ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
+ val = icsk->icsk_accept_queue.rskq_defer_accept;
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 19c449f..60a51e7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4807,6 +4807,40 @@ step5:
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
+
+ if (tp->defer_tcp_accept.request ) {
+
+ if ((!th->fin) &&
+ tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
+
+ if (sock_flag(sk, SOCK_KEEPOPEN)) {
+ inet_csk_reset_keepalive_timer(sk,
+ keepalive_time_when(tp));
+ }
+ else {
+ inet_csk_delete_keepalive_timer(sk);
+ }
+
+ inet_csk_reqsk_queue_add(
+ tp->defer_tcp_accept.listen_sk,
+ tp->defer_tcp_accept.request,
+ sk);
+
+ tp->defer_tcp_accept.listen_sk->sk_data_ready(
+ tp->defer_tcp_accept.listen_sk, 0);
+
+ sock_put (tp->defer_tcp_accept.listen_sk);
+ sock_put (sk);
+ tp->defer_tcp_accept.listen_sk = NULL;
+ tp->defer_tcp_accept.request = NULL;
+ }
+ else
+ {
+ tcp_reset(sk);
+ return 1;
+ }
+ }
+
return 0;
csum_error:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 00156bf..31beaa5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1921,6 +1921,16 @@ int tcp_v4_destroy_sock(struct sock *sk)
sk->sk_sndmsg_page = NULL;
}
+ if (tp->defer_tcp_accept.request) {
+
+ reqsk_free(tp->defer_tcp_accept.request);
+ sock_put (tp->defer_tcp_accept.listen_sk);
+ sock_put (sk);
+
+ tp->defer_tcp_accept.listen_sk = NULL;
+ tp->defer_tcp_accept.request = NULL;
+ }
+
atomic_dec(&tcp_sockets_allocated);
return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b61b768..c307052 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -569,10 +569,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
- If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
- bare ACK. Otherwise, we create an established connection. Both
- ends (listening sockets) accept the new incoming connection and try
- to talk to each other. 8-)
+ Both ends (listening sockets) accept the new incoming
+ connection and try to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
@@ -640,13 +638,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
- /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
- if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
- TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
- inet_rsk(req)->acked = 1;
- return NULL;
- }
-
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -685,12 +676,30 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
+ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+
+ /* the accept queue handling is done is est recv slow path
+ * so lets make sure to start there
+ */
+ tcp_sk(child)->pred_flags = 0;
+
+ sock_hold (sk);
+ sock_hold (child);
+ tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
+ tcp_sk(child)->defer_tcp_accept.request = req;
+
+ inet_csk_reset_keepalive_timer (child,
+ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
+ }
+ else {
+ inet_csk_reqsk_queue_add(sk, req, child);
+ }
+
return child;
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
- inet_rsk(req)->acked = 1;
return NULL;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 803d758..5faad8b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -481,6 +481,13 @@ static void tcp_keepalive_timer (unsigned long data)
goto death;
}
+
+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED)
+ {
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ goto death;
+ }
+
if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
goto out;
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established
2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus
@ 2008-03-01 22:03 ` Ilpo Järvinen
2008-03-02 21:56 ` [PATCH 3/3 (spin 3)] " Patrick McManus
0 siblings, 1 reply; 5+ messages in thread
From: Ilpo Järvinen @ 2008-03-01 22:03 UTC (permalink / raw)
To: Patrick McManus; +Cc: netdev@vger.kernel.org
On Sat, 1 Mar 2008, Patrick McManus wrote:
There are a number of style issues, please see Documentation/CodingStyle
for more details (most of them noted below as well)...
> @@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
> reqp=&lopt->syn_table[i];
> while ((req = *reqp) != NULL) {
> if (time_after_eq(now, req->expires)) {
> - if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
> - && (inet_rsk(req)->acked ||
> - !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) {
> + if (req->retrans < thresh
> + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
These &&/|| are usually placed at the end of the previous line.
> unsigned long timeo;
>
> if (req->retrans++ == 0)
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 071e83a..905e39e 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2105,16 +2105,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
> break;
>
> case TCP_DEFER_ACCEPT:
> - icsk->icsk_accept_queue.rskq_defer_accept = 0;
> - if (val > 0) {
> - /* Translate value in seconds to number of
> - * retransmits */
> - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
> - val > ((TCP_TIMEOUT_INIT / HZ) <<
> - icsk->icsk_accept_queue.rskq_defer_accept))
> - icsk->icsk_accept_queue.rskq_defer_accept++;
> - icsk->icsk_accept_queue.rskq_defer_accept++;
> + if (val < 0)
> + err = -EINVAL;
> + else
> + {
Put the brace on the same line as the else, and use braces for both blocks
whenever either block needs them.
> + if (val > MAX_TCP_ACCEPT_DEFERRED)
> + val = MAX_TCP_ACCEPT_DEFERRED;
> + icsk->icsk_accept_queue.rskq_defer_accept = val;
> }
> +
> break;
>
> case TCP_WINDOW_CLAMP:
> @@ -2295,8 +2294,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
> val = (val ? : sysctl_tcp_fin_timeout) / HZ;
> break;
> case TCP_DEFER_ACCEPT:
> - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
> - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
> + val = icsk->icsk_accept_queue.rskq_defer_accept;
> break;
> case TCP_WINDOW_CLAMP:
> val = tp->window_clamp;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 19c449f..60a51e7 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4807,6 +4807,40 @@ step5:
>
> tcp_data_snd_check(sk);
> tcp_ack_snd_check(sk);
> +
> + if (tp->defer_tcp_accept.request ) {
space--
> +
> + if ((!th->fin) &&
> + tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
> +
> + if (sock_flag(sk, SOCK_KEEPOPEN)) {
> + inet_csk_reset_keepalive_timer(sk,
> + keepalive_time_when(tp));
> + }
> + else {
Braces and else on the same line, please.
> + inet_csk_delete_keepalive_timer(sk);
> + }
> +
> + inet_csk_reqsk_queue_add(
> + tp->defer_tcp_accept.listen_sk,
> + tp->defer_tcp_accept.request,
> + sk);
> +
> + tp->defer_tcp_accept.listen_sk->sk_data_ready(
> + tp->defer_tcp_accept.listen_sk, 0);
> +
> + sock_put (tp->defer_tcp_accept.listen_sk);
> + sock_put (sk);
spaces--;
> + tp->defer_tcp_accept.listen_sk = NULL;
> + tp->defer_tcp_accept.request = NULL;
> + }
> + else
> + {
both braces on the same line as else please.
> + tcp_reset(sk);
> + return 1;
> + }
> + }
> +
> return 0;
Please check for similar things elsewhere as well, and quickly check what
the CodingStyle has to say as well. I stopped here and might have missed
something above as well.
--
i.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 3/3 (spin 3)] TCP_DEFER_ACCEPT updates - process as established
2008-03-01 22:03 ` Ilpo Järvinen
@ 2008-03-02 21:56 ` Patrick McManus
0 siblings, 0 replies; 5+ messages in thread
From: Patrick McManus @ 2008-03-02 21:56 UTC (permalink / raw)
To: Ilpo Järvinen, netdev@vger.kernel.org
Ilpo, thanks for the feedback! This spin contains corrections for
codingstyle issues with help from checkpatch.pl, replacement of a flag
used in dccp too, and a fix for cases where the fin piggybacks on real
data.
=====
Change the TCP_DEFER_ACCEPT implementation so that it transitions a
connection to ESTABLISHED after the handshake is complete instead of
leaving it in SYN-RECV until some data arrives. Place the connection in
the accept queue when the first data packet arrives from the slow path.
Benefits:
- established connection is now reset if it never makes it
to the accept queue
- diagnostic state of established matches with the packet traces
showing completed handshake
- TCP_DEFER_ACCEPT timeouts are expressed in seconds and can now be
enforced with reasonable accuracy instead of rounding up to next
exponential back-off of syn-ack retry.
Signed-off-by: Patrick McManus <mcmanus@ducksong.com>
---
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 08027f1..d96d9b1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -239,6 +239,11 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
return (struct tcp_request_sock *)req;
}
+struct tcp_deferred_accept_info {
+ struct sock *listen_sk;
+ struct request_sock *request;
+};
+
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
@@ -374,6 +379,8 @@ struct tcp_sock {
unsigned int keepalive_intvl; /* time interval between keep alive probes */
int linger2;
+ struct tcp_deferred_accept_info defer_tcp_accept;
+
unsigned long last_synq_overflow;
u32 tso_deferred;
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index cff4608..389abc4 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -116,8 +116,8 @@ struct request_sock_queue {
struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail;
rwlock_t syn_wait_lock;
- u8 rskq_defer_accept;
- /* 3 bytes hole, try to pack */
+ u16 rskq_defer_accept;
+ /* 2 bytes hole, try to pack */
struct listen_sock *listen_opt;
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de4ea3..5780f62 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -138,6 +138,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_KEEPINTVL 32767
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127
+#define MAX_TCP_ACCEPT_DEFERRED 65535
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7216f5e..9323405 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -414,8 +414,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
struct inet_connection_sock *icsk = inet_csk(parent);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct listen_sock *lopt = queue->listen_opt;
- int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
+ int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
unsigned long now = jiffies;
struct request_sock **reqp, *req;
int i, budget;
@@ -451,9 +450,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
}
}
- if (queue->rskq_defer_accept)
- max_retries = queue->rskq_defer_accept;
-
budget = 2 * (lopt->nr_table_entries / (timeout / interval));
i = lopt->clock_hand;
@@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
- && (inet_rsk(req)->acked ||
- !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) {
+ if (req->retrans < thresh &&
+ !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
unsigned long timeo;
if (req->retrans++ == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 071e83a..e0fbc25 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2105,15 +2105,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
case TCP_DEFER_ACCEPT:
- icsk->icsk_accept_queue.rskq_defer_accept = 0;
- if (val > 0) {
- /* Translate value in seconds to number of
- * retransmits */
- while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
- val > ((TCP_TIMEOUT_INIT / HZ) <<
- icsk->icsk_accept_queue.rskq_defer_accept))
- icsk->icsk_accept_queue.rskq_defer_accept++;
- icsk->icsk_accept_queue.rskq_defer_accept++;
+ if (val < 0) {
+ err = -EINVAL;
+ } else {
+ if (val > MAX_TCP_ACCEPT_DEFERRED)
+ val = MAX_TCP_ACCEPT_DEFERRED;
+ icsk->icsk_accept_queue.rskq_defer_accept = val;
}
break;
@@ -2295,8 +2292,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
- ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
+ val = icsk->icsk_accept_queue.rskq_defer_accept;
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 19c449f..11aea98 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4447,6 +4447,49 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
}
}
+static int tcp_defer_accept_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->defer_tcp_accept.request) {
+ int queued_data = tp->rcv_nxt - tp->copied_seq;
+ int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ?
+ tcp_hdr((struct sk_buff *)
+ sk->sk_receive_queue.prev)->fin : 0;
+
+ if (queued_data && hasfin)
+ queued_data--;
+
+ if (queued_data &&
+ tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
+ if (sock_flag(sk, SOCK_KEEPOPEN)) {
+ inet_csk_reset_keepalive_timer(sk,
+ keepalive_time_when(tp));
+ } else {
+ inet_csk_delete_keepalive_timer(sk);
+ }
+
+ inet_csk_reqsk_queue_add(
+ tp->defer_tcp_accept.listen_sk,
+ tp->defer_tcp_accept.request,
+ sk);
+
+ tp->defer_tcp_accept.listen_sk->sk_data_ready(
+ tp->defer_tcp_accept.listen_sk, 0);
+
+ sock_put(tp->defer_tcp_accept.listen_sk);
+ sock_put(sk);
+ tp->defer_tcp_accept.listen_sk = NULL;
+ tp->defer_tcp_accept.request = NULL;
+ } else if (hasfin ||
+ tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
+ tcp_reset(sk);
+ return -1;
+ }
+ }
+ return 0;
+}
+
static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4807,6 +4850,9 @@ step5:
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
+
+ if (tcp_defer_accept_check(sk))
+ return -1;
return 0;
csum_error:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 00156bf..d4c9d03 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1921,6 +1921,14 @@ int tcp_v4_destroy_sock(struct sock *sk)
sk->sk_sndmsg_page = NULL;
}
+ if (tp->defer_tcp_accept.request) {
+ reqsk_free(tp->defer_tcp_accept.request);
+ sock_put(tp->defer_tcp_accept.listen_sk);
+ sock_put(sk);
+ tp->defer_tcp_accept.listen_sk = NULL;
+ tp->defer_tcp_accept.request = NULL;
+ }
+
atomic_dec(&tcp_sockets_allocated);
return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b61b768..ab519d8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -569,10 +569,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
- If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
- bare ACK. Otherwise, we create an established connection. Both
- ends (listening sockets) accept the new incoming connection and try
- to talk to each other. 8-)
+ Both ends (listening sockets) accept the new incoming
+ connection and try to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
@@ -640,13 +638,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
- /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
- if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
- TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
- inet_rsk(req)->acked = 1;
- return NULL;
- }
-
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -685,7 +676,24 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
+ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+
+ /* the accept queue handling is done is est recv slow
+ * path so lets make sure to start there
+ */
+ tcp_sk(child)->pred_flags = 0;
+ sock_hold(sk);
+ sock_hold(child);
+ tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
+ tcp_sk(child)->defer_tcp_accept.request = req;
+
+ inet_csk_reset_keepalive_timer(child,
+ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
+ } else {
+ inet_csk_reqsk_queue_add(sk, req, child);
+ }
+
return child;
listen_overflow:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 803d758..160d16f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
goto death;
}
+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ goto death;
+ }
+
if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
goto out;
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2008-03-02 21:55 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-01 20:59 [PATCH 1/3] TCP_DEFER_ACCEPT updates - defer timeout conflics with max_thresh Patrick McManus
2008-03-01 21:01 ` [PATCH 2/3] TCP_DEFER_ACCEPT updates - dont retxmt synack Patrick McManus
2008-03-01 21:05 ` [PATCH 3/3 (spin 2)] TCP_DEFER_ACCEPT updates - process as established Patrick McManus
2008-03-01 22:03 ` Ilpo Järvinen
2008-03-02 21:56 ` [PATCH 3/3 (spin 3)] " Patrick McManus
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).