All of lore.kernel.org
 help / color / mirror / Atom feed
From: Patrick McManus <mcmanus@ducksong.com>
To: "netdev@vger.kernel.org" <netdev@vger.kernel.org>
Subject: RFC [PATCH 3/3] TCP_DEFER_ACCEPT updates: more accurate timers and resets
Date: Tue, 26 Feb 2008 20:49:33 -0500	[thread overview]
Message-ID: <1204076973.15970.37.camel@tng> (raw)

Signed-off-by: Patrick McManus <mcmanus@ducksong.com>

  Change TCP_DEFER_ACCEPT implementation so that it transitions a
    connection to ESTABLISHED after handshake is complete instead of
    leaving it in SYN-RECV until some data arrives. Place connection in
    accept queue when first data packet arrives from slow path.

    Benefits:
     - established connection is now reset if it never makes it to the accept
     queue

     - diagnostic state of established matches with the packet traces
       showing completed handshake

    - TCP_DEFER_ACCEPT timeouts are expressed in seconds and can now be
      enforced with reasonable accuracy instead of rounding up to next
      exponential back-off of syn-ack retry.

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 08027f1..9b3ffda 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -239,6 +239,14 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
        return (struct tcp_request_sock *)req;
 }

+struct tcp_deferred_accept_info
+{
+       struct sock *listen_sk;
+       struct sock *child_sk;
+       struct request_sock *request;
+};
+
+
 struct tcp_sock {
        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock     inet_conn;
@@ -374,6 +382,8 @@ struct tcp_sock {
        unsigned int            keepalive_intvl;  /* time interval between keep alive probes */
        int                     linger2;

+       struct tcp_deferred_accept_info defer_tcp_accept;
+
        unsigned long last_synq_overflow;

        u32     tso_deferred;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 70013c5..ed59cb0 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -72,8 +72,7 @@ struct inet_request_sock {
                                tstamp_ok  : 1,
                                sack_ok    : 1,
                                wscale_ok  : 1,
-                               ecn_ok     : 1,
-                               acked      : 1;
+                               ecn_ok     : 1;
        struct ip_options       *opt;
 };

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index cff4608..389abc4 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -116,8 +116,8 @@ struct request_sock_queue {
        struct request_sock     *rskq_accept_head;
        struct request_sock     *rskq_accept_tail;
        rwlock_t                syn_wait_lock;
-       u8                      rskq_defer_accept;
-       /* 3 bytes hole, try to pack */
+       u16                     rskq_defer_accept;
+       /* 2 bytes hole, try to pack */
        struct listen_sock      *listen_opt;
 };

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de4ea3..dd3df5d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -138,6 +138,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define MAX_TCP_KEEPINTVL      32767
 #define MAX_TCP_KEEPCNT                127
 #define MAX_TCP_SYNCNT         127
+#define MAX_TCP_ACCEPT_DEFERRED 65535

 #define TCP_SYNQ_INTERVAL      (HZ/5)  /* Period of SYNACK timer */

@@ -955,7 +956,6 @@ static inline void tcp_openreq_init(struct request_sock *req,
        ireq->sack_ok = rx_opt->sack_ok;
        ireq->snd_wscale = rx_opt->snd_wscale;
        ireq->wscale_ok = rx_opt->wscale_ok;
-       ireq->acked = 0;
        ireq->ecn_ok = 0;
        ireq->rmt_port = tcp_hdr(skb)->source;
 }
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7216f5e..80e03b3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -414,8 +414,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
        struct inet_connection_sock *icsk = inet_csk(parent);
        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
        struct listen_sock *lopt = queue->listen_opt;
-       int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
-       int thresh = max_retries;
+       int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
        unsigned long now = jiffies;
        struct request_sock **reqp, *req;
        int i, budget;
@@ -451,9 +450,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
                }
        }

-       if (queue->rskq_defer_accept)
-               max_retries = queue->rskq_defer_accept;
-
        budget = 2 * (lopt->nr_table_entries / (timeout / interval));
        i = lopt->clock_hand;

@@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
                reqp=&lopt->syn_table[i];
                while ((req = *reqp) != NULL) {
                        if (time_after_eq(now, req->expires)) {
-                               if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh))
-                                   &&  (inet_rsk(req)->acked ||
-                                        !req->rsk_ops->rtx_syn_ack(parent, req, NULL))) {
+                               if (req->retrans < thresh
+                                   && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
                                        unsigned long timeo;

                                        if (req->retrans++ == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 071e83a..3133259 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2105,16 +2105,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                break;

        case TCP_DEFER_ACCEPT:
-               icsk->icsk_accept_queue.rskq_defer_accept = 0;
-               if (val > 0) {
-                       /* Translate value in seconds to number of
-                        * retransmits */
-                       while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
-                              val > ((TCP_TIMEOUT_INIT / HZ) <<
-                                      icsk->icsk_accept_queue.rskq_defer_accept))
-                               icsk->icsk_accept_queue.rskq_defer_accept++;
-                       icsk->icsk_accept_queue.rskq_defer_accept++;
-               }
+               if (val < 0 || val > MAX_TCP_ACCEPT_DEFERRED)
+                       err = -EINVAL;
+               else
+                       icsk->icsk_accept_queue.rskq_defer_accept = val;
+
                break;

        case TCP_WINDOW_CLAMP:
@@ -2295,8 +2290,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                        val = (val ? : sysctl_tcp_fin_timeout) / HZ;
                break;
        case TCP_DEFER_ACCEPT:
-               val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
-                       ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
+               val = icsk->icsk_accept_queue.rskq_defer_accept;
                break;
        case TCP_WINDOW_CLAMP:
                val = tp->window_clamp;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 19c449f..979b2d3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4807,6 +4807,41 @@ step5:

        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
+
+       if (tp->defer_tcp_accept.request ) {
+
+               if ((!th->fin) &&
+                   tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
+
+                       if (sock_flag(sk, SOCK_KEEPOPEN)) {
+                               inet_csk_reset_keepalive_timer(sk,
+                                                              keepalive_time_when(tp));
+                       }
+                       else {
+                               inet_csk_delete_keepalive_timer(sk);
+                       }
+
+                       inet_csk_reqsk_queue_add(
+                               tp->defer_tcp_accept.listen_sk,
+                               tp->defer_tcp_accept.request,
+                               tp->defer_tcp_accept.child_sk);
+
+                       tp->defer_tcp_accept.listen_sk->sk_data_ready(
+                               tp->defer_tcp_accept.listen_sk, 0);
+
+                       sock_put (tp->defer_tcp_accept.listen_sk);
+                       sock_put (tp->defer_tcp_accept.child_sk);
+                       tp->defer_tcp_accept.listen_sk = NULL;
+                       tp->defer_tcp_accept.request = NULL;
+                       tp->defer_tcp_accept.child_sk = NULL;
+               }
+               else
+               {
+                       tcp_reset(sk);
+                       return 1;
+               }
+       }
+
        return 0;

 csum_error:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 00156bf..cf20744 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1921,6 +1921,17 @@ int tcp_v4_destroy_sock(struct sock *sk)
                sk->sk_sndmsg_page = NULL;
        }

+       if (tp->defer_tcp_accept.request) {
+
+               reqsk_free(tp->defer_tcp_accept.request);
+               sock_put (tp->defer_tcp_accept.listen_sk);
+               sock_put (tp->defer_tcp_accept.child_sk);
+
+               tp->defer_tcp_accept.listen_sk = NULL;
+               tp->defer_tcp_accept.request = NULL;
+               tp->defer_tcp_accept.child_sk = NULL;
+       }
+
        atomic_dec(&tcp_sockets_allocated);

        return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b61b768..9dfbb76 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -569,10 +569,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.

-          If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
-          bare ACK.  Otherwise, we create an established connection.  Both
-          ends (listening sockets) accept the new incoming connection and try
-          to talk to each other. 8-)
+          Both ends (listening sockets) accept the new incoming
+          connection and try to talk to each other. 8-)

           Note: This case is both harmless, and rare.  Possibility is about the
           same as us discovering intelligent life on another plant tomorrow.
@@ -640,13 +638,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
                if (!(flg & TCP_FLAG_ACK))
                        return NULL;

-               /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
-               if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
-                   TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-                       inet_rsk(req)->acked = 1;
-                       return NULL;
-               }
-
                /* OK, ACK is valid, create big socket and
                 * feed this segment to it. It will repeat all
                 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -685,12 +676,31 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
                inet_csk_reqsk_queue_unlink(sk, req, prev);
                inet_csk_reqsk_queue_removed(sk, req);

-               inet_csk_reqsk_queue_add(sk, req, child);
+               if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+                   TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+
+                       /* the accept queue handling is done in est recv slow path
+                        * so lets make sure to start there
+                        */
+                       tcp_sk(child)->pred_flags = 0;
+
+                       sock_hold (sk);
+                       sock_hold (child);
+                       tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
+                       tcp_sk(child)->defer_tcp_accept.child_sk = child;
+                       tcp_sk(child)->defer_tcp_accept.request = req;
+
+                       inet_csk_reset_keepalive_timer (child,
+                                                       inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
+               }
+               else {
+                       inet_csk_reqsk_queue_add(sk, req, child);
+               }
+
                return child;

        listen_overflow:
                if (!sysctl_tcp_abort_on_overflow) {
-                       inet_rsk(req)->acked = 1;
                        return NULL;
                }

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 803d758..5faad8b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -481,6 +481,13 @@ static void tcp_keepalive_timer (unsigned long data)
                goto death;
        }

+
+       if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED)
+       {
+               tcp_send_active_reset(sk, GFP_ATOMIC);
+               goto death;
+       }
+
        if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
                goto out;




             reply	other threads:[~2008-02-27  1:55 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-02-27  1:49 Patrick McManus [this message]
2008-03-01 10:53 ` RFC [PATCH 3/3] TCP_DEFER_ACCEPT updates: more accurate timers and resets Eric Dumazet
2008-03-01 20:59   ` Patrick McManus

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1204076973.15970.37.camel@tng \
    --to=mcmanus@ducksong.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.