From: Eric Dumazet <eric.dumazet@gmail.com>
To: David Miller <davem@davemloft.net>
Cc: kdakhane@gmail.com, netdev@vger.kernel.org,
netfilter@vger.kernel.org, zbr@ioremap.net,
Evgeniy Polyakov <zbr@ioremap.net>
Subject: [PATCH net-next-2.6] tcp: connect() race with timewait reuse
Date: Wed, 02 Dec 2009 16:08:59 +0100 [thread overview]
Message-ID: <4B16830B.4010801@gmail.com> (raw)
In-Reply-To: <4B164293.7070804@gmail.com>
Eric Dumazet a écrit :
> Eric Dumazet a écrit :
>> But even if sysctl_tw_reuse is cleared, we might trigger the bug if
>> local port is bound to a value.
>
> Oh well, that's more subtle than that.
>
> __inet_check_established() is called not only with bh disabled,
> but also with a lock on bind list if twp != NULL.
>
> However, if twp is NULL, lock is not held by caller.
>
> [ Thats the final
> ret = check_established(death_row, sk, snum, NULL);
> in __inet_hash_connect()]
>
> So triggering this bug with tw_reuse clear is tricky :
>
> You need several threads, using sockets with REUSEADDR set,
> and bind() to same address/port before connect() to same target.
>
> We need another patch to correct this.
>
Here is a separate patch for this issue, cooked on top of net-next-2.6
for testing purposes, and public discussion.
Thanks
[PATCH net-next-2.6] tcp: connect() race with timewait reuse
It's currently possible for several threads issuing a connect() to find the same
timewait socket and try to reuse it, leading to list corruption.
The condition for the bug is that these threads bound their sockets to the same address/port
as the timewait socket to be found, and connected to the same target (SO_REUSEADDR is needed).
To fix this problem, we could unhash the timewait socket while holding the ehash lock,
to make sure lookups/changes are serialized. Only the first thread finds the timewait
socket; the other ones find the established socket and return an EADDRNOTAVAIL error.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/net/inet_timewait_sock.h | 2 +
net/ipv4/inet_hashtables.c | 7 +++--
net/ipv4/inet_timewait_sock.c | 36 ++++++++++++++++++++---------
net/ipv6/inet6_hashtables.c | 12 +++++----
4 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 773b10f..59c80a0 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -199,6 +199,8 @@ static inline __be32 inet_rcv_saddr(const struct sock *sk)
extern void inet_twsk_put(struct inet_timewait_sock *tw);
+extern void inet_twsk_unhash(struct inet_timewait_sock *tw);
+
extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
const int state);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 94ef51a..143ddb4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -318,20 +318,21 @@ unique:
sk->sk_hash = hash;
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
inet_twsk_put(tw);
}
-
return 0;
not_unique:
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1f5d508..680d09b 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,6 +14,21 @@
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+
+/*
+ * unhash a timewait socket from established hash
+ * lock must be held by caller
+ */
+void inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+ if (hlist_nulls_unhashed(&tw->tw_node))
+ return;
+
+ hlist_nulls_del_rcu(&tw->tw_node);
+ sk_nulls_node_init(&tw->tw_node);
+ inet_twsk_put(tw);
+}
+
/* Must be called with locally disabled BHs. */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
@@ -24,12 +39,9 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
spin_lock(lock);
- if (hlist_nulls_unhashed(&tw->tw_node)) {
- spin_unlock(lock);
- return;
- }
- hlist_nulls_del_rcu(&tw->tw_node);
- sk_nulls_node_init(&tw->tw_node);
+
+ inet_twsk_unhash(tw);
+
spin_unlock(lock);
/* Disassociate with bind bucket. */
@@ -37,9 +49,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ if (tb) {
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ }
spin_unlock(&bhead->lock);
#ifdef SOCK_REFCNT_DEBUG
if (atomic_read(&tw->tw_refcnt) != 1) {
@@ -47,7 +61,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
}
#endif
- inet_twsk_put(tw);
+ if (tb)
+ inet_twsk_put(tw);
}
static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -92,6 +107,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ atomic_inc(&tw->tw_refcnt);
spin_unlock(&bhead->lock);
spin_lock(lock);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 00c6a3e..3681c00 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -250,19 +250,21 @@ unique:
* in hash table socket with a funny identity. */
inet->inet_num = lport;
inet->inet_sport = htons(lport);
+ sk->sk_hash = hash;
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
- sk->sk_hash = hash;
+ if (tw) {
+ inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- if (twp != NULL) {
+ if (twp) {
*twp = tw;
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw != NULL) {
+ } else if (tw) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
inet_twsk_put(tw);
}
next prev parent reply other threads:[~2009-12-02 15:08 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-12-01 2:02 soft lockup in inet_csk_get_port kapil dakhane
2009-12-01 6:10 ` Eric Dumazet
2009-12-01 15:00 ` [PATCH] tcp: Fix a connect() race with timewait sockets Eric Dumazet
2009-12-02 8:59 ` David Miller
2009-12-02 9:23 ` Eric Dumazet
2009-12-02 10:33 ` Eric Dumazet
2009-12-02 11:32 ` Evgeniy Polyakov
2009-12-02 19:18 ` kapil dakhane
2009-12-03 2:43 ` kapil dakhane
2009-12-03 10:49 ` [PATCH] tcp: fix a timewait refcnt race Eric Dumazet
2009-12-04 0:19 ` David Miller
2009-12-04 3:20 ` kapil dakhane
2009-12-04 6:29 ` Eric Dumazet
2009-12-04 6:39 ` David Miller
2009-12-02 15:08 ` Eric Dumazet [this message]
2009-12-02 22:15 ` [PATCH net-next-2.6] tcp: connect() race with timewait reuse Evgeniy Polyakov
2009-12-03 6:44 ` Eric Dumazet
2009-12-03 8:31 ` Eric Dumazet
2009-12-03 23:22 ` Evgeniy Polyakov
2009-12-04 0:18 ` David Miller
2009-12-02 16:05 ` [PATCH] tcp: Fix a connect() race with timewait sockets Ashwani Wason
2009-12-03 6:38 ` David Miller
2009-12-04 13:45 ` [PATCH 0/2] tcp: Fix connect() races " Eric Dumazet
2009-12-04 13:46 ` [PATCH 1/2] tcp: Fix a connect() race " Eric Dumazet
2009-12-05 21:21 ` Evgeniy Polyakov
2009-12-07 9:59 ` [PATCH] tcp: documents timewait refcnt tricks Eric Dumazet
2009-12-07 16:06 ` Randy Dunlap
2009-12-09 4:20 ` David Miller
2009-12-09 4:18 ` [PATCH 1/2] tcp: Fix a connect() race with timewait sockets David Miller
2009-12-04 13:47 ` [PATCH 2/2] " Eric Dumazet
2009-12-09 4:19 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4B16830B.4010801@gmail.com \
--to=eric.dumazet@gmail.com \
--cc=davem@davemloft.net \
--cc=kdakhane@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=netfilter@vger.kernel.org \
--cc=zbr@ioremap.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).