From: Eric Dumazet <dada1@cosmosbay.com>
To: "David S. Miller" <davem@davemloft.net>, netdev@oss.sgi.com
Subject: [PATCH] no more rwlock_t inside tcp_ehash_bucket
Date: Tue, 15 Mar 2005 17:13:11 +0100
Message-ID: <42370997.6010302@cosmosbay.com>
A machine with a large tcp_ehash spends half of the table's memory storing rwlocks. That hardly seems necessary:
- Most accesses to tcp_ehash[] are done with read_lock(), so dirtying memory all over the hash table (because of the embedded rwlocks) is not SMP friendly. Having the rwlock and the chain head in the same cache line is not a real gain either, since every lock operation dirties that line and pulls it into a CPU's cache in exclusive state.
This patch uses an array of 256 rwlocks instead. Benefits:
- less memory used (or twice as many hash table entries if the table size was capped at (2^MAX_ORDER)*PAGE_SIZE)
- less memory touched during read_lock()/read_unlock()
- better sharing of hash entries between CPUs
Tests on a production machine (1 million slots, 8 MB table) showed no performance difference (oprofile).
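To make the memory savings concrete, here is a minimal userspace sketch of the before/after bucket layouts. The 4-byte lock and pointer sizes assume a 32-bit box, and the struct and type names are mine; this is illustrative only, not kernel code:

#include <stdio.h>

struct hlist_head { void *first; };      /* 4 bytes on a 32-bit box */
typedef int fake_rwlock_t;               /* 4-byte stand-in for rwlock_t */

struct old_ehash_bucket {                /* before the patch */
        fake_rwlock_t lock;
        struct hlist_head chain;
} __attribute__((__aligned__(8)));       /* 8 bytes per bucket */

struct new_ehash_bucket {                /* after the patch */
        struct hlist_head chain;         /* 4 bytes per bucket */
};

int main(void)
{
        unsigned long n = 1024UL * 1024; /* 1 million buckets, as in the test */
        printf("old: %lu MB, new: %lu MB (+ 256 shared rwlocks)\n",
               n * sizeof(struct old_ehash_bucket) >> 20,
               n * sizeof(struct new_ehash_bucket) >> 20);
        return 0;
}

Under these assumptions it prints "old: 8 MB, new: 4 MB (+ 256 shared rwlocks)": the lock words really were half the table.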
6 files changed:
- net/ipv4/tcp_diag.c
- net/ipv4/tcp_minisocks.c
- net/ipv4/tcp_ipv4.c
- net/ipv4/tcp.c
- include/net/tcp.h
- include/net/sock.h
(I also converted __tcp_bhash_size, __tcp_ehash_size and sk_hashent to 'unsigned int' to let the compiler choose better code.)
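A quick standalone illustration of that point (my own example, not from the patch): with a signed h, h % 256 must produce a negative result for negative h, so the compiler has to emit sign-fixup code, whereas for an unsigned h it can reduce the modulo to a plain mask (h & 255):

#include <stdio.h>

static int sidx(int h)                   { return h % 256; } /* needs sign fixup */
static unsigned int uidx(unsigned int h) { return h % 256; } /* just an AND mask */

int main(void)
{
        /* -1 % 256 is -1 in C, so sidx() cannot be a plain mask,
         * while uidx(0xffffffff) is simply 255. */
        printf("%d %u\n", sidx(-1), uidx(0xffffffffu));
        return 0;
}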
Eric Dumazet
# diff -Nru linux-2.6.11 linux-2.6.11-ed
diff -Nru linux-2.6.11/include/net/sock.h linux-2.6.11-ed/include/net/sock.h
--- linux-2.6.11/include/net/sock.h 2005-03-02 08:38:17.000000000 +0100
+++ linux-2.6.11-ed/include/net/sock.h 2005-03-15 17:02:52.000000000 +0100
@@ -217,7 +217,7 @@
unsigned char sk_no_largesend;
int sk_route_caps;
unsigned long sk_lingertime;
- int sk_hashent;
+ unsigned int sk_hashent;
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
diff -Nru linux-2.6.11/include/net/tcp.h linux-2.6.11-ed/include/net/tcp.h
--- linux-2.6.11/include/net/tcp.h 2005-03-02 08:37:48.000000000 +0100
+++ linux-2.6.11-ed/include/net/tcp.h 2005-03-15 15:06:31.000000000 +0100
@@ -44,9 +44,14 @@
* for the rest. I'll experiment with dynamic table growth later.
*/
struct tcp_ehash_bucket {
- rwlock_t lock;
struct hlist_head chain;
-} __attribute__((__aligned__(8)));
+};
+
+/*
+ * instead of using one rwlock_t for each ehash_bucket, use a table of 256 rwlock_t
+ */
+#define TCP_EHASH_RWLOCK_SZ 256
+extern rwlock_t tcp_ehash_lock[TCP_EHASH_RWLOCK_SZ];
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
@@ -106,6 +111,7 @@
return hlist_empty(&head->chain) ? NULL : __tb_head(head);
}
+
extern struct tcp_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
@@ -122,8 +128,9 @@
*/
struct tcp_bind_hashbucket *__tcp_bhash;
- int __tcp_bhash_size;
- int __tcp_ehash_size;
+ unsigned int __tcp_bhash_size;
+ unsigned int __tcp_ehash_size;
+
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
diff -Nru linux-2.6.11/net/ipv4/tcp.c linux-2.6.11-ed/net/ipv4/tcp.c
--- linux-2.6.11/net/ipv4/tcp.c 2005-03-15 14:50:53.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp.c 2005-03-15 15:08:28.000000000 +0100
@@ -2309,10 +2309,10 @@
0);
tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
for (i = 0; i < (tcp_ehash_size << 1); i++) {
- rwlock_init(&tcp_ehash[i].lock);
INIT_HLIST_HEAD(&tcp_ehash[i].chain);
}
-
+ for (i = 0; i < TCP_EHASH_RWLOCK_SZ; i++)
+ rwlock_init(&tcp_ehash_lock[i]);
tcp_bhash = (struct tcp_bind_hashbucket *)
alloc_large_system_hash("TCP bind",
sizeof(struct tcp_bind_hashbucket),
diff -Nru linux-2.6.11/net/ipv4/tcp_diag.c linux-2.6.11-ed/net/ipv4/tcp_diag.c
--- linux-2.6.11/net/ipv4/tcp_diag.c 2005-03-02 08:37:31.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_diag.c 2005-03-15 15:22:30.000000000 +0100
@@ -666,7 +666,7 @@
if (i > s_i)
s_num = 0;
- read_lock_bh(&head->lock);
+ read_lock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
num = 0;
sk_for_each(sk, node, &head->chain) {
@@ -682,7 +682,7 @@
if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
goto next_normal;
if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
goto done;
}
next_normal:
@@ -703,14 +703,14 @@
r->id.tcpdiag_dport)
goto next_dying;
if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
goto done;
}
next_dying:
++num;
}
}
- read_unlock_bh(&head->lock);
+ read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
}
done:
diff -Nru linux-2.6.11/net/ipv4/tcp_ipv4.c linux-2.6.11-ed/net/ipv4/tcp_ipv4.c
--- linux-2.6.11/net/ipv4/tcp_ipv4.c 2005-03-02 08:37:54.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_ipv4.c 2005-03-15 17:02:52.000000000 +0100
@@ -96,6 +96,8 @@
.__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
};
+rwlock_t tcp_ehash_lock[TCP_EHASH_RWLOCK_SZ];
+
/*
* This array holds the first and last local port number.
* For high-usage systems, use sysctl to change this to
@@ -104,16 +106,16 @@
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+static __inline__ unsigned int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
{
- int h = (laddr ^ lport) ^ (faddr ^ fport);
+ unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
h ^= h >> 16;
h ^= h >> 8;
return h & (tcp_ehash_size - 1);
}
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
+static __inline__ unsigned int tcp_sk_hashfn(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
__u32 laddr = inet->rcv_saddr;
@@ -360,7 +362,7 @@
tcp_listen_wlock();
} else {
list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
+ lock = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
write_lock(lock);
}
__sk_add_node(sk, list);
@@ -391,9 +393,8 @@
tcp_listen_wlock();
lock = &tcp_lhash_lock;
} else {
- struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
- lock = &head->lock;
- write_lock_bh(&head->lock);
+ lock = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
+ write_lock_bh(lock);
}
if (__sk_del_node_init(sk))
@@ -492,9 +493,9 @@
/* Optimize here for direct hit, only listening connections can
* have wildcards anyways.
*/
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ unsigned int hash = tcp_hashfn(daddr, hnum, saddr, sport);
head = &tcp_ehash[hash];
- read_lock(&head->lock);
+ read_lock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
sk_for_each(sk, node, &head->chain) {
if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
@@ -507,7 +508,7 @@
}
sk = NULL;
out:
- read_unlock(&head->lock);
+ read_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
return sk;
hit:
sock_hold(sk);
@@ -555,13 +556,13 @@
int dif = sk->sk_bound_dev_if;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
+ unsigned int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
struct tcp_ehash_bucket *head = &tcp_ehash[hash];
struct sock *sk2;
struct hlist_node *node;
struct tcp_tw_bucket *tw;
- write_lock(&head->lock);
+ write_lock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
@@ -616,7 +617,7 @@
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
if (twp) {
*twp = tw;
@@ -632,7 +633,7 @@
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
return -EADDRNOTAVAIL;
}
@@ -2224,7 +2225,7 @@
/* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- read_lock(&tcp_ehash[st->bucket].lock);
+ read_lock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
@@ -2241,7 +2242,7 @@
rc = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2268,14 +2269,14 @@
cur = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* We can reschedule between buckets: */
cond_resched_softirq();
if (++st->bucket < tcp_ehash_size) {
- read_lock(&tcp_ehash[st->bucket].lock);
+ read_lock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
sk = sk_head(&tcp_ehash[st->bucket].chain);
} else {
cur = NULL;
@@ -2385,7 +2386,7 @@
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
local_bh_enable();
break;
}
diff -Nru linux-2.6.11/net/ipv4/tcp_minisocks.c linux-2.6.11-ed/net/ipv4/tcp_minisocks.c
--- linux-2.6.11/net/ipv4/tcp_minisocks.c 2005-03-02 08:38:17.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_minisocks.c 2005-03-15 15:21:05.000000000 +0100
@@ -66,14 +66,14 @@
/* Unlink from established hashes. */
ehead = &tcp_ehash[tw->tw_hashent];
- write_lock(&ehead->lock);
+ write_lock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ write_unlock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
return;
}
__hlist_del(&tw->tw_node);
sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ write_unlock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
/* Disassociate with bind bucket. */
bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
@@ -297,6 +297,7 @@
static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
+ rwlock_t *rwp = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
struct tcp_bind_hashbucket *bhead;
/* Step 1: Put TW into bind hash. Original socket stays there too.
@@ -310,7 +311,7 @@
tw_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ write_lock(rwp);
/* Step 2: Remove SK from established hash. */
if (__sk_del_node_init(sk))
@@ -320,7 +321,7 @@
tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
atomic_inc(&tw->tw_refcnt);
- write_unlock(&ehead->lock);
+ write_unlock(rwp);
}
/*
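Finally, for anyone who wants to play with the striping scheme outside the kernel, here is a self-contained userspace analogue using pthread rwlocks. The names and sizes are illustrative; this only mirrors the idea of the patch, it is not kernel code:

#include <pthread.h>
#include <stdio.h>

#define EHASH_SIZE      (1 << 20)  /* 1 million chains, as in the test above */
#define EHASH_RWLOCK_SZ 256

struct bucket { void *chain; };    /* chain head only, no embedded lock */

static struct bucket    ehash[EHASH_SIZE];
static pthread_rwlock_t ehash_lock[EHASH_RWLOCK_SZ];

/* Buckets i, i+256, i+512, ... all share lock i: readers and writers
 * touch 256 shared lock words instead of one lock word per bucket. */
static pthread_rwlock_t *lock_of(unsigned int hash)
{
        return &ehash_lock[hash % EHASH_RWLOCK_SZ];
}

static void lookup(unsigned int hash)
{
        pthread_rwlock_rdlock(lock_of(hash));
        void *head = ehash[hash & (EHASH_SIZE - 1)].chain; /* walk chain here */
        (void)head;
        pthread_rwlock_unlock(lock_of(hash));
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < EHASH_RWLOCK_SZ; i++)
                pthread_rwlock_init(&ehash_lock[i], NULL);

        lookup(12345);
        printf("bucket 12345 maps to lock %u\n", 12345 % EHASH_RWLOCK_SZ);
        return 0;
}

Build with "cc -O2 analogue.c -lpthread". The kernel patch does exactly the same mapping, only with rwlock_t and the tcp_ehash[] chains.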