netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] no more rwlock_t inside tcp_ehash_bucket
@ 2005-03-15 16:13 Eric Dumazet
  2005-03-15 18:32 ` David S. Miller
  0 siblings, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-03-15 16:13 UTC (permalink / raw)
  To: David S. Miller, netdev

A machine with large tcp_ehash uses half the size of the hash to store rwlocks.

That does not seem really necessary:

- Most accesses to tcp_ehash[] are done with read_locks(), so dirtying the memory (because of embedded rwlocks) all over the hash table is not 
SMP friendly. The cache hit given by the fact that the rwlock and chain were in the same cache line is not really a gain, because of the 
dirtying of the cache line (and exclusive use because of lock operations)

I suggest in this patch using an array of 256 rwlocks.

- Less memory used (or 2 x more entries in hash table if size >= (2^MAX_ORDER)*PAGE_SIZE)
- less memory touched during read_lock()/read_unlock()
- Better sharing of hash entries between cpus

tests done on a production machine (1 million slots, size 8MB). No differences found in performance (oprofile).

6 files changed:

- net/ipv4/tcp_diag.c
- net/ipv4/tcp_minisocks.c
- net/ipv4/tcp_ipv4.c
- net/ipv4/tcp.c
- include/net/tcp.h
- include/net/sock.h

(I also converted __tcp_bhash_size, __tcp_ehash_size and sk_hashent to 'unsigned int' to let the compiler choose better code)

Eric Dumazet

# diff -Nru linux-2.6.11 linux-2.6.11-ed
diff -Nru linux-2.6.11/include/net/sock.h linux-2.6.11-ed/include/net/sock.h
--- linux-2.6.11/include/net/sock.h     2005-03-02 08:38:17.000000000 +0100
+++ linux-2.6.11-ed/include/net/sock.h  2005-03-15 17:02:52.000000000 +0100
@@ -217,7 +217,7 @@
         unsigned char           sk_no_largesend;
         int                     sk_route_caps;
         unsigned long           sk_lingertime;
-       int                     sk_hashent;
+       unsigned int                    sk_hashent;
         /*
          * The backlog queue is special, it is always used with
          * the per-socket spinlock held and requires low latency

diff -Nru linux-2.6.11/include/net/tcp.h linux-2.6.11-ed/include/net/tcp.h
--- linux-2.6.11/include/net/tcp.h      2005-03-02 08:37:48.000000000 +0100
+++ linux-2.6.11-ed/include/net/tcp.h   2005-03-15 15:06:31.000000000 +0100
@@ -44,9 +44,14 @@
   * for the rest.  I'll experiment with dynamic table growth later.
   */
  struct tcp_ehash_bucket {
-       rwlock_t          lock;
         struct hlist_head chain;
-} __attribute__((__aligned__(8)));
+} ;
+
+/*
+ * instead of using one rwlock_t for each ehash_bucket, use a table of 256 rwlock_t
+ */
+#define TCP_EHASH_RWLOCK_SZ 256
+extern rwlock_t tcp_ehash_lock[TCP_EHASH_RWLOCK_SZ] ;

  /* This is for listening sockets, thus all sockets which possess wildcards. */
  #define TCP_LHTABLE_SIZE       32      /* Yes, really, this is all you need. */
@@ -106,6 +111,7 @@
         return hlist_empty(&head->chain) ? NULL : __tb_head(head);
  }

+
  extern struct tcp_hashinfo {
         /* This is for sockets with full identity only.  Sockets here will
          * always be without wildcards and will have the following invariant:
@@ -122,8 +128,9 @@
          */
         struct tcp_bind_hashbucket *__tcp_bhash;

-       int __tcp_bhash_size;
-       int __tcp_ehash_size;
+       unsigned int __tcp_bhash_size;
+       unsigned int __tcp_ehash_size;
+

         /* All sockets in TCP_LISTEN state will be in here.  This is the only
          * table where wildcard'd TCP sockets can exist.  Hash function here

diff -Nru linux-2.6.11/net/ipv4/tcp.c linux-2.6.11-ed/net/ipv4/tcp.c
--- linux-2.6.11/net/ipv4/tcp.c 2005-03-15 14:50:53.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp.c      2005-03-15 15:08:28.000000000 +0100
@@ -2309,10 +2309,10 @@
                                         0);
         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
         for (i = 0; i < (tcp_ehash_size << 1); i++) {
-               rwlock_init(&tcp_ehash[i].lock);
                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
         }
-
+       for (i = 0; i < TCP_EHASH_RWLOCK_SZ; i++)
+               rwlock_init(&tcp_ehash_lock[i]);
         tcp_bhash = (struct tcp_bind_hashbucket *)
                 alloc_large_system_hash("TCP bind",
                                         sizeof(struct tcp_bind_hashbucket),

diff -Nru linux-2.6.11/net/ipv4/tcp_diag.c linux-2.6.11-ed/net/ipv4/tcp_diag.c
--- linux-2.6.11/net/ipv4/tcp_diag.c    2005-03-02 08:37:31.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_diag.c 2005-03-15 15:22:30.000000000 +0100
@@ -666,7 +666,7 @@
                 if (i > s_i)
                         s_num = 0;

-               read_lock_bh(&head->lock);
+               read_lock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);

                 num = 0;
                 sk_for_each(sk, node, &head->chain) {
@@ -682,7 +682,7 @@
                         if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
                                 goto next_normal;
                         if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-                               read_unlock_bh(&head->lock);
+                               read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
                                 goto done;
                         }
  next_normal:
@@ -703,14 +703,14 @@
                                     r->id.tcpdiag_dport)
                                         goto next_dying;
                                 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-                                       read_unlock_bh(&head->lock);
+                                       read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
                                         goto done;
                                 }
  next_dying:
                                 ++num;
                         }
                 }
-               read_unlock_bh(&head->lock);
+               read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
         }

  done:

diff -Nru linux-2.6.11/net/ipv4/tcp_ipv4.c linux-2.6.11-ed/net/ipv4/tcp_ipv4.c
--- linux-2.6.11/net/ipv4/tcp_ipv4.c    2005-03-02 08:37:54.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_ipv4.c 2005-03-15 17:02:52.000000000 +0100
@@ -96,6 +96,8 @@
         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  };

+rwlock_t tcp_ehash_lock[TCP_EHASH_RWLOCK_SZ] ;
+
  /*
   * This array holds the first and last local port number.
   * For high-usage systems, use sysctl to change this to
@@ -104,16 +106,16 @@
  int sysctl_local_port_range[2] = { 1024, 4999 };
  int tcp_port_rover = 1024 - 1;

-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+static __inline__ unsigned int tcp_hashfn(__u32 laddr, __u16 lport,
                                  __u32 faddr, __u16 fport)
  {
-       int h = (laddr ^ lport) ^ (faddr ^ fport);
+       unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
         h ^= h >> 16;
         h ^= h >> 8;
         return h & (tcp_ehash_size - 1);
  }

-static __inline__ int tcp_sk_hashfn(struct sock *sk)
+static __inline__ unsigned int tcp_sk_hashfn(struct sock *sk)
  {
         struct inet_sock *inet = inet_sk(sk);
         __u32 laddr = inet->rcv_saddr;
@@ -360,7 +362,7 @@
                 tcp_listen_wlock();
         } else {
                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
-               lock = &tcp_ehash[sk->sk_hashent].lock;
+               lock = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
                 write_lock(lock);
         }
         __sk_add_node(sk, list);
@@ -391,9 +393,8 @@
                 tcp_listen_wlock();
                 lock = &tcp_lhash_lock;
         } else {
-               struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
-               lock = &head->lock;
-               write_lock_bh(&head->lock);
+               lock = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
+               write_lock_bh(lock);
         }

         if (__sk_del_node_init(sk))
@@ -492,9 +493,9 @@
         /* Optimize here for direct hit, only listening connections can
          * have wildcards anyways.
          */
-       int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+       unsigned int hash = tcp_hashfn(daddr, hnum, saddr, sport);
         head = &tcp_ehash[hash];
-       read_lock(&head->lock);
+       read_lock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
         sk_for_each(sk, node, &head->chain) {
                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
                         goto hit; /* You sunk my battleship! */
@@ -507,7 +508,7 @@
         }
         sk = NULL;
  out:
-       read_unlock(&head->lock);
+       read_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
         return sk;
  hit:
         sock_hold(sk);
@@ -555,13 +556,13 @@
         int dif = sk->sk_bound_dev_if;
         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
-       int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
+       unsigned int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
         struct sock *sk2;
         struct hlist_node *node;
         struct tcp_tw_bucket *tw;

-       write_lock(&head->lock);
+       write_lock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);

         /* Check TIME-WAIT sockets first. */
         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
@@ -616,7 +617,7 @@
         BUG_TRAP(sk_unhashed(sk));
         __sk_add_node(sk, &head->chain);
         sock_prot_inc_use(sk->sk_prot);
-       write_unlock(&head->lock);
+       write_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);

         if (twp) {
                 *twp = tw;
@@ -632,7 +633,7 @@
         return 0;

  not_unique:
-       write_unlock(&head->lock);
+       write_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
         return -EADDRNOTAVAIL;
  }

@@ -2224,7 +2225,7 @@
                 /* We can reschedule _before_ having picked the target: */
                 cond_resched_softirq();

-               read_lock(&tcp_ehash[st->bucket].lock);
+               read_lock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
                         if (sk->sk_family != st->family) {
                                 continue;
@@ -2241,7 +2242,7 @@
                         rc = tw;
                         goto out;
                 }
-               read_unlock(&tcp_ehash[st->bucket].lock);
+               read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
                 st->state = TCP_SEQ_STATE_ESTABLISHED;
         }
  out:
@@ -2268,14 +2269,14 @@
                         cur = tw;
                         goto out;
                 }
-               read_unlock(&tcp_ehash[st->bucket].lock);
+               read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
                 st->state = TCP_SEQ_STATE_ESTABLISHED;

                 /* We can reschedule between buckets: */
                 cond_resched_softirq();

                 if (++st->bucket < tcp_ehash_size) {
-                       read_lock(&tcp_ehash[st->bucket].lock);
+                       read_lock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
                         sk = sk_head(&tcp_ehash[st->bucket].chain);
                 } else {
                         cur = NULL;
@@ -2385,7 +2386,7 @@
         case TCP_SEQ_STATE_TIME_WAIT:
         case TCP_SEQ_STATE_ESTABLISHED:
                 if (v)
-                       read_unlock(&tcp_ehash[st->bucket].lock);
+                       read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
                 local_bh_enable();
                 break;
         }

diff -Nru linux-2.6.11/net/ipv4/tcp_minisocks.c linux-2.6.11-ed/net/ipv4/tcp_minisocks.c
--- linux-2.6.11/net/ipv4/tcp_minisocks.c       2005-03-02 08:38:17.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_minisocks.c    2005-03-15 15:21:05.000000000 +0100
@@ -66,14 +66,14 @@

         /* Unlink from established hashes. */
         ehead = &tcp_ehash[tw->tw_hashent];
-       write_lock(&ehead->lock);
+       write_lock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
         if (hlist_unhashed(&tw->tw_node)) {
-               write_unlock(&ehead->lock);
+               write_unlock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
                 return;
         }
         __hlist_del(&tw->tw_node);
         sk_node_init(&tw->tw_node);
-       write_unlock(&ehead->lock);
+       write_unlock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);

         /* Disassociate with bind bucket. */
         bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
@@ -297,6 +297,7 @@
  static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
  {
         struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
+       rwlock_t *rwp = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ] ;
         struct tcp_bind_hashbucket *bhead;

         /* Step 1: Put TW into bind hash. Original socket stays there too.
@@ -310,7 +311,7 @@
         tw_add_bind_node(tw, &tw->tw_tb->owners);
         spin_unlock(&bhead->lock);

-       write_lock(&ehead->lock);
+       write_lock(rwp);

         /* Step 2: Remove SK from established hash. */
         if (__sk_del_node_init(sk))
@@ -320,7 +321,7 @@
         tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
         atomic_inc(&tw->tw_refcnt);

-       write_unlock(&ehead->lock);
+       write_unlock(rwp);
  }

  /*

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] no more rwlock_t inside tcp_ehash_bucket
  2005-03-15 16:13 [PATCH] no more rwlock_t inside tcp_ehash_bucket Eric Dumazet
@ 2005-03-15 18:32 ` David S. Miller
  2005-03-16 10:47   ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet
  0 siblings, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-03-15 18:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Tue, 15 Mar 2005 17:13:11 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:
 
> I suggest in this patch using an array of 256 rwlocks.
> 
> - Less memory used (or 2 x more entries in hash table if size >= (2^MAX_ORDER)*PAGE_SIZE)
> - less memory touched during read_lock()/read_unlock()
> - Better sharing of hash entries between cpus

I'm generally in support of this change, however 2 suggestions:

1) Please allocate the rwlock table dynamically.  You can put
   #ifdef CONFIG_SMP around this stuff if you wish so that
   we don't do silly things like allocate a zero sized table
   on non-SMP builds.

   Perhaps you might want to similarly abstract out the rwlock
   acquisition, "tcp_ehash_lock(unsigned int slot)" and
   "tcp_ehash_unlock(unsigned int slot)".  It's just an idea.

2) With dynamic allocation, you can consider perhaps dynamic sizing
   based upon a) the ehash size b) NR_CPUS or some combination of
   those two.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-03-15 18:32 ` David S. Miller
@ 2005-03-16 10:47   ` Eric Dumazet
  2005-03-16 18:05     ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet
  2005-03-16 22:09     ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller
  0 siblings, 2 replies; 44+ messages in thread
From: Eric Dumazet @ 2005-03-16 10:47 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

After having hard times tuning /proc/sys/net/ipv4/route/* values, with various crashes of production machines,
I did some investigations...

The rt_check_expire() has a serious problem on machines with large route caches, and a standard HZ value of 1000.

With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ;

the loop count :

     for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;


overflows (t is a 31 bit value) as soon as rt_hash_log is >= 16  (65536 slots in route cache hash table)

Another problem is the fact that this function has close to 0 effect, because even if ip_rt_gc_interval is changed to 1 HZ,
only 1/300 of the table is scanned every second. And the loop breaks as soon as a jiffie is consumed.

We should adapt the loop count based on the actual number of entries in the route cache,
and eventually give more 'jiffies' in some pressure cases.

I am experimenting some changes that I will share when ready.


Thank you
Eric Dumazet

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures
  2005-03-16 10:47   ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet
@ 2005-03-16 18:05     ` Eric Dumazet
  2005-03-16 22:10       ` David S. Miller
  2005-03-16 22:09     ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller
  1 sibling, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-03-16 18:05 UTC (permalink / raw)
  To: David S. Miller, netdev

As peer_cachep uses kmem_cache_create("inet_peer_cache", ... SLAB_HWCACHE_ALIGN ...),
better try to use exactly 64 bytes instead of 72 for struct inet_peer.

This also means that this structure fits in one cache line on x86_64.

Thank you
Eric Dumazet



# diff -Nru linux-2.6.11/include/net/inetpeer.h linux-2.6.11-ed/include/net/inetpeer.h
--- linux-2.6.11/include/net/inetpeer.h 2005-03-02 08:37:48.000000000 +0100
+++ linux-2.6.11-ed/include/net/inetpeer.h      2005-03-16 18:52:49.000000000 +0100
@@ -19,9 +19,9 @@
  {
         struct inet_peer        *avl_left, *avl_right;
         struct inet_peer        *unused_next, **unused_prevp;
-       atomic_t                refcnt;
         unsigned long           dtime;          /* the time of last use of not
                                                  * referenced entries */
+       atomic_t                refcnt;
         __u32                   v4daddr;        /* peer's address */
         __u16                   avl_height;
         __u16                   ip_id_count;    /* IP ID for the next packet */

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-03-16 10:47   ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet
  2005-03-16 18:05     ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet
@ 2005-03-16 22:09     ` David S. Miller
  2005-03-17 19:52       ` Eric Dumazet
  1 sibling, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-03-16 22:09 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Wed, 16 Mar 2005 11:47:34 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> The rt_check_expire() has a serious problem on machines with large
> route caches, and a standard HZ value of 1000.
> 
> With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ;
> 
> the loop count :
> 
>      for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
> 
> 
> overflows (t is a 31 bit value) as soon rt_hash_log is >= 16  (65536
> slots in route cache hash table)
 ...
> I am experimenting some changes that I will share when ready.

Good catch, let us know when you have a patch for review.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures
  2005-03-16 18:05     ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet
@ 2005-03-16 22:10       ` David S. Miller
  0 siblings, 0 replies; 44+ messages in thread
From: David S. Miller @ 2005-03-16 22:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Wed, 16 Mar 2005 19:05:45 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> As peer_cachep uses kmem_cache_create("inet_peer_cache", ... SLAB_HWCACHE_ALIGN ...),
> better try to use exactly 64 bytes instead of 72 for struct inet_peer.
> 
> This also means that this structure fits in one cache line on x86_64.

Please resend the patch, your email client turned all the tab
characters into spaces so the patch will not apply.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-03-16 22:09     ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller
@ 2005-03-17 19:52       ` Eric Dumazet
  2005-04-01  6:13         ` David S. Miller
  0 siblings, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-03-17 19:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 1147 bytes --]

David S. Miller a écrit :
> On Wed, 16 Mar 2005 11:47:34 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> 
>>The rt_check_expire() has a serious problem on machines with large
>>route caches, and a standard HZ value of 1000.
>>
>>With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ;
>>
>>the loop count :
>>
>>     for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
>>
>>
>>overflows (t is a 31 bit value) as soon rt_hash_log is >= 16  (65536
>>slots in route cache hash table)
> 
>  ...
> 
>>I am experimenting some changes that I will share when ready.
> 
> 
> Good catch, let us know when you have a patch for review.
> 
> 

Hi

This is the patch I'm currently running on several machines (with very high pressure on route cache)

If you want to comment it before I submit it with a nice looking mail :)

Description :

    - Makes rt_check_expire() do some real work, auto-adapting the timer interval (chosen between gc_interval/8 and gc_interval)

    - Move the spinlocks out of rt_hash_table[] to a fixed size table : Saves a lot of memory (particularly on UP)

Thank you

Eric Dumazet

[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 10920 bytes --]

diff -Nru linux-2.6.11/net/ipv4/route.c linux-2.6.11-ed/net/ipv4/route.c
--- linux-2.6.11/net/ipv4/route.c	2005-03-17 11:19:45.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/route.c	2005-03-17 18:33:20.000000000 +0100
@@ -54,6 +54,7 @@
  *		Marc Boucher	:	routing by fwmark
  *	Robert Olsson		:	Added rt_cache statistics
  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -107,12 +108,13 @@
 #define IP_MAX_MTU	0xFFF0
 
 #define RT_GC_TIMEOUT (300*HZ)
+#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */
 
 static int ip_rt_min_delay		= 2 * HZ;
 static int ip_rt_max_delay		= 10 * HZ;
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
-static int ip_rt_gc_interval		= 60 * HZ;
+static int ip_rt_gc_interval		= RT_GC_INTERVAL;
 static int ip_rt_gc_min_interval	= HZ / 2;
 static int ip_rt_redirect_number	= 9;
 static int ip_rt_redirect_load		= HZ / 50;
@@ -124,6 +126,7 @@
 static int ip_rt_min_pmtu		= 512 + 20 + 20;
 static int ip_rt_min_advmss		= 256;
 static int ip_rt_secret_interval	= 10 * 60 * HZ;
+static int ip_rt_debug ;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
@@ -197,8 +200,24 @@
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of fixed size spinlocks
+ */
+# define RT_HASH_LOCK_SZ 256
+	static spinlock_t	rt_hash_lock[RT_HASH_LOCK_SZ];
+# define rt_hash_lock_addr(slot) &rt_hash_lock[slot & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init()	{ \
+		int i; \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			spin_lock_init(&rt_hash_lock[i]); \
+		}
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
 
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
@@ -470,7 +489,7 @@
 		rth->u.dst.expires;
 }
 
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 {
 	unsigned long age;
 	int ret = 0;
@@ -516,45 +535,84 @@
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	static unsigned int effective_interval = RT_GC_INTERVAL;
+	static unsigned int cached_gc_interval = RT_GC_INTERVAL;
+	unsigned int i = rover, t;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
+	unsigned int freed = 0 , t0;
+	u64 mult;
 
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
-		unsigned long tmo = ip_rt_gc_timeout;
-
+	if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */
+		cached_gc_interval = ip_rt_gc_interval;
+		effective_interval = cached_gc_interval;
+	}
+	/* computes the number of slots we should examin in this run */
+	mult = ((u64)effective_interval) << rt_hash_log;
+	do_div(mult, ip_rt_gc_timeout);
+	t = (unsigned int)mult;
+
+	if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/8) {
+		t <<= 1; /* be more aggressive */
+		if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/4) {
+			t <<= 1; /* be more aggressive */
+			if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/2) {
+				t <<= 1; /* be more aggressive */
+				now++; /* give us one more tick (time) to do our job */
+			}
+		}
+	}
+	if (t > rt_hash_mask) t = rt_hash_mask + 1;
+	t0 = t;
+	for ( ; t > 0; t -= 1) {
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
-
-		spin_lock(&rt_hash_table[i].lock);
-		while ((rth = *rthp) != NULL) {
-			if (rth->u.dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
+		if (*rthp) {
+			unsigned long tmo = ip_rt_gc_timeout;
+			spin_lock(rt_hash_lock_addr(i));
+			while ((rth = *rthp) != NULL) {
+				if (rth->u.dst.expires) {
+					/* Entry is expired even if it is in use */
+					if (time_before_eq(now, rth->u.dst.expires)) {
+						tmo >>= 1;
+						rthp = &rth->u.rt_next;
+						continue;
+					}
+				} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 					tmo >>= 1;
 					rthp = &rth->u.rt_next;
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.rt_next;
-				continue;
+	
+				/* Cleanup aged off entries. */
+				*rthp = rth->u.rt_next;
+				freed++;
+				rt_free(rth);
 			}
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->u.rt_next;
-			rt_free(rth);
+			spin_unlock(rt_hash_lock_addr(i));
 		}
-		spin_unlock(&rt_hash_table[i].lock);
-
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+	if (t) {
+		/* Not enough time to perform our job, try to adjust the timer.
+		 * Firing the timer sooner means less planned work.
+		 * We allow the timer to be 1/8 of the sysctl value.
+		 */
+		effective_interval = (effective_interval + cached_gc_interval/8)/2;
+	}
+	else {
+		/* We finished our job before time limit, try to increase the timer
+		 * The limit is the sysctl value.
+		 */
+		effective_interval = (effective_interval + cached_gc_interval)/2;
+	}
+	if (ip_rt_debug & 1)
+		printk(KERN_WARNING "rt_check_expire() : %u freed, t=%u/%u interval=%u ticks\n", freed, t, t0, effective_interval);
+	mod_timer(&rt_periodic_timer, jiffies + effective_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
@@ -570,11 +628,11 @@
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -704,7 +762,7 @@
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -715,7 +773,7 @@
 				rt_free(rth);
 				goal--;
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -792,7 +850,7 @@
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
 			/* Put it first */
@@ -813,7 +871,7 @@
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -854,7 +912,7 @@
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -895,7 +953,7 @@
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -962,7 +1020,7 @@
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -971,7 +1029,7 @@
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -2569,6 +2627,23 @@
 		.strategy	= &sysctl_jiffies,
 	},
 	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL_MS,
+		.procname	= "gc_interval_ms",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_ms_jiffies,
+		.strategy	= &sysctl_ms_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_DEBUG,
+		.procname	= "gc_debug",
+		.data		= &ip_rt_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
@@ -2718,7 +2793,7 @@
 
 int __init ip_rt_init(void)
 {
-	int i, order, goal, rc = 0;
+	int order, goal, rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
@@ -2766,14 +2841,12 @@
 	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
 		/* NOTHING */;
 
-	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
-		rt_hash_table[i].chain = NULL;
-	}
+	memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
 
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = rt_hash_mask;
+	ip_rt_max_size = rt_hash_mask * 16;
+	rt_hash_mask--;
 
 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
 	if (!rt_cache_stat)
diff -Nru linux-2.6.11/Documentation/filesystems/proc.txt linux-2.6.11-ed/Documentation/filesystems/proc.txt
--- linux-2.6.11/Documentation/filesystems/proc.txt	2005-03-17 18:42:54.000000000 +0100
+++ linux-2.6.11-ed/Documentation/filesystems/proc.txt	2005-03-17 18:42:14.000000000 +0100
@@ -1709,12 +1709,13 @@
 
 Writing to this file results in a flush of the routing cache.
 
-gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh
+gc_elasticity, gc_interval_ms, gc_min_interval_ms, gc_timeout, gc_thresh, gc_debug
 ---------------------------------------------------------------------
 
 Values to  control  the  frequency  and  behavior  of  the  garbage collection
 algorithm for the routing cache. gc_min_interval is deprecated and replaced
-by gc_min_interval_ms.
+by gc_min_interval_ms. gc_interval is deprecated and replaced by
+gc_interval_ms. gc_debug enables some printk()
 
 
 max_size
diff -Nru linux-2.6.11/include/linux/sysctl.h linux-2.6.11-ed/include/linux/sysctl.h
--- linux-2.6.11/include/linux/sysctl.h	2005-03-17 18:37:13.000000000 +0100
+++ linux-2.6.11-ed/include/linux/sysctl.h	2005-03-17 18:36:57.000000000 +0100
@@ -367,6 +367,8 @@
 	NET_IPV4_ROUTE_MIN_ADVMSS=17,
 	NET_IPV4_ROUTE_SECRET_INTERVAL=18,
 	NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19,
+	NET_IPV4_ROUTE_GC_INTERVAL_MS=20,
+	NET_IPV4_ROUTE_GC_DEBUG=21,
 };
 
 enum

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-03-17 19:52       ` Eric Dumazet
@ 2005-04-01  6:13         ` David S. Miller
  2005-04-01 14:39           ` Eric Dumazet
  0 siblings, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-04-01  6:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Thu, 17 Mar 2005 20:52:44 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

>     - Move the spinlocks out of rt_hash_table[] to a fixed size table : Saves a lot of memory (particularly on UP)

If spinlock_t is a zero sized structure on UP, how can this save memory
on UP? :-)

Anyways, I think perhaps you should dynamically allocate this lock table.
Otherwise it looks fine.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01  6:13         ` David S. Miller
@ 2005-04-01 14:39           ` Eric Dumazet
  2005-04-01 15:53             ` Robert Olsson
  2005-04-01 20:28             ` David S. Miller
  0 siblings, 2 replies; 44+ messages in thread
From: Eric Dumazet @ 2005-04-01 14:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

David S. Miller a écrit :
> On Thu, 17 Mar 2005 20:52:44 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> 
>>    - Move the spinlocks out of rt_hash_table[] to a fixed size table : Saves a lot of memory (particularly on UP)
> 
> 
> If spinlock_t is a zero sized structure on UP, how can this save memory
> on UP? :-)

Because I deleted the  __attribute__((__aligned__(8))) constraint on struct rt_hash_bucket.

So sizeof(struct rt_hash_bucket) is now 4 instead of 8 on 32 bits architectures.

May I remind you some people still use 32 bits CPU ? :-)

By the way I have an updated patch... surviving very serious loads.

> 
> Anyways, I think perhaps you should dynamically allocate this lock table.

Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ?

> Otherwise it looks fine.
> 
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 14:39           ` Eric Dumazet
@ 2005-04-01 15:53             ` Robert Olsson
  2005-04-01 16:34               ` Eric Dumazet
  2005-04-01 20:28             ` David S. Miller
  1 sibling, 1 reply; 44+ messages in thread
From: Robert Olsson @ 2005-04-01 15:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev


 Hello!

Eric Dumazet writes:
 > By the way I have an updated patch... surviving very serious loads.

 Did you check for performance changes too? From what I understand 
 we can add new lookup and cache miss in the fast packet path.

 > > Anyways, I think perhaps you should dynamically allocate this lock table.
 > 
 > Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ?

 IMO we should be careful with adding new complexity to the route hash.
 Also was this dynamic behavior gc_interval needed to fix the overflow?
 gc_interval is only sort of last resort timer.

						--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 15:53             ` Robert Olsson
@ 2005-04-01 16:34               ` Eric Dumazet
  2005-04-01 17:26                 ` Robert Olsson
  0 siblings, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-04-01 16:34 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David S. Miller, netdev

Robert Olsson a écrit :
>  Hello!
> 
>  Did you check for performance changes too? From what I understand 
>  we can add new lookup and cache miss in the fast packet path.

Performance is better because in case of stress (lot of incoming packets per second), the 1024 bytes of the locks are all in cache.
As the size of the hash is divided by a 2 factor, rt_check_expire() and/or rt_garbage_collect() have to touch less cache lines.

According to oprofile, an unpatched kernel was spending more than 15% of time in route.c routines, now I see ip_route_input() at 1.88%

> 
>  > > Anyways, I think perhaps you should dynamically allocate this lock table.
>  > 
>  > Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ?
> 
>  IMO we should be careful with adding new complexity the route hash.
>  Also was this dynamic behavior gc_interval needed to fix the overflow?

In my case yes, because I have huge route cache.

>  gc_interval is only sort of last resort timer.

Actually not : gc_interval controls the rt_check_expire() to clean the hash table after use.

All old enough entries can be deleted smoothly, on behalf of a timer tick (so network interrupts can still occur)

I found it was better to adjust gc_interval to 1 (to let it fire every second and examine 1/300 table slots, or more if the dynamic behavior 
triggers), and adjust params so that rt_garbage_collect() doesn't run at all : rt_garbage_collect() can take forever to complete, blocking 
network traffic.


Eric Dumazet

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 16:34               ` Eric Dumazet
@ 2005-04-01 17:26                 ` Robert Olsson
  0 siblings, 0 replies; 44+ messages in thread
From: Robert Olsson @ 2005-04-01 17:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Robert Olsson, David S. Miller, netdev


Eric Dumazet writes:

 > According to oprofile, an unpatched kernel was spending more than 15% of time in route.c routines, now I see ip_route_input() at 1.88%

 Would like to see absolute numbers for UP/SMP single flow and DoS to be
 confident.

 > I found it was better to adjust gc_interval to 1 (to let it fire every second and examine 1/300 table slots, or more if the dynamic behavior 
 > triggers), and adjust params so that rt_garbage_collect() doesn't run at all : rt_garbage_collect() can take forever to complete, blocking 
 > network traffic.

 I don't think you can depend on timer for GC solely. Timer tick is 
 eternity for todays packet rates. You can distribute the GC load by allowing 
 it to run more frequent this in combination with huge cache seems to be a
 very interesting approach given that you have memory.

 
						--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 14:39           ` Eric Dumazet
  2005-04-01 15:53             ` Robert Olsson
@ 2005-04-01 20:28             ` David S. Miller
  2005-04-01 21:05               ` Eric Dumazet
  1 sibling, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-04-01 20:28 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Fri, 01 Apr 2005 16:39:48 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> > If spinlock_t is a zero sized structure on UP, how can this save memory
> > on UP? :-)
> 
> Because I deleted the  __attribute__((__aligned__(8))) constraint on struct rt_hash_bucket.

Right.

> > Anyways, I think perhaps you should dynamically allocate this lock table.
> 
> Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ?

Even for NR_CPUS, I think the table should be dynamically allocated.

It is a goal to eliminate all of these huge arrays in the static
kernel image, which has grown incredibly too much in recent times.
I work often to eliminate such things, let's not add new ones :-)

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 20:28             ` David S. Miller
@ 2005-04-01 21:05               ` Eric Dumazet
  2005-04-01 21:08                 ` David S. Miller
  0 siblings, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-04-01 21:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

David S. Miller a écrit :
> On Fri, 01 Apr 2005 16:39:48 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 

>>Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ?
> 
> 
> Even for NR_CPUS, I think the table should be dynamically allocated.
> 
> It is a goal to eliminate all of these huge arrays in the static
> kernel image, which has grown incredibly too much in recent times.
> I work often to eliminate such things, let's not add new ones :-)

You mean you prefer :

static spinlock_t   *rt_hash_lock ; /* rt_hash_lock = alloc_memory_at_boot_time(...) */

instead of

static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ] ;

In both cases, memory is taken from lowmem, and size of kernel image is roughly the same (bss section takes no space in image)

Then the runtime cost is more expensive in the 'dynamic case' because of the extra indirection...

?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 21:05               ` Eric Dumazet
@ 2005-04-01 21:08                 ` David S. Miller
  2005-04-01 21:43                   ` Eric Dumazet
  0 siblings, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-04-01 21:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Fri, 01 Apr 2005 23:05:37 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> You mean you prefer :
> 
> static spinlock_t   *rt_hash_lock ; /* rt_hash_lock =
> alloc_memory_at_boot_time(...) */
> 
> instead of
> 
> static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ] ;
> 
> In both cases, memory is taken from lowmem, and size of kernel image
> is roughly the same (bss section takes no space in image)

In the former case the kernel image the bootloader has to
load is smaller.  That's important, believe it or not.  It
means less TLB entries need to be locked permanently into
the MMU on certain platforms.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 21:08                 ` David S. Miller
@ 2005-04-01 21:43                   ` Eric Dumazet
  2005-04-01 22:34                     ` David S. Miller
  0 siblings, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-04-01 21:43 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

David S. Miller a écrit :
> On Fri, 01 Apr 2005 23:05:37 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> 
>>You mean you prefer :
>>
>>static spinlock_t   *rt_hash_lock ; /* rt_hash_lock =
>>alloc_memory_at_boot_time(...) */
>>
>>instead of
>>
>>static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ] ;
>>
>>In both cases, memory is taken from lowmem, and size of kernel image
>>is roughly the same (bss section takes no space in image)
> 
> 
> In the former case the kernel image the bootloader has to
> load is smaller.  That's important, believe it or not.  It
> means less TLB entries need to be locked permanently into
> the MMU on certain platforms.
> 
> 
OK thanks for this clarification. I changed to :

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
  * The size of this table is a power of two and depends on the number of CPUS.
  */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ 4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ 2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ 1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ 512
#else
#define RT_HASH_LOCK_SZ 256
#endif

         static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                 int i; \
                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                         spin_lock_init(&rt_hash_locks[i]); \
                 }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif


Are you OK if I also use alloc_large_system_hash() to allocate rt_hash_table, instead of the current method ?
This new method is used in net/ipv4/tcp.c for tcp_ehash and tcp_bhash and permits NUMA tuning.


Eric

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 21:43                   ` Eric Dumazet
@ 2005-04-01 22:34                     ` David S. Miller
  2005-04-01 23:21                       ` Eric Dumazet
  0 siblings, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-04-01 22:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Fri, 01 Apr 2005 23:43:38 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Are you OK if I also use alloc_large_system_hash() to allocate
> rt_hash_table, instead of the current method ? This new method is used
> in net/ipv4/tcp.c for tcp_ehash and tcp_bhash and permits NUMA tuning.

Sure, that's fine.

BTW, please line-wrap your emails. :-/

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 22:34                     ` David S. Miller
@ 2005-04-01 23:21                       ` Eric Dumazet
  2005-04-01 23:54                         ` David S. Miller
  2005-04-02  8:21                         ` Herbert Xu
  0 siblings, 2 replies; 44+ messages in thread
From: Eric Dumazet @ 2005-04-01 23:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 904 bytes --]

David S. Miller a écrit :
> On Fri, 01 Apr 2005 23:43:38 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> 
>>Are you OK if I also use alloc_large_system_hash() to allocate
>>rt_hash_table, instead of the current method ? This new method is used
>>in net/ipv4/tcp.c for tcp_ehash and tcp_bhash and permits NUMA tuning.
> 
> 
> Sure, that's fine.
> 
> BTW, please line-wrap your emails. :-/
> 
> 

:-)

OK this patch includes everything...

  - Locking abstraction
  - rt_check_expire() fixes
  - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second
  - New gc_debug sysctl to let sysadmin tune gc
  - Less memory used by hash table (spinlocks moved to a smaller table)
  - sizing of spinlocks table depends on NR_CPUS
  - hash table allocated using alloc_large_system_hash() function
  - header fix for /proc/net/stat/rt_cache

Thank you
Eric



[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 14030 bytes --]

diff -Nru linux-2.6.11/net/ipv4/route.c linux-2.6.11-ed/net/ipv4/route.c
--- linux-2.6.11/net/ipv4/route.c	2005-03-02 08:38:38.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/route.c	2005-04-02 01:10:37.000000000 +0200
@@ -54,6 +54,8 @@
  *		Marc Boucher	:	routing by fwmark
  *	Robert Olsson		:	Added rt_cache statistics
  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
+ *				:	bugfix in rt_cpu_seq_show()
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -70,6 +72,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -107,12 +110,13 @@
 #define IP_MAX_MTU	0xFFF0
 
 #define RT_GC_TIMEOUT (300*HZ)
+#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */
 
 static int ip_rt_min_delay		= 2 * HZ;
 static int ip_rt_max_delay		= 10 * HZ;
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
-static int ip_rt_gc_interval		= 60 * HZ;
+static int ip_rt_gc_interval		= RT_GC_INTERVAL;
 static int ip_rt_gc_min_interval	= HZ / 2;
 static int ip_rt_redirect_number	= 9;
 static int ip_rt_redirect_load		= HZ / 50;
@@ -124,6 +128,7 @@
 static int ip_rt_min_pmtu		= 512 + 20 + 20;
 static int ip_rt_min_advmss		= 256;
 static int ip_rt_secret_interval	= 10 * 60 * HZ;
+static int ip_rt_debug;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
@@ -197,8 +202,38 @@
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
+ * The size of this table is a power of two and depends on the number of CPUS.
+ */
+#if NR_CPUS >= 32
+#define RT_HASH_LOCK_SZ	4096
+#elif NR_CPUS >= 16
+#define RT_HASH_LOCK_SZ	2048
+#elif NR_CPUS >= 8
+#define RT_HASH_LOCK_SZ	1024
+#elif NR_CPUS >= 4
+#define RT_HASH_LOCK_SZ	512
+#else
+#define RT_HASH_LOCK_SZ	256
+#endif
+
+	static spinlock_t	*rt_hash_locks;
+# define rt_hash_lock_addr(slot) &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init()	{ \
+		int i; \
+		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
+		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			spin_lock_init(&rt_hash_locks[i]); \
+		}
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
 
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
@@ -393,7 +428,7 @@
 	struct rt_cache_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "entries  in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 		return 0;
 	}
 	
@@ -470,7 +505,7 @@
 		rth->u.dst.expires;
 }
 
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 {
 	unsigned long age;
 	int ret = 0;
@@ -516,45 +551,93 @@
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	static unsigned int effective_interval = RT_GC_INTERVAL;
+	static unsigned int cached_gc_interval = RT_GC_INTERVAL;
+	unsigned int i, goal;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
+	unsigned int freed = 0 , t0;
+	u64 mult;
 
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
-		unsigned long tmo = ip_rt_gc_timeout;
-
+	if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */
+		cached_gc_interval = ip_rt_gc_interval;
+		effective_interval = cached_gc_interval;
+	}
+	/* Computes the number of slots we should examin in this run :
+	 * We want to perform a full scan every ip_rt_gc_timeout, and
+	 * the timer is started every 'effective_interval' ticks.
+	 * so goal = (number_of_slots) * (effective_interval / ip_rt_gc_timeout)
+	 */
+	mult = ((u64)effective_interval) << rt_hash_log;
+	do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+
+	i = atomic_read(&ipv4_dst_ops.entries) << 3;
+	if (i > ip_rt_max_size) {
+		goal <<= 1; /* be more aggressive */
+		i >>= 1;
+		if (i > ip_rt_max_size) {
+			goal <<= 1; /* be more aggressive */
+			i >>= 1;
+			if (i > ip_rt_max_size) {
+				goal <<= 1; /* be more aggressive */
+				now++; /* give us one more tick (time) to do our job */
+			}
+		}
+	}
+	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	t0 = goal;
+	i = rover;
+	for ( ; goal > 0; goal--) {
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
-
-		spin_lock(&rt_hash_table[i].lock);
-		while ((rth = *rthp) != NULL) {
-			if (rth->u.dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
+		if (*rthp) {
+			unsigned long tmo = ip_rt_gc_timeout;
+			spin_lock(rt_hash_lock_addr(i));
+			while ((rth = *rthp) != NULL) {
+				if (rth->u.dst.expires) {
+					/* Entry is expired even if it is in use */
+					if (time_before_eq(now, rth->u.dst.expires)) {
+						tmo >>= 1;
+						rthp = &rth->u.rt_next;
+						continue;
+					}
+				} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 					tmo >>= 1;
 					rthp = &rth->u.rt_next;
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.rt_next;
-				continue;
-			}
 
-			/* Cleanup aged off entries. */
-			*rthp = rth->u.rt_next;
-			rt_free(rth);
+				/* Cleanup aged off entries. */
+				*rthp = rth->u.rt_next;
+				freed++;
+				rt_free(rth);
+			}
+			spin_unlock(rt_hash_lock_addr(i));
 		}
-		spin_unlock(&rt_hash_table[i].lock);
-
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+	if (goal != 0) {
+		/* Not enough time to perform our job, try to adjust the timer.
+		 * Firing the timer sooner means less planned work.
+		 * We allow the timer to be 1/8 of the sysctl value.
+		 */
+		effective_interval = (effective_interval + cached_gc_interval/8)/2;
+	}
+	else {
+		/* We finished our job before time limit, try to increase the timer
+		 * The limit is the sysctl value, we use a weight of 3/1 to
+		 * increase slowly.
+		 */
+		effective_interval = (3*effective_interval + cached_gc_interval + 3)/4;
+	}
+	if (ip_rt_debug & 1)
+		printk(KERN_WARNING "rt_check_expire() : %u freed, goal=%u/%u, interval=%u ticks\n", freed, goal, t0, effective_interval);
+	mod_timer(&rt_periodic_timer, jiffies + effective_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
@@ -570,11 +653,11 @@
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -704,7 +787,7 @@
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -715,7 +798,7 @@
 				rt_free(rth);
 				goal--;
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -792,7 +875,7 @@
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
 			/* Put it first */
@@ -813,7 +896,7 @@
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -854,7 +937,7 @@
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -895,7 +978,7 @@
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -962,7 +1045,7 @@
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -971,7 +1054,7 @@
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -2569,6 +2652,23 @@
 		.strategy	= &sysctl_jiffies,
 	},
 	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL_MS,
+		.procname	= "gc_interval_ms",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_ms_jiffies,
+		.strategy	= &sysctl_ms_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_DEBUG,
+		.procname	= "gc_debug",
+		.data		= &ip_rt_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
@@ -2718,12 +2818,13 @@
 
 int __init ip_rt_init(void)
 {
-	int i, order, goal, rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
 
 #ifdef CONFIG_NET_CLS_ROUTE
+	{
+	int order;
 	for (order = 0;
 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
 		/* NOTHING */;
@@ -2731,6 +2832,7 @@
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
+	}
 #endif
 
 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -2741,39 +2843,24 @@
 	if (!ipv4_dst_ops.kmem_cachep)
 		panic("IP: failed to allocate ip_dst_cache\n");
 
-	goal = num_physpages >> (26 - PAGE_SHIFT);
-	if (rhash_entries)
-		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
-	for (order = 0; (1UL << order) < goal; order++)
-		/* NOTHING */;
-
-	do {
-		rt_hash_mask = (1UL << order) * PAGE_SIZE /
-			sizeof(struct rt_hash_bucket);
-		while (rt_hash_mask & (rt_hash_mask - 1))
-			rt_hash_mask--;
-		rt_hash_table = (struct rt_hash_bucket *)
-			__get_free_pages(GFP_ATOMIC, order);
-	} while (rt_hash_table == NULL && --order > 0);
-
-	if (!rt_hash_table)
-		panic("Failed to allocate IP route cache hash table\n");
-
-	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
-	       rt_hash_mask,
-	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
+	rt_hash_table = (struct rt_hash_bucket *)
+		alloc_large_system_hash("IP route cache",
+					sizeof(struct rt_hash_bucket),
+					rhash_entries,
+					(num_physpages >= 128 * 1024) ?
+						(27 - PAGE_SHIFT) :
+						(29 - PAGE_SHIFT),
+					HASH_HIGHMEM,
+					&rt_hash_log,
+					&rt_hash_mask,
+					0);
 
-	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
-		/* NOTHING */;
+	memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
 
+	ipv4_dst_ops.gc_thresh = rt_hash_mask;
+	ip_rt_max_size = rt_hash_mask * 16;
 	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
-		rt_hash_table[i].chain = NULL;
-	}
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
 	if (!rt_cache_stat)
@@ -2819,7 +2906,7 @@
 	xfrm_init();
 	xfrm4_init();
 #endif
-	return rc;
+	return 0;
 }
 
 EXPORT_SYMBOL(__ip_select_ident);
diff -Nru linux-2.6.11/Documentation/filesystems/proc.txt linux-2.6.11-ed/Documentation/filesystems/proc.txt
--- linux-2.6.11/Documentation/filesystems/proc.txt	2005-04-02 01:19:15.000000000 +0200
+++ linux-2.6.11-ed/Documentation/filesystems/proc.txt	2005-04-02 01:19:04.000000000 +0200
@@ -1709,12 +1709,13 @@
 
 Writing to this file results in a flush of the routing cache.
 
-gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh
+gc_elasticity, gc_interval_ms, gc_min_interval_ms, gc_timeout, gc_thresh, gc_debug
 ---------------------------------------------------------------------
 
 Values to  control  the  frequency  and  behavior  of  the  garbage collection
 algorithm for the routing cache. gc_min_interval is deprecated and replaced
-by gc_min_interval_ms.
+by gc_min_interval_ms. gc_interval is deprecated and replaced by
+gc_interval_ms. gc_debug enables some printk()
 
 
 max_size
diff -Nru linux-2.6.11/include/linux/sysctl.h linux-2.6.11-ed/include/linux/sysctl.h
--- linux-2.6.11/include/linux/sysctl.h	2005-03-02 08:38:10.000000000 +0100
+++ linux-2.6.11-ed/include/linux/sysctl.h	2005-04-02 00:43:11.000000000 +0200
@@ -367,6 +367,8 @@
 	NET_IPV4_ROUTE_MIN_ADVMSS=17,
 	NET_IPV4_ROUTE_SECRET_INTERVAL=18,
 	NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19,
+	NET_IPV4_ROUTE_GC_INTERVAL_MS=20,
+	NET_IPV4_ROUTE_GC_DEBUG=21,
 };
 
 enum

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 23:21                       ` Eric Dumazet
@ 2005-04-01 23:54                         ` David S. Miller
  2005-04-02  8:21                         ` Herbert Xu
  1 sibling, 0 replies; 44+ messages in thread
From: David S. Miller @ 2005-04-01 23:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Sat, 02 Apr 2005 01:21:49 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> OK this patch includes everything...
> 
>   - Locking abstraction
>   - rt_check_expire() fixes
>   - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second
>   - New gc_debug sysctl to let sysadmin tune gc
>   - Less memory used by hash table (spinlocks moved to a smaller table)
>   - sizing of spinlocks table depends on NR_CPUS
>   - hash table allocated using alloc_large_system_hash() function
>   - header fix for /proc/net/stat/rt_cache

Looks fine to me.

I'd like to see some feedback from folks like Robert Olsson
and co. before applying this, so let's allow the patch to
simmer over the weekend, ok? :-)

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-01 23:21                       ` Eric Dumazet
  2005-04-01 23:54                         ` David S. Miller
@ 2005-04-02  8:21                         ` Herbert Xu
  2005-04-02  9:21                           ` Eric Dumazet
  1 sibling, 1 reply; 44+ messages in thread
From: Herbert Xu @ 2005-04-02  8:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, Robert.Olsson

Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> OK this patch includes everything...
> 
>  - Locking abstraction
>  - rt_check_expire() fixes
>  - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second
>  - New gc_debug sysctl to let sysadmin tune gc
>  - Less memory used by hash table (spinlocks moved to a smaller table)
>  - sizing of spinlocks table depends on NR_CPUS
>  - hash table allocated using alloc_large_system_hash() function
>  - header fix for /proc/net/stat/rt_cache

This patch is doing too many things.  How about splitting it up?

For instance the spin lock stuff is pretty straightforward and
should be in its own patch.

The benefits of the GC changes are not obvious to me.  rt_check_expire
is simply meant to kill off old entries.  It's not really meant to be
used to free up entries when the table gets full.

rt_garbage_collect on the other hand is designed to free entries
when it is needed.  Eric raised the point that rt_garbage_collect
is pretty expensive.  So what about amortising its cost a bit more?

For instance, we can set a new threshold that's lower than gc_thresh
and perform GC on the chain being inserted in rt_intern_hash if we're
above that threshold.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02  8:21                         ` Herbert Xu
@ 2005-04-02  9:21                           ` Eric Dumazet
  2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
  2005-04-02 13:48                             ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson
  0 siblings, 2 replies; 44+ messages in thread
From: Eric Dumazet @ 2005-04-02  9:21 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev, Robert.Olsson

Herbert Xu a écrit :
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
>>OK this patch includes everything...
>>
>> - Locking abstraction
>> - rt_check_expire() fixes
>> - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second
>> - New gc_debug sysctl to let sysadmin tune gc
>> - Less memory used by hash table (spinlocks moved to a smaller table)
>> - sizing of spinlocks table depends on NR_CPUS
>> - hash table allocated using alloc_large_system_hash() function
>> - header fix for /proc/net/stat/rt_cache
> 
> 
> This patch is doing too many things.  How about splitting it up?
> 
> For instance the spin lock stuff is pretty straightforward and
> should be in its own patch.
> 
> The benefits of the GC changes are not obvious to me.  rt_check_expire
> is simply meant to kill off old entries.  It's not really meant to be
> used to free up entries when the table gets full.

Well, I began my work because of the overflow bug in rt_check_expire()...
Then I realize this function could not work as expected. On a loaded machine, one timer tick is 1 ms.
During this time, number of chains that are scanned is ridiculous.
With the standard timer of 60 second, fact is rt_check_expire() is useless.

> 
> rt_garbage_collect on the other hand is designed to free entries
> when it is needed.  Eric raised the point that rt_garbage_collect
> is pretty expensive.  So what about amortising its cost a bit more?

Yes. rt_garbage_collect() has serious problems. But this function is sooo complex I dont want to touch it and let experts do it if they 
want. But then one may think why we have two similar functions that are doing basically the same thing : garbage collection.

One of a production machine rtstat -i 1 output is :

rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|
  entries|  in_hit|in_slow_|in_slow_|in_no_ro|  in_brd|in_marti|in_marti| 
out_hit|out_slow|out_slow|gc_total|gc_ignor|gc_goal_|gc_dst_o|in_hlist|out_hlis|
         |        |     tot|      mc|     ute|        |  an_dst|  an_src|        |    _tot|     _mc|        |      ed|    miss| verflow| 
_search|t_search|
  2618087|   28581|    7673|       0|       0|       0|       0|       0|    1800|    1450|       0|       0|       0|       0|       0| 
37630|    4783|
  2618689|   25444|    4918|       0|       0|       0|       0|       0|    2051|    1699|       0|       0|       0|       0|       0| 
27741|    5461|
  2619369|   25000|    4567|       0|       0|       0|       0|       0|    1860|    1304|       0|       0|       0|       0|       0| 
26606|    4563|
  2618396|   24830|    4633|       0|       0|       0|       0|       0|    1959|    1492|       0|       0|       0|       0|       0| 
26643|    4930|


Without serious tuning, this machine could not handle this load, or even half of it.

Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die.

> 
> For instance, we can set a new threshold that's lower than gc_thresh
> and perform GC on the chain being inserted in rt_intern_hash if we're
> above that threshold.

We could also try to perform GC on L1_CACHE_SIZE/sizeof(struct rt_hash_bucket) chains, not only the 'current chain', to fully use the cache 
miss.

> 
> Cheers,

Thank you

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Get rid of rt_check_expire and rt_garbage_collect
  2005-04-02  9:21                           ` Eric Dumazet
@ 2005-04-02 11:23                             ` Herbert Xu
  2005-04-02 13:58                               ` Eric Dumazet
                                                 ` (2 more replies)
  2005-04-02 13:48                             ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson
  1 sibling, 3 replies; 44+ messages in thread
From: Herbert Xu @ 2005-04-02 11:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, Robert.Olsson, hadi

On Sat, Apr 02, 2005 at 11:21:30AM +0200, Eric Dumazet wrote:
> 
> Well, I began my work because of the overflow bug in rt_check_expire()...
> Then I realize this function could not work as expected. On a loaded 
> machine, one timer tick is 1 ms.
> During this time, number of chains that are scanned is ridiculous.
> With the standard timer of 60 second, fact is rt_check_expire() is useless.

I see.  What we've got here is a scalability problem with respect
to the number of hash buckets.  As the number of buckets increases,
the amount of work the timer GC has to perform increases proportionally.

Since the timer GC parameters are fixed, this will eventually break.

Rather than changing the timer GC so that it runs more often to keep
up with the large routing cache, we should get out of this by reducing
the amount of work we have to do.

Imagine an ideal balanced hash table with 2.6 million entries.  That
is, all incoming/outgoing packets belong to flows that are already in
the hash table.  Imagine also that there is no PMTU/link failure taking
place so all entries are valid forever.

In this state there is absolutely no need to execute the timer GC.

Let's remove one of those assumptions and allow there to be entries
which need to expire after a set period.

Instead of having the timer GC clean them up, we can move the expire
check to the place where the entries are used.  That is, we make
ip_route_input/ip_route_output/ipv4_dst_check check whether the
entry has expired.

On the face of it we're doing more work since every routing cache
hit will need to check the validity of the dst.  However, because
it's a single subtraction it is actually pretty cheap.  There is
also no additional cache miss compared to doing it in the timer
GC since we have to read the dst anyway.

Let's go one step further and make the routing cache come to life.
Now there are new entries coming in and we need to remove old ones
in order to make room for them.

That task is currently carried out by the timer GC in rt_check_expire
and on demand by rt_garbage_collect.  Either way we have to walk the
entire routing cache looking for entries to get rid of.

This is quite expensive when the routing cache is large.  However,
there is a better way.

The reason we keep a cap on the routing cache (for a given hash size)
is so that individual chains do not degenerate into long linked lists.

In other words, we don't really care about how many entries there are
in the routing cache.  But we do care about how long each hash chain
is.

So instead of walking the entire routing cache to keep the number of
entries down, what we should do is keep each hash chain as short as
possible.

Assuming that the hash function is good, this should achieve the
same end result.

Here is how it can be done: Every time a routing entry is inserted into
a hash chain, we perform GC on that chain unconditionally.

It might seem that we're doing more work again.  However, as before
because we're traversing the chain anyway, it is very cheap to perform
the GC operations which mainly involve the checks in rt_may_expire.

OK that's enough thinking and it's time to write some code to see
whether this is all bullshit :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02  9:21                           ` Eric Dumazet
  2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
@ 2005-04-02 13:48                             ` Robert Olsson
  2005-04-02 14:10                               ` Eric Dumazet
  2005-04-02 19:32                               ` Herbert Xu
  1 sibling, 2 replies; 44+ messages in thread
From: Robert Olsson @ 2005-04-02 13:48 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Herbert Xu, davem, netdev, Robert.Olsson


Eric Dumazet writes:

 > > This patch is doing too many things.  How about splitting it up?
 > > 
 > > For instance the spin lock stuff is pretty straightforward and
 > > should be in its own patch.

 Yes a good idea so it can be tested separately....

 > > The benefits of the GC changes are not obvious to me.  rt_check_expire
 > > is simply meant to kill off old entries.  It's not really meant to be
 > > used to free up entries when the table gets full.


 Agree with Herbert... 

 >   entries|  in_hit|in_slow_|in_slow_|in_no_ro|  in_brd|in_marti|in_marti| 
 > out_hit|out_slow|out_slow|gc_total|gc_ignor|gc_goal_|gc_dst_o|in_hlist|out_hlis|
 >          |        |     tot|      mc|     ute|        |  an_dst|  an_src|        |    _tot|     _mc|        |      ed|    miss| verflow| 
 > _search|t_search|
 >   2618087|   28581|    7673|       0|       0|       0|       0|       0|    1800|    1450|       0|       0|       0|       0|       0| 


 > Without serious tuning, this machine could not handle this load, or even half of it.
 
 Yes thats a pretty much load. Very short flows some reason? 
 What's your ip_rt_gc_min_interval? GC should be allowed to 
 run frequent to smoothen out the GC load. Also good idea 
 to decrease gc_thresh and your hash is really huge.

 
 > Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die.

 A good idea to increase the secret_interval interval but it should survive.

						--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: Get rid of rt_check_expire and rt_garbage_collect
  2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
@ 2005-04-02 13:58                               ` Eric Dumazet
  2005-04-02 14:03                               ` Robert Olsson
  2005-04-02 21:05                               ` jamal
  2 siblings, 0 replies; 44+ messages in thread
From: Eric Dumazet @ 2005-04-02 13:58 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev, Robert.Olsson, hadi

Herbert Xu a écrit :
> On Sat, Apr 02, 2005 at 11:21:30AM +0200, Eric Dumazet wrote:
> 
>>Well, I began my work because of the overflow bug in rt_check_expire()...
>>Then I realize this function could not work as expected. On a loaded 
>>machine, one timer tick is 1 ms.
>>During this time, number of chains that are scanned is ridiculous.
>>With the standard timer of 60 second, fact is rt_check_expire() is useless.
> 
> 
> I see.  What we've got here is a scalability problem with respect
> to the number of hash buckets.  As the number of buckets increases,
> the amount of work the timer GC has to perform increases proportionally.
> 
> Since the timer GC parameters are fixed, this will eventually break.
> 
> Rather than changing the timer GC so that it runs more often to keep
> up with the large routing cache, we should get out of this by reducing
> the amount of work we have to do.
> 
> Imagine an ideal balanced hash table with 2.6 million entries.  That
> is, all incoming/outgoing packets belong to flows that are already in
> the hash table.  Imagine also that there is no PMTU/link failure taking
> place so all entries are valid forever.
> 
> In this state there is absolutely no need to execute the timer GC.
> 
> Let's remove one of those assumptions and allow there to be entries
> which need to expire after a set period.
> 
> Instead of having the timer GC clean them up, we can move the expire
> check to the place where the entries are used.  That is, we make
> ip_route_input/ip_route_output/ipv4_dst_check check whether the
> entry has expired.
> 
> On the face of it we're doing more work since every routing cache
> hit will need to check the validity of the dst.  However, because
> it's a single subtraction it is actually pretty cheap.  There is
> also no additional cache miss compared to doing it in the timer
> GC since we have to read the dst anyway.
> 
> Let's go one step further and make the routing cache come to life.
> Now there are new entries coming in and we need to remove old ones
> in order to make room for them.
> 
> That task is currently carried out by the timer GC in rt_check_expire
> and on demand by rt_garbage_collect.  Either way we have to walk the
> entire routing cache looking for entries to get rid of.
> 
> This is quite expensive when the routing cache is large.  However,
> there is a better way.
> 
> The reason we keep a cap on the routing cache (for a given hash size)
> is so that individual chains do not degenerate into long linked lists.
> 
> In other words, we don't really care about how many entries there are
> in the routing cache.  But we do care about how long each hash chain
> is.
> 
> So instead of walking the entire routing cache to keep the number of
> entries down, what we should do is keep each hash chain as short as
> possible.
> 
> Assuming that the hash function is good, this should achieve the
> same end result.
> 
> Here is how it can be done: Every time a routing entry is inserted into
> a hash chain, we perform GC on that chain unconditionally.
> 
> It might seem that we're doing more work again.  However, as before
> because we're traversing the chain anyway, it is very cheap to perform
> the GC operations which mainly involve the checks in rt_may_expire.
> 
> OK that's enough thinking and it's time to write some code to see
> whether this is all bullshit :)
> 
> Cheers,

Well, it may work if you dont care about memory used.

# grep dst /proc/slabinfo
ip_dst_cache      2825575 2849590    384   10    1 : tunables   54   27    8 : slabdata 284959 284959      0

On this machine, route cache takes 1.1 GB of ram... impressive.

Then if the network load decreases (or completely stops), only a timer-driven gc could purge the cache.

So rt_check_expire() is *needed*

You are right saying that gc parameters are fixed, thus gc breaks at high load.


Eric

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Get rid of rt_check_expire and rt_garbage_collect
  2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
  2005-04-02 13:58                               ` Eric Dumazet
@ 2005-04-02 14:03                               ` Robert Olsson
  2005-04-02 21:05                               ` jamal
  2 siblings, 0 replies; 44+ messages in thread
From: Robert Olsson @ 2005-04-02 14:03 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Eric Dumazet, davem, netdev, Robert.Olsson, hadi


Herbert Xu writes:

 > Rather than changing the timer GC so that it runs more often to keep
 > up with the large routing cache, we should get out of this by reducing
 > the amount of work we have to do.

 Yeep.

 > Imagine an ideal balanced hash table with 2.6 million entries.  That
 > is, all incoming/outgoing packets belong to flows that are already in
 > the hash table.  Imagine also that there is no PMTU/link failure taking
 > place so all entries are valid forever.
 > 
 > In this state there is absolutely no need to execute the timer GC.
 
 > Let's remove one of those assumptions and allow there to be entries
 > which need to expire after a set period.
 > 
 > Instead of having the timer GC clean them up, we can move the expire
 > check to the place where the entries are used.  That is, we make
 > ip_route_input/ip_route_output/ipv4_dst_check check whether the
 > entry has expired.
 > 
 > On the face of it we're doing more work since every routing cache
 > hit will need to check the validity of the dst.  However, because
 > it's a single subtraction it is actually pretty cheap.  There is
 > also no additional cache miss compared to doing it in the timer
 > GC since we have to read the dst anyway.
 > 
 > Let's go one step further and make the routing cache come to life.
 > Now there are new entries coming in and we need to remove old ones
 > in order to make room for them.
 > 
 > That task is currently carried out by the timer GC in rt_check_expire
 > and on demand by rt_garbage_collect.  Either way we have to walk the
 > entire routing cache looking for entries to get rid of.
 > 
 > This is quite expensive when the routing cache is large.  However,
 > there is a better way.
 > 
 > The reason we keep a cap on the routing cache (for a given hash size)
 > is so that individual chains do not degenerate into long linked lists.
 > 
 > In other words, we don't really care about how many entries there are
 > in the routing cache.  But we do care about how long each hash chain
 > is.
 > 
 > So instead of walking the entire routing cache to keep the number of
 > entries down, what we should do is keep each hash chain as short as
 > possible.
 > 
 > Assuming that the hash function is good, this should achieve the
 > same end result.
 > 
 > Here is how it can be done: Every time a routing entry is inserted into
 > a hash chain, we perform GC on that chain unconditionally.
 > 
 > It might seem that we're doing more work again.  However, as before
 > because we're traversing the chain anyway, it is very cheap to perform
 > the GC operations which mainly involve the checks in rt_may_expire.


 Agree... It's very interesting and worth testing something like this.
 also it could clean up the GC process and the need for tuning which 
 would be very welcome.

					   --ro
 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 13:48                             ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson
@ 2005-04-02 14:10                               ` Eric Dumazet
  2005-04-02 14:46                                 ` Robert Olsson
  2005-04-02 20:47                                 ` jamal
  2005-04-02 19:32                               ` Herbert Xu
  1 sibling, 2 replies; 44+ messages in thread
From: Eric Dumazet @ 2005-04-02 14:10 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Herbert Xu, davem, netdev

Robert Olsson a écrit :
> Eric Dumazet writes:

>  Yes thats a pretty much load. Very short flows some reason?

Well... yes. This is a real server, not a DOS simulation.
1 million TCP flows, and about 3 million peers using UDP frames.

>  What's your ip_rt_gc_min_interval? GC should be allowed to 
>  run frequent to smoothen out the GC load. Also good idea 
>  to decrease gc_thresh and you hash is really huge.

No. As soon as I lower gc_thresh (and let gc running), the machine starts to drop connections and crash some seconds later.
I found I had to make the hash table very large (but lowering elasticity, ie chain length) .
It needs lot of ram, but at least CPU usage of net/ipv4/route.c is close to 0.

# grep . /proc/sys/net/ipv4/route/*
/proc/sys/net/ipv4/route/error_burst:5000
/proc/sys/net/ipv4/route/error_cost:1000
/proc/sys/net/ipv4/route/gc_elasticity:2
/proc/sys/net/ipv4/route/gc_interval:1
/proc/sys/net/ipv4/route/gc_min_interval:0
/proc/sys/net/ipv4/route/gc_min_interval_ms:500
/proc/sys/net/ipv4/route/gc_thresh:2900000
/proc/sys/net/ipv4/route/gc_timeout:155
/proc/sys/net/ipv4/route/max_delay:10
/proc/sys/net/ipv4/route/max_size:16777216
/proc/sys/net/ipv4/route/min_adv_mss:256
/proc/sys/net/ipv4/route/min_delay:2
/proc/sys/net/ipv4/route/min_pmtu:552
/proc/sys/net/ipv4/route/mtu_expires:600
/proc/sys/net/ipv4/route/redirect_load:20
/proc/sys/net/ipv4/route/redirect_number:9
/proc/sys/net/ipv4/route/redirect_silence:20480
/proc/sys/net/ipv4/route/secret_interval:36000

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 14:10                               ` Eric Dumazet
@ 2005-04-02 14:46                                 ` Robert Olsson
  2005-04-02 20:47                                 ` jamal
  1 sibling, 0 replies; 44+ messages in thread
From: Robert Olsson @ 2005-04-02 14:46 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Robert Olsson, Herbert Xu, davem, netdev


Eric Dumazet writes:

 > Well... yes. This is a real server, not a DOS simulation.
 > 1 million TCP flows, and about 3 million peers using UDP frames.

 I see.

 > >  What's your ip_rt_gc_min_interval? GC should be allowed to 
 > >  run frequent to smoothen out the GC load. Also good idea 
 > >  to decrease gc_thresh and you hash is really huge.

 > No. As soon as I lower gc_thresh (and let gc running), the machine starts to drop connections and crash some seconds later.
 > I found I had to make the hash table very large (but lowering elasticity, ie chain length) .
 > It needs lot of ram, but at least CPU usage of net/ipv4/route.c is close to 0.

 OK! Not so bad. Most of your GC likely happens in rt_intern_hash chain pruning.
 This way you keep hash-chains short and get "datadriven" GC. But there must be 
 bugs causing the crash...
 
 Maybe there should be an explicit control hash lengths not via elasticity 
 but adding even more tuning knobs hurts. :)

						--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 13:48                             ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson
  2005-04-02 14:10                               ` Eric Dumazet
@ 2005-04-02 19:32                               ` Herbert Xu
  2005-04-02 19:55                                 ` David S. Miller
  2005-04-03 19:36                                 ` Robert Olsson
  1 sibling, 2 replies; 44+ messages in thread
From: Herbert Xu @ 2005-04-02 19:32 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Eric Dumazet, davem, netdev

On Sat, Apr 02, 2005 at 03:48:32PM +0200, Robert Olsson wrote:
>  
>  > Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die.
> 
>  A good idea to increase the secret_interval interval but it should survive.

Incidentally we should change the way the rehashing is triggered.
Instead of doing it regularly, we can do it when we notice that a
specific hash chain grows beyond a certain size.

The idea is that if someone is attacking our hash then they can
only do so by lengthening the chains.  If they're not doing that
then even if they knew how to attack us we don't really care.

Of course when it does happen it'll still kill your machine unless
we can find a way to amortise this.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 19:32                               ` Herbert Xu
@ 2005-04-02 19:55                                 ` David S. Miller
  2005-04-03  7:43                                   ` Herbert Xu
  2005-04-03 19:36                                 ` Robert Olsson
  1 sibling, 1 reply; 44+ messages in thread
From: David S. Miller @ 2005-04-02 19:55 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Robert.Olsson, dada1, netdev

On Sun, 3 Apr 2005 05:32:24 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Sat, Apr 02, 2005 at 03:48:32PM +0200, Robert Olsson wrote:
> >  
> >  > Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die.
> > 
> >  A good idea to increase the secret_interval interval but it should survive.
> 
> Incidentally we should change the way the rehashing is triggered.
> Instead of doing it regularly, we can do it when we notice that a
> specific hash chain grows beyond a certain size.
> 
> The idea is that if someone is attacking our hash then they can
> only do so by lengthening the chains.  If they're not doing that
> then even if they knew how to attack us we don't really care.

Yes, the secret_interval is way too short.  It is a very paranoid
default value selected when initially fixing that DoS.

I think we should, in the short term, increase the secret interval
where it exists in the tree (netfilter conntrack is another instance
for example).

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 14:10                               ` Eric Dumazet
  2005-04-02 14:46                                 ` Robert Olsson
@ 2005-04-02 20:47                                 ` jamal
  1 sibling, 0 replies; 44+ messages in thread
From: jamal @ 2005-04-02 20:47 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Robert Olsson, Herbert Xu, David S. Miller, netdev

On Sat, 2005-04-02 at 09:10, Eric Dumazet wrote:
> Robert Olsson a écrit :
> > Eric Dumazet writes:
> 
> >  Yes thats a pretty much load. Very short flows some reason?
> 
> Well... yes. This is a real server, not a DOS simulation.
> 1 million TCP flows, and about 3 million peers using UDP frames.

SMP? How many processors?

cheers,
jamal

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: Get rid of rt_check_expire and rt_garbage_collect
  2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
  2005-04-02 13:58                               ` Eric Dumazet
  2005-04-02 14:03                               ` Robert Olsson
@ 2005-04-02 21:05                               ` jamal
  2005-04-03  7:38                                 ` Herbert Xu
  2005-04-03  7:41                                 ` Herbert Xu
  2 siblings, 2 replies; 44+ messages in thread
From: jamal @ 2005-04-02 21:05 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Eric Dumazet, David S. Miller, netdev, Robert Olsson

On Sat, 2005-04-02 at 06:23, Herbert Xu wrote:
> On Sat, Apr 02, 2005 at 11:21:30AM +0200, Eric Dumazet wrote:
> > 
> > Well, I began my work because of the overflow bug in rt_check_expire()...
> > Then I realize this function could not work as expected. On a loaded 
> > machine, one timer tick is 1 ms.
> > During this time, number of chains that are scanned is ridiculous.
> > With the standard timer of 60 second, fact is rt_check_expire() is useless.
> 
> I see.  What we've got here is a scalability problem with respect
> to the number of hash buckets.  As the number of buckets increases,
> the amount of work the timer GC has to perform increases proportionally.
> 

It's a classical incremental garbage collection algorithm that's being used,
i.e. something along what's typically referred to as mark-and-sweep.
Could the main issue be not the amount of routes in the cache but
rather the locking when number of CPUs go up?
Incrementing the timer frequency would certainly help but maybe have
adverse effects if the frequency is too high because of the across
system locking IMO.

> Since the timer GC parameters are fixed, this will eventually break.
> 
> Rather than changing the timer GC so that it runs more often to keep
> up with the large routing cache, we should get out of this by reducing
> the amount of work we have to do.
> 

Refer to my hint above: perhaps per CPU caches?

> Imagine an ideal balanced hash table with 2.6 million entries.  That
> is, all incoming/outgoing packets belong to flows that are already in
> the hash table.  Imagine also that there is no PMTU/link failure taking
> place so all entries are valid forever.
> 
>
> In this state there is absolutely no need to execute the timer GC.
> 

Yeah, but memory is finite friend. True, if you can imagine infinite
memory we would not need gc ;->

> Let's remove one of those assumptions and allow there to be entries

> which need to expire after a set period.

> Instead of having the timer GC clean them up, we can move the expire
> check to the place where the entries are used.  That is, we make
> ip_route_input/ip_route_output/ipv4_dst_check check whether the
> entry has expired.
> 

If you can show lock grabbing is the main contentious issue; i believe
it is as CPUs go up. Then this is a valuable idea since you are already
grabbing the locks anyways.

> On the face of it we're doing more work since every routing cache
> hit will need to check the validity of the dst.  However, because
> it's a single subtraction it is actually pretty cheap.  There is
> also no additional cache miss compared to doing it in the timer
> GC since we have to read the dst anyway.
> 

In the case of slower machine, the compute is also an issue. 
To be honest i feel like handwaving - experimenting and collecting
profiles would help nail it.

> Let's go one step further and make the routing cache come to life.
> Now there are new entries coming in and we need to remove old ones
> in order to make room for them.
> 
> That task is currently carried out by the timer GC in rt_check_expire
> and on demand by rt_garbage_collect.  Either way we have to walk the
> entire routing cache looking for entries to get rid of.
> 

we don't really do the whole route cache every time - I am sure you know
that.

> This is quite expensive when the routing cache is large.  However,
> there is a better way.
> 
> The reason we keep a cap on the routing cache (for a given hash size)
> is so that individual chains do not degenerate into long linked lists.
> 
> In other words, we don't really care about how many entries there are
> in the routing cache.  But we do care about how long each hash chain
> is.
> 
> So instead of walking the entire routing cache to keep the number of
> entries down, what we should do is keep each hash chain as short as
> possible.
> 

Thats certainly one solution .. reading on how you achive this ..

> Assuming that the hash function is good, this should achieve the
> same end result.
> 
> Here is how it can be done: Every time a routing entry is inserted into
> a hash chain, we perform GC on that chain unconditionally.
> 

May not be a good idea to do it unconditionally - in particular on SMP
where another CPU maybe spinning waiting for you to let go of bucket
lock. In particular if a burst of packets accessing the same bucket show
up on different processors, this would be aggravated.
You may wanna kick in this algorithm only when things start going past a
certain threshold.

> It might seem that we're doing more work again.  However, as before
> because we're traversing the chain anyway, it is very cheap to perform
> the GC operations which mainly involve the checks in rt_may_expire.
> 
> OK that's enough thinking and it's time to write some code to see
> whether this is all bullshit :)
> 

I think there are some good ideas in there; the bottleneck could be
perceived as one of either the locks are too expensive (clearly so in
SMP as number of CPUs go up) or the compute is taking too long (clearly
so in slower systems - but a general fact of life as well).
For the first issue, amortizing the lock grabbing via compute as you
suggest maybe of value or make per cpu caches.

cheers,
jamal

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: Get rid of rt_check_expire and rt_garbage_collect
  2005-04-02 21:05                               ` jamal
@ 2005-04-03  7:38                                 ` Herbert Xu
  2005-04-03  7:41                                 ` Herbert Xu
  1 sibling, 0 replies; 44+ messages in thread
From: Herbert Xu @ 2005-04-03  7:38 UTC (permalink / raw)
  To: jamal; +Cc: Eric Dumazet, David S. Miller, netdev, Robert Olsson

On Sat, Apr 02, 2005 at 04:05:55PM -0500, jamal wrote:
>
> > In this state there is absolutely no need to execute the timer GC.
> 
> Yeah, but memory is finite friend. True, if you can imagine infinite
> memory we would not need gc ;->

True.  However running the GC when you can't free most of the entries
is a waste of time.

On a busy system where the routing cache is near capacity and new
entries are coming in all the time, we should arrange it so that the
old entries are expired when entries are inserted.

Assuming the hash function is good, then as long as there is
a steady stream of entries coming in, the old entries will be
expired automatically.

Of course, we should not leave the systems that have experienced
a burst of flows at a disadvantage.  Indeed there is a rather
simple way of doing GC for them without having to do work that's
proportional to the number of hash chains in the routing cache.

The key is that the GC is only useful when the routing cache contains
enough entries that can be freed.  Let's say that if we can free more
than 1/3 of the entries then the GC should be run.  Of course you
can define this to be whatever you want.

So now the problem is to quickly determine whether there are enough
entries in the cache that can be freed.  What we can do is take a
leaf out of the politicians' book :)

We take a poll on a small sample of the routing cache.  That is,
we run the GC on a fixed number of chains, e.g., 256 chains.  After
that we tally the total number of entries and the number of entries
freed.

Since the hash function should be spreading entries throughout the
chains evenly, the ratio here can be extrapolated out to the entire
cache.

Therefore once the ratio exceeds the defined threshold, we perform
GC over the entire cache, preferably in a kernel thread.

If not then we'll simply let the GC roam along at the constant pace
of 256 chains.

The advantage of this is that the GC will free entries in the entire
table as soon as that becomes possible without having to do work
proportional to the number of chains in each GC interval.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: Get rid of rt_check_expire and rt_garbage_collect
  2005-04-02 21:05                               ` jamal
  2005-04-03  7:38                                 ` Herbert Xu
@ 2005-04-03  7:41                                 ` Herbert Xu
  1 sibling, 0 replies; 44+ messages in thread
From: Herbert Xu @ 2005-04-03  7:41 UTC (permalink / raw)
  To: jamal; +Cc: Eric Dumazet, David S. Miller, netdev, Robert Olsson

On Sat, Apr 02, 2005 at 04:05:55PM -0500, jamal wrote:
>
> > Here is how it can be done: Every time a routing entry is inserted into
> > a hash chain, we perform GC on that chain unconditionally.
> 
> May not be a good idea to do it unconditionally - in particular on SMP
> where another CPU maybe spinning waiting for you to let go of bucket
> lock. In particular if a burst of packets accessing the same bucket show
> up on different processors, this would be aggravated.
> You may wanna kick in this algorithm only when things start going past a
> certain threshold.

This isn't too bad because:

1. The fast path is lockless using RCU.
2. The number of locks exceeds the number of CPUs by some insane amount.
3. The cost of performing GC is really cheap, it's just a matter of
calling rt_may_expire.

Anyway, I agree that all of these ideas are simply fantasy until
we have some code.  So let me work on that and then we can let the
benchmarks do the talking :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 19:55                                 ` David S. Miller
@ 2005-04-03  7:43                                   ` Herbert Xu
  2005-04-03 19:57                                     ` Robert Olsson
  0 siblings, 1 reply; 44+ messages in thread
From: Herbert Xu @ 2005-04-03  7:43 UTC (permalink / raw)
  To: David S. Miller; +Cc: Robert.Olsson, dada1, netdev

On Sat, Apr 02, 2005 at 11:55:28AM -0800, David S. Miller wrote:
> 
> I think we should, in the short term, increase the secret interval
> where it exists in the tree (netfilter conntrack is another instance
> for example).

We could also move rt_cache_flush into a kernel thread.  When the
number of chains is large this function is really expensive for a
softirq handler.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-02 19:32                               ` Herbert Xu
  2005-04-02 19:55                                 ` David S. Miller
@ 2005-04-03 19:36                                 ` Robert Olsson
  2005-04-03 21:43                                   ` Herbert Xu
  1 sibling, 1 reply; 44+ messages in thread
From: Robert Olsson @ 2005-04-03 19:36 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Robert Olsson, Eric Dumazet, davem, netdev


Herbert Xu writes:

 > Incidentally we should change the way the rehashing is triggered.
 > Instead of doing it regularly, we can do it when we notice that a
 > specific hash chain grows beyond a certain size.

 > The idea is that if someone is attacking our hash then they can
 > only do so by lengthening the chains.  If they're not doing that
 > then even if they knew how to attack us we don't really care.


 Well I don't see how we detect the need for a rehash just by looking
 at the hash chains. What does the "lengthening" look like that is
 allowed to trigger a rehash?

 Agree with Dave that we can increase the interval to start with.


					  --ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-03  7:43                                   ` Herbert Xu
@ 2005-04-03 19:57                                     ` Robert Olsson
  2005-04-03 21:45                                       ` Herbert Xu
  0 siblings, 1 reply; 44+ messages in thread
From: Robert Olsson @ 2005-04-03 19:57 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, Robert.Olsson, dada1, netdev


Herbert Xu writes:

 > We could also move rt_cache_flush into a kernel thread.  When the
 > number of chains is large this function is really expensive for a
 > softirq handler.

 It can also be done via /proc and left to administrators to find 
 suitable policy. Kernel just provides the mechanism to rehash.


						--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-03 19:36                                 ` Robert Olsson
@ 2005-04-03 21:43                                   ` Herbert Xu
  2005-04-04 10:38                                     ` Robert Olsson
  0 siblings, 1 reply; 44+ messages in thread
From: Herbert Xu @ 2005-04-03 21:43 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Eric Dumazet, davem, netdev

On Sun, Apr 03, 2005 at 09:36:52PM +0200, Robert Olsson wrote:
> 
>  Well I don't see how we detect the need for rehash just be looking
>  at the hash chains. How does the the "lengthening" look like that
>  are allowed to trigger a rehash?

The only way to attack a hash is by exploiting collisions and
create one or more excessively long chains.

This can be detected as follows at each rt hash insertion.  If

(total number of entries in cache >> (hash length - user defined length)) <
current bucket length

is true, then we schedule a rehash/flush.

Hash length is the number of bits in the hash, i.e.,

1 << hash length == number of buckets

I'd suggest a default shift length of 3.  That is, if any individual
chain is growing beyond 8 times the average chain length then we've
got a problem.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-03 19:57                                     ` Robert Olsson
@ 2005-04-03 21:45                                       ` Herbert Xu
  2005-04-04 10:27                                         ` Robert Olsson
  0 siblings, 1 reply; 44+ messages in thread
From: Herbert Xu @ 2005-04-03 21:45 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David S. Miller, dada1, netdev

On Sun, Apr 03, 2005 at 09:57:08PM +0200, Robert Olsson wrote:
> 
> Herbert Xu writes:
> 
>  > We could also move rt_cache_flush into a kernel thread.  When the
>  > number of chains is large this function is really expensive for a
>  > softirq handler.
> 
>  It can also be done via /proc and left to administrators to find 
>  suitable policy. Kernel just provides the mechanism to rehash.

The reason I'm suggesting the move to a kernel thread is because
softirq context is not preemptible.

So doing a large amount of work in it when your table is big means
that a UP machine will freeze for a while.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-03 21:45                                       ` Herbert Xu
@ 2005-04-04 10:27                                         ` Robert Olsson
  2005-04-04 10:38                                           ` Herbert Xu
  0 siblings, 1 reply; 44+ messages in thread
From: Robert Olsson @ 2005-04-04 10:27 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Robert Olsson, David S. Miller, dada1, netdev


Herbert Xu writes:

 > The reason I'm suggesting the move to a kernel thread is because
 > softirq context is not preemptible.
 > 
 > So doing a large amount of work in it when your table is big means
 > that a UP machine will freeze for a while.

 The flush transient will also happen on UP... as I understand this.
 When we have changed rt_hash_rnd and therefore invalidated all current
 entries, it would be best to blackhole *all* traffic until all old entries
 are deleted, this to avoid transients.

						--ro


 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-03 21:43                                   ` Herbert Xu
@ 2005-04-04 10:38                                     ` Robert Olsson
  2005-04-04 10:48                                       ` Herbert Xu
  0 siblings, 1 reply; 44+ messages in thread
From: Robert Olsson @ 2005-04-04 10:38 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Robert Olsson, Eric Dumazet, davem, netdev


Herbert Xu writes:

 > The only way to attack a hash is by exploiting collisions and
 > create one or more excessively long chains.
 > 
 > This can be detected as follows at each rt hash insertion.  If
 > 
 > (total number of entries in cache >> (hash length - user defined length)) <
 > current bucket length
 > 
 > is true, then we schedule a rehash/flush.
 > 
 > Hash length is the number of bits in the hash, i.e.,
 > 
 > 1 << hash length == number of buckets
 > 
 > I'd suggest a default shift length of 3.  That is, if any individual
 > chain is growing beyond 8 times the average chain length then we've
 > got a problem.

 This is likely to happen in rt_intern_hash? I don't see how this can
 get along with chain-pruning there?

 IMO the thoughts of extending in-flow GC etc are interesting and can
 hopefully give us more robust performance.

					--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-04 10:27                                         ` Robert Olsson
@ 2005-04-04 10:38                                           ` Herbert Xu
  2005-04-04 12:29                                             ` Robert Olsson
  0 siblings, 1 reply; 44+ messages in thread
From: Herbert Xu @ 2005-04-04 10:38 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David S. Miller, dada1, netdev

On Mon, Apr 04, 2005 at 12:27:43PM +0200, Robert Olsson wrote:
> 
>  The flush transient will happen also on UP... as I understand this 
>  When we have changed the rt_hash_rnd and therefore invalidated all current 
>  entries it would be best to blackhole *all* traffic until all old entries 
>  are deleted this to avoid transients.

That's nasty because if you have a large cache like Eric, then you'll
be dropping packets for quite a while :)

Actually, what's so bad about seeing transients? One cost that
I can see is that you'll be walking a chain only to conclude that
none of the entries might match.  But this is pretty cheap as long as
we keep the chain lengths short.

The other cost is that we might be creating an entry that gets flushed
straight away.  However, that's no worse than not using the cache at
all since in that case we'll be creating one entry for each packet
anyway.

Both of these can be avoided too if we really cared.  All we need
is one bit per chain that indicated whether it's been flushed.  So
when ip_route_* hits a chain that hasn't been flushed, it could

1) Skip the lookup step.
2) Create the rt entry as usual.
3) Flush the chain while we insert the entry and set the bit.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-04 10:38                                     ` Robert Olsson
@ 2005-04-04 10:48                                       ` Herbert Xu
  2005-04-04 13:17                                         ` Robert Olsson
  0 siblings, 1 reply; 44+ messages in thread
From: Herbert Xu @ 2005-04-04 10:48 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Eric Dumazet, davem, netdev

On Mon, Apr 04, 2005 at 12:38:03PM +0200, Robert Olsson wrote:
> 
>  This is likely to happen in rt_intern_hash? I don't see how this can
>  get along with chain-pruning there?

What I'm trying to catch is the case when you've got x number of
entries in the table and a large fraction of them are all in one
chain.

This does not conflict with the goal of keeping the chains short.

Even if you strictly allow only 8 entries per chain, it's trivial
to exceed 8 times the average chain length.

Remember the average chain length can be fractions like 0.1.  Of
course we need to set a minimum value that the chain needs to
grow beyond before this check kicks in.

>  IMO the thoughts of extending in-flow GC etc are interesting and can
>  hopefully give us more robust performance.

Indeed, it looks like Alexey has already put the code there.  It just
needs to be made more strict :) It needs to free entries even if they
are in use.

After all, freeing an entry in use can't be much worse than not having
a cache at all.  OTOH, having a very long chain is definitely much worse
than not having a cache :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-04 10:38                                           ` Herbert Xu
@ 2005-04-04 12:29                                             ` Robert Olsson
  0 siblings, 0 replies; 44+ messages in thread
From: Robert Olsson @ 2005-04-04 12:29 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Robert Olsson, David S. Miller, dada1, netdev


Herbert Xu writes:

 > That's nasty because if you have a large cache like Eric, then you'll
 > be dropping packets for quite a while :)
 > 
 > Actually, what's so bad about seeing transients? One cost that
 > I can see is that you'll be walking a chain only to conclude that
 > none of the entries might match.  But this is pretty cheap as long as
 > we keep the chain lengths short.
 > 
 > The other cost is that we might be creating an entry that gets flushed
 > straight away.  However, that's no worse than not using the cache at
 > all since in that case we'll be creating one entry for each packet
 > anyway.

 Maybe you're right and systems seems to survive. But the transit period
 should be as short as possible. 

 > Both of these can be avoided too if we really cared.  All we need
 > is one bit per chain that indicated whether it's been flushed.  So
 > when ip_route_* hits a chain that hasn't been flushed, it could
 > 
 > 1) Skip the lookup step.
 > 2) Create the rt entry as usual.
 > 3) Flush the chain while we insert the entry and set the bit.

 Yes better was thinking of something like this too.


					   --ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
  2005-04-04 10:48                                       ` Herbert Xu
@ 2005-04-04 13:17                                         ` Robert Olsson
  0 siblings, 0 replies; 44+ messages in thread
From: Robert Olsson @ 2005-04-04 13:17 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Robert Olsson, Eric Dumazet, davem, netdev


Herbert Xu writes:

 > What I'm trying to catch is the case when you've got x number of
 > entries in the table and a large fraction of them are all in one
 > chain.
 > 
 > This does not conflict with the goal of keeping the chains short.
 > 
 > Even if you strictly allow only 8 entries per chain, it's trivial
 > to exceed 8 times the average chain length.

 OK! Since deletions don't happen instantly...
 
 Try some code; it can print a warning to start with.


 > >  IMO the thoughts of extending in-flow GC etc are interesting and can
 > >  hopefully give us more robust performance.
 > 
 > Indeed, it looks like Alexey has already put the code there.  It just
 > needs to be made more strict :) It needs to free entries even if they
 > are in use.
 > 
 > After all, freeing an entry in use can't be much worse than not having
 > a cache at all.  OTOH, having a very long chain is definitely much worse
 > than not having a cache :)

 FYI I'm experimenting with "new" routing algo that does 24-bit ipv4 lookup 
 and routing without the route hash to see if we can come close to route hash
 performance. This needs memory. :)

 IP: FIB routing table of 16777216 buckets, 65536Kbytes for table id=255

 Needs some more work before testing.

						--ro

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2005-04-04 13:17 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-03-15 16:13 [PATCH] no more rwlock_t inside tcp_ehash_bucket Eric Dumazet
2005-03-15 18:32 ` David S. Miller
2005-03-16 10:47   ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet
2005-03-16 18:05     ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet
2005-03-16 22:10       ` David S. Miller
2005-03-16 22:09     ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller
2005-03-17 19:52       ` Eric Dumazet
2005-04-01  6:13         ` David S. Miller
2005-04-01 14:39           ` Eric Dumazet
2005-04-01 15:53             ` Robert Olsson
2005-04-01 16:34               ` Eric Dumazet
2005-04-01 17:26                 ` Robert Olsson
2005-04-01 20:28             ` David S. Miller
2005-04-01 21:05               ` Eric Dumazet
2005-04-01 21:08                 ` David S. Miller
2005-04-01 21:43                   ` Eric Dumazet
2005-04-01 22:34                     ` David S. Miller
2005-04-01 23:21                       ` Eric Dumazet
2005-04-01 23:54                         ` David S. Miller
2005-04-02  8:21                         ` Herbert Xu
2005-04-02  9:21                           ` Eric Dumazet
2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
2005-04-02 13:58                               ` Eric Dumazet
2005-04-02 14:03                               ` Robert Olsson
2005-04-02 21:05                               ` jamal
2005-04-03  7:38                                 ` Herbert Xu
2005-04-03  7:41                                 ` Herbert Xu
2005-04-02 13:48                             ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson
2005-04-02 14:10                               ` Eric Dumazet
2005-04-02 14:46                                 ` Robert Olsson
2005-04-02 20:47                                 ` jamal
2005-04-02 19:32                               ` Herbert Xu
2005-04-02 19:55                                 ` David S. Miller
2005-04-03  7:43                                   ` Herbert Xu
2005-04-03 19:57                                     ` Robert Olsson
2005-04-03 21:45                                       ` Herbert Xu
2005-04-04 10:27                                         ` Robert Olsson
2005-04-04 10:38                                           ` Herbert Xu
2005-04-04 12:29                                             ` Robert Olsson
2005-04-03 19:36                                 ` Robert Olsson
2005-04-03 21:43                                   ` Herbert Xu
2005-04-04 10:38                                     ` Robert Olsson
2005-04-04 10:48                                       ` Herbert Xu
2005-04-04 13:17                                         ` Robert Olsson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).