* [PATCH] no more rwlock_t inside tcp_ehash_bucket
@ 2005-03-15 16:13 Eric Dumazet
2005-03-15 18:32 ` David S. Miller
0 siblings, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2005-03-15 16:13 UTC (permalink / raw)
To: David S. Miller, netdev
A machine with large tcp_ehash uses half the size of the hash to store rwlocks.
That does not seem really necessary:
- Most accesses to tcp_ehash[] are done with read_locks(), so dirtying the memory (because of embedded rwlocks) all over the hash table is not
SMP friendly. The cache hit given by the fact that the rwlock and chain were in the same cache line is not really a gain, because of the
dirtying of the cache line (and exclusive use because of lock operations)
I suggest in this patch using an array of 256 rwlocks.
- Less memory used (or 2 x more entries in hash table if size >= (2^MAX_ORDER)*PAGE_SIZE)
- less memory touched during read_lock()/read_unlock()
- Better sharing of hash entries between cpus
tests done on a production machine (1 million slots, size 8MB). No differences found in performance (oprofile).
6 files changed :
- net/ipv4/tcp_diag.c
- net/ipv4/tcp_minisocks.c
- net/ipv4/tcp_ipv4.c
- net/ipv4/tcp.c
- include/net/tcp.h
- include/net/sock.h
(I also converted __tcp_bhash_size, __tcp_ehash_size and sk_hashent to 'unsigned int' to let the compiler choose better code)
Eric Dumazet
# diff -Nru linux-2.6.11 linux-2.6.11-ed
diff -Nru linux-2.6.11/include/net/sock.h linux-2.6.11-ed/include/net/sock.h
--- linux-2.6.11/include/net/sock.h 2005-03-02 08:38:17.000000000 +0100
+++ linux-2.6.11-ed/include/net/sock.h 2005-03-15 17:02:52.000000000 +0100
@@ -217,7 +217,7 @@
unsigned char sk_no_largesend;
int sk_route_caps;
unsigned long sk_lingertime;
- int sk_hashent;
+ unsigned int sk_hashent;
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
diff -Nru linux-2.6.11/include/net/tcp.h linux-2.6.11-ed/include/net/tcp.h
--- linux-2.6.11/include/net/tcp.h 2005-03-02 08:37:48.000000000 +0100
+++ linux-2.6.11-ed/include/net/tcp.h 2005-03-15 15:06:31.000000000 +0100
@@ -44,9 +44,14 @@
* for the rest. I'll experiment with dynamic table growth later.
*/
struct tcp_ehash_bucket {
- rwlock_t lock;
struct hlist_head chain;
-} __attribute__((__aligned__(8)));
+} ;
+
+/*
+ * instead of using one rwlock_t for each ehash_bucket, use a table of 256 rwlock_t
+ */
+#define TCP_EHASH_RWLOCK_SZ 256
+extern rwlock_t tcp_ehash_lock[TCP_EHASH_RWLOCK_SZ] ;
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
@@ -106,6 +111,7 @@
return hlist_empty(&head->chain) ? NULL : __tb_head(head);
}
+
extern struct tcp_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
@@ -122,8 +128,9 @@
*/
struct tcp_bind_hashbucket *__tcp_bhash;
- int __tcp_bhash_size;
- int __tcp_ehash_size;
+ unsigned int __tcp_bhash_size;
+ unsigned int __tcp_ehash_size;
+
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
diff -Nru linux-2.6.11/net/ipv4/tcp.c linux-2.6.11-ed/net/ipv4/tcp.c
--- linux-2.6.11/net/ipv4/tcp.c 2005-03-15 14:50:53.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp.c 2005-03-15 15:08:28.000000000 +0100
@@ -2309,10 +2309,10 @@
0);
tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
for (i = 0; i < (tcp_ehash_size << 1); i++) {
- rwlock_init(&tcp_ehash[i].lock);
INIT_HLIST_HEAD(&tcp_ehash[i].chain);
}
-
+ for (i = 0; i < TCP_EHASH_RWLOCK_SZ; i++)
+ rwlock_init(&tcp_ehash_lock[i]);
tcp_bhash = (struct tcp_bind_hashbucket *)
alloc_large_system_hash("TCP bind",
sizeof(struct tcp_bind_hashbucket),
diff -Nru linux-2.6.11/net/ipv4/tcp_diag.c linux-2.6.11-ed/net/ipv4/tcp_diag.c
--- linux-2.6.11/net/ipv4/tcp_diag.c 2005-03-02 08:37:31.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_diag.c 2005-03-15 15:22:30.000000000 +0100
@@ -666,7 +666,7 @@
if (i > s_i)
s_num = 0;
- read_lock_bh(&head->lock);
+ read_lock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
num = 0;
sk_for_each(sk, node, &head->chain) {
@@ -682,7 +682,7 @@
if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
goto next_normal;
if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
goto done;
}
next_normal:
@@ -703,14 +703,14 @@
r->id.tcpdiag_dport)
goto next_dying;
if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
goto done;
}
next_dying:
++num;
}
}
- read_unlock_bh(&head->lock);
+ read_unlock_bh(&tcp_ehash_lock[i%TCP_EHASH_RWLOCK_SZ]);
}
done:
diff -Nru linux-2.6.11/net/ipv4/tcp_ipv4.c linux-2.6.11-ed/net/ipv4/tcp_ipv4.c
--- linux-2.6.11/net/ipv4/tcp_ipv4.c 2005-03-02 08:37:54.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_ipv4.c 2005-03-15 17:02:52.000000000 +0100
@@ -96,6 +96,8 @@
.__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
};
+rwlock_t tcp_ehash_lock[TCP_EHASH_RWLOCK_SZ] ;
+
/*
* This array holds the first and last local port number.
* For high-usage systems, use sysctl to change this to
@@ -104,16 +106,16 @@
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+static __inline__ unsigned int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
{
- int h = (laddr ^ lport) ^ (faddr ^ fport);
+ unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
h ^= h >> 16;
h ^= h >> 8;
return h & (tcp_ehash_size - 1);
}
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
+static __inline__ unsigned int tcp_sk_hashfn(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
__u32 laddr = inet->rcv_saddr;
@@ -360,7 +362,7 @@
tcp_listen_wlock();
} else {
list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
+ lock = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
write_lock(lock);
}
__sk_add_node(sk, list);
@@ -391,9 +393,8 @@
tcp_listen_wlock();
lock = &tcp_lhash_lock;
} else {
- struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
- lock = &head->lock;
- write_lock_bh(&head->lock);
+ lock = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ];
+ write_lock_bh(lock);
}
if (__sk_del_node_init(sk))
@@ -492,9 +493,9 @@
/* Optimize here for direct hit, only listening connections can
* have wildcards anyways.
*/
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ unsigned int hash = tcp_hashfn(daddr, hnum, saddr, sport);
head = &tcp_ehash[hash];
- read_lock(&head->lock);
+ read_lock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
sk_for_each(sk, node, &head->chain) {
if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
@@ -507,7 +508,7 @@
}
sk = NULL;
out:
- read_unlock(&head->lock);
+ read_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
return sk;
hit:
sock_hold(sk);
@@ -555,13 +556,13 @@
int dif = sk->sk_bound_dev_if;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
+ unsigned int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
struct tcp_ehash_bucket *head = &tcp_ehash[hash];
struct sock *sk2;
struct hlist_node *node;
struct tcp_tw_bucket *tw;
- write_lock(&head->lock);
+ write_lock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
@@ -616,7 +617,7 @@
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
if (twp) {
*twp = tw;
@@ -632,7 +633,7 @@
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(&tcp_ehash_lock[hash%TCP_EHASH_RWLOCK_SZ]);
return -EADDRNOTAVAIL;
}
@@ -2224,7 +2225,7 @@
/* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- read_lock(&tcp_ehash[st->bucket].lock);
+ read_lock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
@@ -2241,7 +2242,7 @@
rc = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2268,14 +2269,14 @@
cur = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* We can reschedule between buckets: */
cond_resched_softirq();
if (++st->bucket < tcp_ehash_size) {
- read_lock(&tcp_ehash[st->bucket].lock);
+ read_lock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
sk = sk_head(&tcp_ehash[st->bucket].chain);
} else {
cur = NULL;
@@ -2385,7 +2386,7 @@
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_ehash_lock[st->bucket%TCP_EHASH_RWLOCK_SZ]);
local_bh_enable();
break;
}
diff -Nru linux-2.6.11/net/ipv4/tcp_minisocks.c linux-2.6.11-ed/net/ipv4/tcp_minisocks.c
--- linux-2.6.11/net/ipv4/tcp_minisocks.c 2005-03-02 08:38:17.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/tcp_minisocks.c 2005-03-15 15:21:05.000000000 +0100
@@ -66,14 +66,14 @@
/* Unlink from established hashes. */
ehead = &tcp_ehash[tw->tw_hashent];
- write_lock(&ehead->lock);
+ write_lock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ write_unlock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
return;
}
__hlist_del(&tw->tw_node);
sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ write_unlock(&tcp_ehash_lock[tw->tw_hashent%TCP_EHASH_RWLOCK_SZ]);
/* Disassociate with bind bucket. */
bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
@@ -297,6 +297,7 @@
static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
+ rwlock_t *rwp = &tcp_ehash_lock[sk->sk_hashent%TCP_EHASH_RWLOCK_SZ] ;
struct tcp_bind_hashbucket *bhead;
/* Step 1: Put TW into bind hash. Original socket stays there too.
@@ -310,7 +311,7 @@
tw_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ write_lock(rwp);
/* Step 2: Remove SK from established hash. */
if (__sk_del_node_init(sk))
@@ -320,7 +321,7 @@
tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
atomic_inc(&tw->tw_refcnt);
- write_unlock(&ehead->lock);
+ write_unlock(rwp);
}
/*
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH] no more rwlock_t inside tcp_ehash_bucket 2005-03-15 16:13 [PATCH] no more rwlock_t inside tcp_ehash_bucket Eric Dumazet @ 2005-03-15 18:32 ` David S. Miller 2005-03-16 10:47 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet 0 siblings, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-03-15 18:32 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Tue, 15 Mar 2005 17:13:11 +0100 Eric Dumazet <dada1@cosmosbay.com> wrote: > I suggest in this patch using an array of 256 rwlocks. > > - Less memory used (or 2 x more entries in hash table if size >= (2^MAX_ORDER)*PAGE_SIZE) > - less memory touched during read_lock()/read_unlock() > - Better sharing of hash entries between cpus I'm generally in support of this change, however 2 suggestions: 1) Please allocate the rwlock table dynamically. You can put #ifdef CONFIG_SMP around this stuff if you wish so that we don't do silly things like allocate a zero sized table on non-SMP builds. Perhaps you might want to similarly abstract out the rwlock acquisition, "tcp_ehash_lock(unsigned int slot)" and "tcp_ehash_unlock(unsigned int slot)". It's just an idea. 2) With dynamic allocation, you can consider perhaps dynamic sizing based upon a) the ehash size b) NR_CPUS or some combination of those two. ^ permalink raw reply [flat|nested] 44+ messages in thread
* [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-03-15 18:32 ` David S. Miller @ 2005-03-16 10:47 ` Eric Dumazet 2005-03-16 18:05 ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet 2005-03-16 22:09 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller 0 siblings, 2 replies; 44+ messages in thread From: Eric Dumazet @ 2005-03-16 10:47 UTC (permalink / raw) To: David S. Miller; +Cc: netdev After having hard times tuning /proc/sys/net/ipv4/route/* values, with various crashes of production machines, I did some investigations... The rt_check_expire() has a serious problem on machines with large route caches, and a standard HZ value of 1000. With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ; the loop count : for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; overflows (t is a 31 bit value) as soon rt_hash_log is >= 16 (65536 slots in route cache hash table) Another problem is the fact that this function has close to 0 effect, because even if ip_rt_gc_interval is changed to 1 HZ, only 1/300 of the table is scanned every second. And the loop breaks as soon a jiffie is consumed. We should adapt the loop count based on the actual number of entries in the route cache, and eventually give more 'jiffies' in some pressure cases. I am experimenting some changes that I will share when ready. Thank you Eric Dumazet ^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures 2005-03-16 10:47 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet @ 2005-03-16 18:05 ` Eric Dumazet 2005-03-16 22:10 ` David S. Miller 2005-03-16 22:09 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller 1 sibling, 1 reply; 44+ messages in thread From: Eric Dumazet @ 2005-03-16 18:05 UTC (permalink / raw) To: David S. Miller, netdev As peer_cachep uses kmem_cache_create("inet_peer_cache", ... SLAB_HWCACHE_ALIGN ...), better try to use exactly 64 bytes instead of 72 for struct inet_peer. This also means that this structure fits in one cache line on x86_64. Thank you Eric Dumazet # diff -Nru linux-2.6.11/include/net/inetpeer.h linux-2.6.11-ed/include/net/inetpeer.h --- linux-2.6.11/include/net/inetpeer.h 2005-03-02 08:37:48.000000000 +0100 +++ linux-2.6.11-ed/include/net/inetpeer.h 2005-03-16 18:52:49.000000000 +0100 @@ -19,9 +19,9 @@ { struct inet_peer *avl_left, *avl_right; struct inet_peer *unused_next, **unused_prevp; - atomic_t refcnt; unsigned long dtime; /* the time of last use of not * referenced entries */ + atomic_t refcnt; __u32 v4daddr; /* peer's address */ __u16 avl_height; __u16 ip_id_count; /* IP ID for the next packet */ ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures 2005-03-16 18:05 ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet @ 2005-03-16 22:10 ` David S. Miller 0 siblings, 0 replies; 44+ messages in thread From: David S. Miller @ 2005-03-16 22:10 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Wed, 16 Mar 2005 19:05:45 +0100 Eric Dumazet <dada1@cosmosbay.com> wrote: > As peer_cachep uses kmem_cache_create("inet_peer_cache", ... SLAB_HWCACHE_ALIGN ...), > better try to use exactly 64 bytes instead of 72 for struct inet_peer. > > This also means that this structure fits in one cache line on x86_64. Please resend the patch, your email client turned all the tab characters into spaces so the patch will not apply. ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-03-16 10:47 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet 2005-03-16 18:05 ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet @ 2005-03-16 22:09 ` David S. Miller 2005-03-17 19:52 ` Eric Dumazet 1 sibling, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-03-16 22:09 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Wed, 16 Mar 2005 11:47:34 +0100 Eric Dumazet <dada1@cosmosbay.com> wrote: > The rt_check_expire() has a serious problem on machines with large > route caches, and a standard HZ value of 1000. > > With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ; > > the loop count : > > for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; > > > overflows (t is a 31 bit value) as soon rt_hash_log is >= 16 (65536 > slots in route cache hash table) ... > I am experimenting some changes that I will share when ready. Good catch, let us know when you have a patch for review. ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-03-16 22:09 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller @ 2005-03-17 19:52 ` Eric Dumazet 2005-04-01 6:13 ` David S. Miller 0 siblings, 1 reply; 44+ messages in thread From: Eric Dumazet @ 2005-03-17 19:52 UTC (permalink / raw) To: David S. Miller; +Cc: netdev [-- Attachment #1: Type: text/plain, Size: 1147 bytes --] David S. Miller a écrit : > On Wed, 16 Mar 2005 11:47:34 +0100 > Eric Dumazet <dada1@cosmosbay.com> wrote: > > >>The rt_check_expire() has a serious problem on machines with large >>route caches, and a standard HZ value of 1000. >> >>With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ; >> >>the loop count : >> >> for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; >> >> >>overflows (t is a 31 bit value) as soon rt_hash_log is >= 16 (65536 >>slots in route cache hash table) > > ... > >>I am experimenting some changes that I will share when ready. > > > Good catch, let us know when you have a patch for review. > > Hi This is the patch I'm currently running on several machines (with very high pressure on route cache) If you want to comment it before I submit it with a nice looking mail :) Description : - Makes rt_check_expire() do some real work, auto-adapting the timer interval (chosen between gc_interval/8 and gc_interval) - Move the spinlocks out of tr_hash_table[] to a fixed size table : Saves a lot of memory (particulary on UP) Thank you Eric Dumazet [-- Attachment #2: diff --] [-- Type: text/plain, Size: 10920 bytes --] diff -Nru linux-2.6.11/net/ipv4/route.c linux-2.6.11-ed/net/ipv4/route.c --- linux-2.6.11/net/ipv4/route.c 2005-03-17 11:19:45.000000000 +0100 +++ linux-2.6.11-ed/net/ipv4/route.c 2005-03-17 18:33:20.000000000 +0100 @@ -54,6 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -107,12 +108,13 @@ #define IP_MAX_MTU 0xFFF0 #define RT_GC_TIMEOUT (300*HZ) +#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */ static int ip_rt_min_delay = 2 * HZ; static int ip_rt_max_delay = 10 * HZ; static int ip_rt_max_size; static int ip_rt_gc_timeout = RT_GC_TIMEOUT; -static int ip_rt_gc_interval = 60 * HZ; +static int ip_rt_gc_interval = RT_GC_INTERVAL; static int ip_rt_gc_min_interval = HZ / 2; static int ip_rt_redirect_number = 9; static int ip_rt_redirect_load = HZ / 50; @@ -124,6 +126,7 @@ static int ip_rt_min_pmtu = 512 + 20 + 20; static int ip_rt_min_advmss = 256; static int ip_rt_secret_interval = 10 * 60 * HZ; +static int ip_rt_debug ; static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) @@ -197,8 +200,24 @@ struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; -} __attribute__((__aligned__(8))); +}; + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of fixed size spinlocks + */ +# define RT_HASH_LOCK_SZ 256 + static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ]; +# define rt_hash_lock_addr(slot) &rt_hash_lock[slot & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + spin_lock_init(&rt_hash_lock[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -470,7 +489,7 @@ rth->u.dst.expires; } -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) { unsigned long age; int ret = 0; @@ -516,45 +535,84 @@ /* This runs via a timer and thus is always in BH context. 
*/ static void rt_check_expire(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + static unsigned int effective_interval = RT_GC_INTERVAL; + static unsigned int cached_gc_interval = RT_GC_INTERVAL; + unsigned int i = rover, t; struct rtable *rth, **rthp; unsigned long now = jiffies; + unsigned int freed = 0 , t0; + u64 mult; - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { - unsigned long tmo = ip_rt_gc_timeout; - + if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */ + cached_gc_interval = ip_rt_gc_interval; + effective_interval = cached_gc_interval; + } + /* computes the number of slots we should examin in this run */ + mult = ((u64)effective_interval) << rt_hash_log; + do_div(mult, ip_rt_gc_timeout); + t = (unsigned int)mult; + + if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/8) { + t <<= 1; /* be more aggressive */ + if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/4) { + t <<= 1; /* be more aggressive */ + if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/2) { + t <<= 1; /* be more aggressive */ + now++; /* give us one more tick (time) to do our job */ + } + } + } + if (t > rt_hash_mask) t = rt_hash_mask + 1; + t0 = t; + for ( ; t > 0; t -= 1) { i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - - spin_lock(&rt_hash_table[i].lock); - while ((rth = *rthp) != NULL) { - if (rth->u.dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { + if (*rthp) { + unsigned long tmo = ip_rt_gc_timeout; + spin_lock(rt_hash_lock_addr(i)); + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(now, rth->u.dst.expires)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { tmo >>= 1; rthp = &rth->u.rt_next; continue; } - } else if 
(!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.rt_next; - continue; + + /* Cleanup aged off entries. */ + *rthp = rth->u.rt_next; + freed++; + rt_free(rth); } - - /* Cleanup aged off entries. */ - *rthp = rth->u.rt_next; - rt_free(rth); + spin_unlock(rt_hash_lock_addr(i)); } - spin_unlock(&rt_hash_table[i].lock); - /* Fallback loop breaker. */ if (time_after(jiffies, now)) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + if (t) { + /* Not enough time to perform our job, try to adjust the timer. + * Firing the timer sooner means less planned work. + * We allow the timer to be 1/8 of the sysctl value. + */ + effective_interval = (effective_interval + cached_gc_interval/8)/2; + } + else { + /* We finished our job before time limit, try to increase the timer + * The limit is the sysctl value. + */ + effective_interval = (effective_interval + cached_gc_interval)/2; + } + if (ip_rt_debug & 1) + printk(KERN_WARNING "rt_check_expire() : %u freed, t=%u/%u interval=%u ticks\n", freed, t, t0, effective_interval); + mod_timer(&rt_periodic_timer, jiffies + effective_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -570,11 +628,11 @@ get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -704,7 +762,7 @@ k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -715,7 +773,7 @@ rt_free(rth); goal--; } - spin_unlock_bh(&rt_hash_table[k].lock); + spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -792,7 +850,7 @@ rthp = 
&rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { if (compare_keys(&rth->fl, &rt->fl)) { /* Put it first */ @@ -813,7 +871,7 @@ rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -854,7 +912,7 @@ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -895,7 +953,7 @@ } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ -962,7 +1020,7 @@ { struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -971,7 +1029,7 @@ rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -2569,6 +2627,23 @@ .strategy = &sysctl_jiffies, }, { + .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL_MS, + .procname = "gc_interval_ms", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_DEBUG, + .procname = "gc_debug", + .data = &ip_rt_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, .procname = "redirect_load", .data = &ip_rt_redirect_load, @@ -2718,7 +2793,7 @@ int __init ip_rt_init(void) { - int i, order, goal, rc = 0; + int order, goal, rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); @@ 
-2766,14 +2841,12 @@ for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) /* NOTHING */; - rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); - rt_hash_table[i].chain = NULL; - } + memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + ipv4_dst_ops.gc_thresh = rt_hash_mask; + ip_rt_max_size = rt_hash_mask * 16; + rt_hash_mask--; rt_cache_stat = alloc_percpu(struct rt_cache_stat); if (!rt_cache_stat) diff -Nru linux-2.6.11/Documentation/filesystems/proc.txt linux-2.6.11-ed/Documentation/filesystems/proc.txt --- linux-2.6.11/Documentation/filesystems/proc.txt 2005-03-17 18:42:54.000000000 +0100 +++ linux-2.6.11-ed/Documentation/filesystems/proc.txt 2005-03-17 18:42:14.000000000 +0100 @@ -1709,12 +1709,13 @@ Writing to this file results in a flush of the routing cache. -gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh +gc_elasticity, gc_interval_ms, gc_min_interval_ms, gc_timeout, gc_thresh, gc_debug --------------------------------------------------------------------- Values to control the frequency and behavior of the garbage collection algorithm for the routing cache. gc_min_interval is deprecated and replaced -by gc_min_interval_ms. +by gc_min_interval_ms. gc_interval is deprecated and replaced by +gc_interval_ms. gc_debug enables some printk() max_size diff -Nru linux-2.6.11/include/linux/sysctl.h linux-2.6.11-ed/include/linux/sysctl.h --- linux-2.6.11/include/linux/sysctl.h 2005-03-17 18:37:13.000000000 +0100 +++ linux-2.6.11-ed/include/linux/sysctl.h 2005-03-17 18:36:57.000000000 +0100 @@ -367,6 +367,8 @@ NET_IPV4_ROUTE_MIN_ADVMSS=17, NET_IPV4_ROUTE_SECRET_INTERVAL=18, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19, + NET_IPV4_ROUTE_GC_INTERVAL_MS=20, + NET_IPV4_ROUTE_GC_DEBUG=21, }; enum ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-03-17 19:52 ` Eric Dumazet @ 2005-04-01 6:13 ` David S. Miller 2005-04-01 14:39 ` Eric Dumazet 0 siblings, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-04-01 6:13 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Thu, 17 Mar 2005 20:52:44 +0100 Eric Dumazet <dada1@cosmosbay.com> wrote: > - Move the spinlocks out of tr_hash_table[] to a fixed size table : Saves a lot of memory (particulary on UP) If spinlock_t is a zero sized structure on UP, how can this save memory on UP? :-) Anyways, I think perhaps you should dynamically allocate this lock table. Otherwise it looks fine. ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 6:13 ` David S. Miller @ 2005-04-01 14:39 ` Eric Dumazet 2005-04-01 15:53 ` Robert Olsson 2005-04-01 20:28 ` David S. Miller 0 siblings, 2 replies; 44+ messages in thread From: Eric Dumazet @ 2005-04-01 14:39 UTC (permalink / raw) To: David S. Miller; +Cc: netdev David S. Miller a écrit : > On Thu, 17 Mar 2005 20:52:44 +0100 > Eric Dumazet <dada1@cosmosbay.com> wrote: > > >> - Move the spinlocks out of tr_hash_table[] to a fixed size table : Saves a lot of memory (particulary on UP) > > > If spinlock_t is a zero sized structure on UP, how can this save memory > on UP? :-) Because I deleted the __attribute__((__aligned__(8))) constraint on struct rt_hash_bucket. So sizeof(struct rt_hash_bucket) is now 4 instead of 8 on 32 bits architectures. May I remind you some people still use 32 bits CPU ? :-) By the way I have an updated patch... surviving very serious loads. > > Anyways, I think perhaps you should dynamically allocate this lock table. Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ? > Otherwise it looks fine. > > ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 14:39 ` Eric Dumazet @ 2005-04-01 15:53 ` Robert Olsson 2005-04-01 16:34 ` Eric Dumazet 2005-04-01 20:28 ` David S. Miller 1 sibling, 1 reply; 44+ messages in thread From: Robert Olsson @ 2005-04-01 15:53 UTC (permalink / raw) To: Eric Dumazet; +Cc: David S. Miller, netdev Hello! Eric Dumazet writes: > By the way I have an updated patch... surviving very serious loads. Did you check for performance changes too? From what I understand we can add new lookup and cache miss in the fast packet path. > > Anyways, I think perhaps you should dynamically allocate this lock table. > > Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ? IMO we should be careful with adding new complexity the route hash. Also was this dynamic behavior gc_interval needed to fix the overflow? gc_interval is only sort of last resort timer. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 15:53 ` Robert Olsson @ 2005-04-01 16:34 ` Eric Dumazet 2005-04-01 17:26 ` Robert Olsson 0 siblings, 1 reply; 44+ messages in thread From: Eric Dumazet @ 2005-04-01 16:34 UTC (permalink / raw) To: Robert Olsson; +Cc: David S. Miller, netdev Robert Olsson a écrit : > Hello! > > Did you check for performance changes too? From what I understand > we can add new lookup and cache miss in the fast packet path. Performance is better because in case of stress (lot of incoming packets per second), the 1024 bytes of the locks are all in cache. As the size of the hash is divided by a 2 factor, rt_check_expire() and/or rt_garbage_collect() have to touch less cache lines. According to oprofile, an unpatched kernel was spending more than 15% of time in route.c routines, now I see ip_route_input() at 1.88% > > > > Anyways, I think perhaps you should dynamically allocate this lock table. > > > > Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ? > > IMO we should be careful with adding new complexity the route hash. > Also was this dynamic behavior gc_interval needed to fix the overflow? In my case yes, because I have huge route cache. > gc_interval is only sort of last resort timer. Actually not : gc_interval controls the rt_check_expire() to clean the hash table after use. All old enough entries can be deleted smoothly, on behalf of a timer tick (so network interrupts can still occur) I found it was better to adjust gc_interval to 1 (to let it fire every second and examine 1/300 table slots, or more if the dynamic behavior triggers), and ajust params so that rt_garbage_collect() doesnt run at all : rt_garbage_collect() can take forever to complete, blocking network trafic. Eric Dumazet ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 16:34 ` Eric Dumazet @ 2005-04-01 17:26 ` Robert Olsson 0 siblings, 0 replies; 44+ messages in thread From: Robert Olsson @ 2005-04-01 17:26 UTC (permalink / raw) To: Eric Dumazet; +Cc: Robert Olsson, David S. Miller, netdev Eric Dumazet writes: > According to oprofile, an unpatched kernel was spending more than 15% of time in route.c routines, now I see ip_route_input() at 1.88% Would like to see absolute numbers for UP/SMP single flow and DoS to be confident. > I found it was better to adjust gc_interval to 1 (to let it fire every second and examine 1/300 table slots, or more if the dynamic behavior > triggers), and ajust params so that rt_garbage_collect() doesnt run at all : rt_garbage_collect() can take forever to complete, blocking > network trafic. I don't think you can depend on timer for GC solely. Timer tick is eternity for today's packet rates. You can distribute the GC load by allowing it to run more frequently; this, in combination with a huge cache, seems to be a very interesting approach given that you have memory. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 14:39 ` Eric Dumazet 2005-04-01 15:53 ` Robert Olsson @ 2005-04-01 20:28 ` David S. Miller 2005-04-01 21:05 ` Eric Dumazet 1 sibling, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-04-01 20:28 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Fri, 01 Apr 2005 16:39:48 +0200 Eric Dumazet <dada1@cosmosbay.com> wrote: > > If spinlock_t is a zero sized structure on UP, how can this save memory > > on UP? :-) > > Because I deleted the __attribute__((__aligned__(8))) constraint on struct rt_hash_bucket. Right. > > Anyways, I think perhaps you should dynamically allocate this lock table. > > Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ? Even for NR_CPUS, I think the table should be dynamically allocated. It is a goal to eliminate all of these huge arrays in the static kernel image, which has grown incredibly too much in recent times. I work often to eliminate such things, let's not add new ones :-) ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 20:28 ` David S. Miller @ 2005-04-01 21:05 ` Eric Dumazet 2005-04-01 21:08 ` David S. Miller 0 siblings, 1 reply; 44+ messages in thread From: Eric Dumazet @ 2005-04-01 21:05 UTC (permalink / raw) To: David S. Miller; +Cc: netdev David S. Miller a écrit : > On Fri, 01 Apr 2005 16:39:48 +0200 > Eric Dumazet <dada1@cosmosbay.com> wrote: > >>Maybe I should make a static sizing, (replace the 256 constant by something based on MAX_CPUS) ? > > > Even for NR_CPUS, I think the table should be dynamically allocated. > > It is a goal to eliminate all of these huge arrays in the static > kernel image, which has grown incredibly too much in recent times. > I work often to eliminate such things, let's not add new ones :-) You mean you prefer : static spinlock_t *rt_hash_lock ; /* rt_hash_lock = alloc_memory_at_boot_time(...) */ instead of static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ] ; In both cases, memory is taken from lowmem, and size of kernel image is roughly the same (bss section takes no space in image) Then the runtime cost is more expensive in the 'dynamic case' because of the extra indirection... ? ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 21:05 ` Eric Dumazet @ 2005-04-01 21:08 ` David S. Miller 2005-04-01 21:43 ` Eric Dumazet 0 siblings, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-04-01 21:08 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Fri, 01 Apr 2005 23:05:37 +0200 Eric Dumazet <dada1@cosmosbay.com> wrote: > You mean you prefer : > > static spinlock_t *rt_hash_lock ; /* rt_hash_lock = > alloc_memory_at_boot_time(...) */ > > instead of > > static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ] ; > > In both cases, memory is taken from lowmem, and size of kernel image > is roughly the same (bss section takes no space in image) In the former case the kernel image the bootloader has to load is smaller. That's important, believe it or not. It means less TLB entries need to be locked permanently into the MMU on certain platforms. ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 21:08 ` David S. Miller @ 2005-04-01 21:43 ` Eric Dumazet 2005-04-01 22:34 ` David S. Miller 0 siblings, 1 reply; 44+ messages in thread From: Eric Dumazet @ 2005-04-01 21:43 UTC (permalink / raw) To: David S. Miller; +Cc: netdev David S. Miller a écrit : > On Fri, 01 Apr 2005 23:05:37 +0200 > Eric Dumazet <dada1@cosmosbay.com> wrote: > > >>You mean you prefer : >> >>static spinlock_t *rt_hash_lock ; /* rt_hash_lock = >>alloc_memory_at_boot_time(...) */ >> >>instead of >> >>static spinlock_t rt_hash_lock[RT_HASH_LOCK_SZ] ; >> >>In both cases, memory is taken from lowmem, and size of kernel image >>is roughly the same (bss section takes no space in image) > > > In the former case the kernel image the bootloader has to > load is smaller. That's important, believe it or not. It > means less TLB entries need to be locked permanently into > the MMU on certain platforms. > > OK thanks for this clarification. I changed to : #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks * The size of this table is a power of two and depends on the number of CPUS. 
*/ #if NR_CPUS >= 32 #define RT_HASH_LOCK_SZ 4096 #elif NR_CPUS >= 16 #define RT_HASH_LOCK_SZ 2048 #elif NR_CPUS >= 8 #define RT_HASH_LOCK_SZ 1024 #elif NR_CPUS >= 4 #define RT_HASH_LOCK_SZ 512 #else #define RT_HASH_LOCK_SZ 256 #endif static spinlock_t *rt_hash_locks; # define rt_hash_lock_addr(slot) &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)] # define rt_hash_lock_init() { \ int i; \ rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ spin_lock_init(&rt_hash_locks[i]); \ } #else # define rt_hash_lock_addr(slot) NULL # define rt_hash_lock_init() #endif Are you OK if I also use alloc_large_system_hash() to allocate rt_hash_table, instead of the current method ? This new method is used in net/ipv4/tcp.c for tcp_ehash and tcp_bhash and permits NUMA tuning. Eric ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 21:43 ` Eric Dumazet @ 2005-04-01 22:34 ` David S. Miller 2005-04-01 23:21 ` Eric Dumazet 0 siblings, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-04-01 22:34 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Fri, 01 Apr 2005 23:43:38 +0200 Eric Dumazet <dada1@cosmosbay.com> wrote: > Are you OK if I also use alloc_large_system_hash() to allocate > rt_hash_table, instead of the current method ? This new method is used > in net/ipv4/tcp.c for tcp_ehash and tcp_bhash and permits NUMA tuning. Sure, that's fine. BTW, please line-wrap your emails. :-/ ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 22:34 ` David S. Miller @ 2005-04-01 23:21 ` Eric Dumazet 2005-04-01 23:54 ` David S. Miller 2005-04-02 8:21 ` Herbert Xu 0 siblings, 2 replies; 44+ messages in thread From: Eric Dumazet @ 2005-04-01 23:21 UTC (permalink / raw) To: David S. Miller; +Cc: netdev [-- Attachment #1: Type: text/plain, Size: 904 bytes --] David S. Miller a écrit : > On Fri, 01 Apr 2005 23:43:38 +0200 > Eric Dumazet <dada1@cosmosbay.com> wrote: > > >>Are you OK if I also use alloc_large_system_hash() to allocate >>rt_hash_table, instead of the current method ? This new method is used >>in net/ipv4/tcp.c for tcp_ehash and tcp_bhash and permits NUMA tuning. > > > Sure, that's fine. > > BTW, please line-wrap your emails. :-/ > > :-) OK this patch includes everything... - Locking abstraction - rt_check_expire() fixes - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second - New gc_debug sysctl to let sysadmin tune gc - Less memory used by hash table (spinlocks moved to a smaller table) - sizing of spinlocks table depends on NR_CPUS - hash table allocated using alloc_large_system_hash() function - header fix for /proc/net/stat/rt_cache Thank you Eric [-- Attachment #2: diff --] [-- Type: text/plain, Size: 14030 bytes --] diff -Nru linux-2.6.11/net/ipv4/route.c linux-2.6.11-ed/net/ipv4/route.c --- linux-2.6.11/net/ipv4/route.c 2005-03-02 08:38:38.000000000 +0100 +++ linux-2.6.11-ed/net/ipv4/route.c 2005-04-02 01:10:37.000000000 +0200 @@ -54,6 +54,8 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. 
+ * : bugfix in rt_cpu_seq_show() * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -70,6 +72,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -107,12 +110,13 @@ #define IP_MAX_MTU 0xFFF0 #define RT_GC_TIMEOUT (300*HZ) +#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */ static int ip_rt_min_delay = 2 * HZ; static int ip_rt_max_delay = 10 * HZ; static int ip_rt_max_size; static int ip_rt_gc_timeout = RT_GC_TIMEOUT; -static int ip_rt_gc_interval = 60 * HZ; +static int ip_rt_gc_interval = RT_GC_INTERVAL; static int ip_rt_gc_min_interval = HZ / 2; static int ip_rt_redirect_number = 9; static int ip_rt_redirect_load = HZ / 50; @@ -124,6 +128,7 @@ static int ip_rt_min_pmtu = 512 + 20 + 20; static int ip_rt_min_advmss = 256; static int ip_rt_secret_interval = 10 * 60 * HZ; +static int ip_rt_debug; static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) @@ -197,8 +202,38 @@ struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; -} __attribute__((__aligned__(8))); +}; + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS. 
+ */ +#if NR_CPUS >= 32 +#define RT_HASH_LOCK_SZ 4096 +#elif NR_CPUS >= 16 +#define RT_HASH_LOCK_SZ 2048 +#elif NR_CPUS >= 8 +#define RT_HASH_LOCK_SZ 1024 +#elif NR_CPUS >= 4 +#define RT_HASH_LOCK_SZ 512 +#else +#define RT_HASH_LOCK_SZ 256 +#endif + + static spinlock_t *rt_hash_locks; +# define rt_hash_lock_addr(slot) &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ + if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + spin_lock_init(&rt_hash_locks[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -393,7 +428,7 @@ struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } @@ -470,7 +505,7 @@ rth->u.dst.expires; } -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) { unsigned long age; int ret = 0; @@ -516,45 +551,93 @@ /* This runs via a timer and thus is always in BH context. 
*/ static void rt_check_expire(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + static unsigned int effective_interval = RT_GC_INTERVAL; + static unsigned int cached_gc_interval = RT_GC_INTERVAL; + unsigned int i, goal; struct rtable *rth, **rthp; unsigned long now = jiffies; + unsigned int freed = 0 , t0; + u64 mult; - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { - unsigned long tmo = ip_rt_gc_timeout; - + if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */ + cached_gc_interval = ip_rt_gc_interval; + effective_interval = cached_gc_interval; + } + /* Computes the number of slots we should examin in this run : + * We want to perform a full scan every ip_rt_gc_timeout, and + * the timer is started every 'effective_interval' ticks. + * so goal = (number_of_slots) * (effective_interval / ip_rt_gc_timeout) + */ + mult = ((u64)effective_interval) << rt_hash_log; + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + + i = atomic_read(&ipv4_dst_ops.entries) << 3; + if (i > ip_rt_max_size) { + goal <<= 1; /* be more aggressive */ + i >>= 1; + if (i > ip_rt_max_size) { + goal <<= 1; /* be more aggressive */ + i >>= 1; + if (i > ip_rt_max_size) { + goal <<= 1; /* be more aggressive */ + now++; /* give us one more tick (time) to do our job */ + } + } + } + if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + t0 = goal; + i = rover; + for ( ; goal > 0; goal--) { i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - - spin_lock(&rt_hash_table[i].lock); - while ((rth = *rthp) != NULL) { - if (rth->u.dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { + if (*rthp) { + unsigned long tmo = ip_rt_gc_timeout; + spin_lock(rt_hash_lock_addr(i)); + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(now, 
rth->u.dst.expires)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { tmo >>= 1; rthp = &rth->u.rt_next; continue; } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.rt_next; - continue; - } - /* Cleanup aged off entries. */ - *rthp = rth->u.rt_next; - rt_free(rth); + /* Cleanup aged off entries. */ + *rthp = rth->u.rt_next; + freed++; + rt_free(rth); + } + spin_unlock(rt_hash_lock_addr(i)); } - spin_unlock(&rt_hash_table[i].lock); - /* Fallback loop breaker. */ if (time_after(jiffies, now)) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + if (goal != 0) { + /* Not enough time to perform our job, try to adjust the timer. + * Firing the timer sooner means less planned work. + * We allow the timer to be 1/8 of the sysctl value. + */ + effective_interval = (effective_interval + cached_gc_interval/8)/2; + } + else { + /* We finished our job before time limit, try to increase the timer + * The limit is the sysctl value, we use a weight of 3/1 to + * increase slowly. 
+ */ + effective_interval = (3*effective_interval + cached_gc_interval + 3)/4; + } + if (ip_rt_debug & 1) + printk(KERN_WARNING "rt_check_expire() : %u freed, goal=%u/%u, interval=%u ticks\n", freed, goal, t0, effective_interval); + mod_timer(&rt_periodic_timer, jiffies + effective_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -570,11 +653,11 @@ get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -704,7 +787,7 @@ k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -715,7 +798,7 @@ rt_free(rth); goal--; } - spin_unlock_bh(&rt_hash_table[k].lock); + spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -792,7 +875,7 @@ rthp = &rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { if (compare_keys(&rth->fl, &rt->fl)) { /* Put it first */ @@ -813,7 +896,7 @@ rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -854,7 +937,7 @@ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -895,7 +978,7 @@ } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ -962,7 +1045,7 @@ { 
struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -971,7 +1054,7 @@ rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -2569,6 +2652,23 @@ .strategy = &sysctl_jiffies, }, { + .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL_MS, + .procname = "gc_interval_ms", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_DEBUG, + .procname = "gc_debug", + .data = &ip_rt_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, .procname = "redirect_load", .data = &ip_rt_redirect_load, @@ -2718,12 +2818,13 @@ int __init ip_rt_init(void) { - int i, order, goal, rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE + { + int order; for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) /* NOTHING */; @@ -2731,6 +2832,7 @@ if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE << order); + } #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", @@ -2741,39 +2843,24 @@ if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); - goal = num_physpages >> (26 - PAGE_SHIFT); - if (rhash_entries) - goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; - for (order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - do { - rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct rt_hash_bucket); - while (rt_hash_mask & (rt_hash_mask - 1)) - rt_hash_mask--; - rt_hash_table = (struct rt_hash_bucket *) - 
__get_free_pages(GFP_ATOMIC, order); - } while (rt_hash_table == NULL && --order > 0); - - if (!rt_hash_table) - panic("Failed to allocate IP route cache hash table\n"); - - printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", - rt_hash_mask, - (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); + rt_hash_table = (struct rt_hash_bucket *) + alloc_large_system_hash("IP route cache", + sizeof(struct rt_hash_bucket), + rhash_entries, + (num_physpages >= 128 * 1024) ? + (27 - PAGE_SHIFT) : + (29 - PAGE_SHIFT), + HASH_HIGHMEM, + &rt_hash_log, + &rt_hash_mask, + 0); - for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) - /* NOTHING */; + memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); + ipv4_dst_ops.gc_thresh = rt_hash_mask; + ip_rt_max_size = rt_hash_mask * 16; rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); - rt_hash_table[i].chain = NULL; - } - - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; rt_cache_stat = alloc_percpu(struct rt_cache_stat); if (!rt_cache_stat) @@ -2819,7 +2906,7 @@ xfrm_init(); xfrm4_init(); #endif - return rc; + return 0; } EXPORT_SYMBOL(__ip_select_ident); diff -Nru linux-2.6.11/Documentation/filesystems/proc.txt linux-2.6.11-ed/Documentation/filesystems/proc.txt --- linux-2.6.11/Documentation/filesystems/proc.txt 2005-04-02 01:19:15.000000000 +0200 +++ linux-2.6.11-ed/Documentation/filesystems/proc.txt 2005-04-02 01:19:04.000000000 +0200 @@ -1709,12 +1709,13 @@ Writing to this file results in a flush of the routing cache. -gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh +gc_elasticity, gc_interval_ms, gc_min_interval_ms, gc_timeout, gc_thresh, gc_debug --------------------------------------------------------------------- Values to control the frequency and behavior of the garbage collection algorithm for the routing cache. 
gc_min_interval is deprecated and replaced -by gc_min_interval_ms. +by gc_min_interval_ms. gc_interval is deprecated and replaced by +gc_interval_ms. gc_debug enables some printk() max_size diff -Nru linux-2.6.11/include/linux/sysctl.h linux-2.6.11-ed/include/linux/sysctl.h --- linux-2.6.11/include/linux/sysctl.h 2005-03-02 08:38:10.000000000 +0100 +++ linux-2.6.11-ed/include/linux/sysctl.h 2005-04-02 00:43:11.000000000 +0200 @@ -367,6 +367,8 @@ NET_IPV4_ROUTE_MIN_ADVMSS=17, NET_IPV4_ROUTE_SECRET_INTERVAL=18, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19, + NET_IPV4_ROUTE_GC_INTERVAL_MS=20, + NET_IPV4_ROUTE_GC_DEBUG=21, }; enum ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 23:21 ` Eric Dumazet @ 2005-04-01 23:54 ` David S. Miller 2005-04-02 8:21 ` Herbert Xu 1 sibling, 0 replies; 44+ messages in thread From: David S. Miller @ 2005-04-01 23:54 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev On Sat, 02 Apr 2005 01:21:49 +0200 Eric Dumazet <dada1@cosmosbay.com> wrote: > OK this patch includes everything... > > - Locking abstraction > - rt_check_expire() fixes > - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second > - New gc_debug sysctl to let sysadmin tune gc > - Less memory used by hash table (spinlocks moved to a smaller table) > - sizing of spinlocks table depends on NR_CPUS > - hash table allocated using alloc_large_system_hash() function > - header fix for /proc/net/stat/rt_cache Looks fine to me. I'd like to see some feedback from folks like Robert Olsson and co. before applying this, so let's allow the patch to simmer over the weekend, ok? :-) ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-01 23:21 ` Eric Dumazet 2005-04-01 23:54 ` David S. Miller @ 2005-04-02 8:21 ` Herbert Xu 2005-04-02 9:21 ` Eric Dumazet 1 sibling, 1 reply; 44+ messages in thread From: Herbert Xu @ 2005-04-02 8:21 UTC (permalink / raw) To: Eric Dumazet; +Cc: davem, netdev, Robert.Olsson Eric Dumazet <dada1@cosmosbay.com> wrote: > > OK this patch includes everything... > > - Locking abstraction > - rt_check_expire() fixes > - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second > - New gc_debug sysctl to let sysadmin tune gc > - Less memory used by hash table (spinlocks moved to a smaller table) > - sizing of spinlocks table depends on NR_CPUS > - hash table allocated using alloc_large_system_hash() function > - header fix for /proc/net/stat/rt_cache This patch is doing too many things. How about splitting it up? For instance the spin lock stuff is pretty straightforward and should be in its own patch. The benefits of the GC changes are not obvious to me. rt_check_expire is simply meant to kill off old entries. It's not really meant to be used to free up entries when the table gets full. rt_garbage_collect on the other hand is designed to free entries when it is needed. Eric raised the point that rt_garbage_collect is pretty expensive. So what about amortising its cost a bit more? For instance, we can set a new threshold that's lower than gc_thresh and perform GC on the chain being inserted in rt_intern_hash if we're above that threshold. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 8:21 ` Herbert Xu @ 2005-04-02 9:21 ` Eric Dumazet 2005-04-02 11:23 ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu 2005-04-02 13:48 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson 0 siblings, 2 replies; 44+ messages in thread From: Eric Dumazet @ 2005-04-02 9:21 UTC (permalink / raw) To: Herbert Xu; +Cc: davem, netdev, Robert.Olsson Herbert Xu a écrit : > Eric Dumazet <dada1@cosmosbay.com> wrote: > >>OK this patch includes everything... >> >> - Locking abstraction >> - rt_check_expire() fixes >> - New gc_interval_ms sysctl to be able to have timer gc_interval < 1 second >> - New gc_debug sysctl to let sysadmin tune gc >> - Less memory used by hash table (spinlocks moved to a smaller table) >> - sizing of spinlocks table depends on NR_CPUS >> - hash table allocated using alloc_large_system_hash() function >> - header fix for /proc/net/stat/rt_cache > > > This patch is doing too many things. How about splitting it up? > > For instance the spin lock stuff is pretty straightforward and > should be in its own patch. > > The benefits of the GC changes are not obvious to me. rt_check_expire > is simply meant to kill off old entries. It's not really meant to be > used to free up entries when the table gets full. Well, I began my work because of the overflow bug in rt_check_expire()... Then I realize this function could not work as expected. On a loaded machine, one timer tick is 1 ms. During this time, number of chains that are scanned is ridiculous. With the standard timer of 60 second, fact is rt_check_expire() is useless. > > rt_garbage_collect on the other hand is designed to free entries > when it is needed. Eric raised the point that rt_garbage_collect > is pretty expensive. So what about amortising its cost a bit more? Yes. rt_garbage_collect() has serious problems. But this function is sooo complex I dont want to touch it and let experts do it if they want. 
But then one may think why we have two similar functions that are doing basically the same thing : garbage collection. One of a production machine rtstat -i 1 output is : rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache| entries| in_hit|in_slow_|in_slow_|in_no_ro| in_brd|in_marti|in_marti| out_hit|out_slow|out_slow|gc_total|gc_ignor|gc_goal_|gc_dst_o|in_hlist|out_hlis| | | tot| mc| ute| | an_dst| an_src| | _tot| _mc| | ed| miss| verflow| _search|t_search| 2618087| 28581| 7673| 0| 0| 0| 0| 0| 1800| 1450| 0| 0| 0| 0| 0| 37630| 4783| 2618689| 25444| 4918| 0| 0| 0| 0| 0| 2051| 1699| 0| 0| 0| 0| 0| 27741| 5461| 2619369| 25000| 4567| 0| 0| 0| 0| 0| 1860| 1304| 0| 0| 0| 0| 0| 26606| 4563| 2618396| 24830| 4633| 0| 0| 0| 0| 0| 1959| 1492| 0| 0| 0| 0| 0| 26643| 4930| Without serious tuning, this machine could not handle this load, or even half of it. Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die. > > For instance, we can set a new threshold that's lower than gc_thresh > and perform GC on the chain being inserted in rt_intern_hash if we're > above that threshold. We could also try to perform GC on L1_CACHE_SIZE/sizeof(struct rt_hash_bucket) chains, not only the 'current chain', to fully use the cache miss. > > Cheers, Thank you ^ permalink raw reply [flat|nested] 44+ messages in thread
* Get rid of rt_check_expire and rt_garbage_collect 2005-04-02 9:21 ` Eric Dumazet @ 2005-04-02 11:23 ` Herbert Xu 2005-04-02 13:58 ` Eric Dumazet ` (2 more replies) 2005-04-02 13:48 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson 1 sibling, 3 replies; 44+ messages in thread From: Herbert Xu @ 2005-04-02 11:23 UTC (permalink / raw) To: Eric Dumazet; +Cc: davem, netdev, Robert.Olsson, hadi On Sat, Apr 02, 2005 at 11:21:30AM +0200, Eric Dumazet wrote: > > Well, I began my work because of the overflow bug in rt_check_expire()... > Then I realize this function could not work as expected. On a loaded > machine, one timer tick is 1 ms. > During this time, number of chains that are scanned is ridiculous. > With the standard timer of 60 second, fact is rt_check_expire() is useless. I see. What we've got here is a scalability problem with respect to the number of hash buckets. As the number of buckets increases, the amount of work the timer GC has to perform increases proportionally. Since the timer GC parameters are fixed, this will eventually break. Rather than changing the timer GC so that it runs more often to keep up with the large routing cache, we should get out of this by reducing the amount of work we have to do. Imagine an ideal balanced hash table with 2.6 million entries. That is, all incoming/outgoing packets belong to flows that are already in the hash table. Imagine also that there is no PMTU/link failure taking place so all entries are valid forever. In this state there is absolutely no need to execute the timer GC. Let's remove one of those assumptions and allow there to be entries which need to expire after a set period. Instead of having the timer GC clean them up, we can move the expire check to the place where the entries are used. That is, we make ip_route_input/ip_route_output/ipv4_dst_check check whether the entry has expired. 
On the face of it we're doing more work since every routing cache hit will need to check the validity of the dst. However, because it's a single subtraction it is actually pretty cheap. There is also no additional cache miss compared to doing it in the timer GC since we have to read the dst anyway. Let's go one step further and make the routing cache come to life. Now there are new entries coming in and we need to remove old ones in order to make room for them. That task is currently carried out by the timer GC in rt_check_expire and on demand by rt_garbage_collect. Either way we have to walk the entire routing cache looking for entries to get rid of. This is quite expensive when the routing cache is large. However, there is a better way. The reason we keep a cap on the routing cache (for a given hash size) is so that individual chains do not degenerate into long linked lists. In other words, we don't really care about how many entries there are in the routing cache. But we do care about how long each hash chain is. So instead of walking the entire routing cache to keep the number of entries down, what we should do is keep each hash chain as short as possible. Assuming that the hash function is good, this should achieve the same end result. Here is how it can be done: Every time a routing entry is inserted into a hash chain, we perform GC on that chain unconditionally. It might seem that we're doing more work again. However, as before because we're traversing the chain anyway, it is very cheap to perform the GC operations which mainly involve the checks in rt_may_expire. OK that's enough thinking and it's time to write some code to see whether this is all bullshit :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: Get rid of rt_check_expire and rt_garbage_collect 2005-04-02 11:23 ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu @ 2005-04-02 13:58 ` Eric Dumazet 2005-04-02 14:03 ` Robert Olsson 2005-04-02 21:05 ` jamal 2 siblings, 0 replies; 44+ messages in thread From: Eric Dumazet @ 2005-04-02 13:58 UTC (permalink / raw) To: Herbert Xu; +Cc: davem, netdev, Robert.Olsson, hadi Herbert Xu a écrit : > On Sat, Apr 02, 2005 at 11:21:30AM +0200, Eric Dumazet wrote: > >>Well, I began my work because of the overflow bug in rt_check_expire()... >>Then I realize this function could not work as expected. On a loaded >>machine, one timer tick is 1 ms. >>During this time, number of chains that are scanned is ridiculous. >>With the standard timer of 60 second, fact is rt_check_expire() is useless. > > > I see. What we've got here is a scalability problem with respect > to the number of hash buckets. As the number of buckets increases, > the amount of work the timer GC has to perform inreases proportionally. > > Since the timer GC parameters are fixed, this will eventually break. > > Rather than changing the timer GC so that it runs more often to keep > up with the large routing cache, we should get out of this by reducing > the amount of work we have to do. > > Imagine an ideal balanced hash table with 2.6 million entries. That > is, all incoming/outgoing packets belong to flows that are already in > the hash table. Imagine also that there is no PMTU/link failure taking > place so all entries are valid forever. > > In this state there is absolutely no need to execute the timer GC. > > Let's remove one of those assumptions and allow there to be entries > which need to expire after a set period. > > Instead of having the timer GC clean them up, we can move the expire > check to the place where the entries are used. That is, we make > ip_route_input/ip_route_output/ipv4_dst_check check whether the > entry has expired. 
> > On the face of it we're doing more work since every routing cache > hit will need to check the validity of the dst. However, because > it's a single subtraction it is actually pretty cheap. There is > also no additional cache miss compared to doing it in the timer > GC since we have to read the dst anyway. > > Let's go one step further and make the routing cache come to life. > Now there are new entries coming in and we need to remove old ones > in order to make room for them. > > That task is currently carried out by the timer GC in rt_check_expire > and on demand by rt_garbage_collect. Either way we have to walk the > entire routing cache looking for entries to get rid of. > > This is quite expensive when the routing cache is large. However, > there is a better way. > > The reason we keep a cap on the routing cache (for a given hash size) > is so that individual chains do not degenerate into long linked lists. > > In other words, we don't really care about how many entries there are > in the routing cache. But we do care about how long each hash chain > is. > > So instead of walking the entire routing cache to keep the number of > entries down, what we should do is keep each hash chain as short as > possible. > > Assuming that the hash function is good, this should achieve the > same end result. > > Here is how it can be done: Every time a routing entry is inserted into > a hash chain, we perform GC on that chain unconditionally. > > It might seem that we're doing more work again. However, as before > because we're traversing the chain anyway, it is very cheap to perform > the GC operations which mainly involve the checks in rt_may_expire. > > OK that's enough thinking and it's time to write some code to see > whether this is all bullshit :) > > Cheers, Well, it may work if you dont care about memory used. 
# grep dst /proc/slabinfo ip_dst_cache 2825575 2849590 384 10 1 : tunables 54 27 8 : slabdata 284959 284959 0 On this machine, route cache takes 1.1 GB of ram... impressive. Then if the network load decreases (or completely stops), only a timer-driven gc could purge the cache. So rt_check_expire() is *needed* You are right saying that gc parameters are fixed, thus gc breaks at high load. Eric ^ permalink raw reply [flat|nested] 44+ messages in thread
* Get rid of rt_check_expire and rt_garbage_collect 2005-04-02 11:23 ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu 2005-04-02 13:58 ` Eric Dumazet @ 2005-04-02 14:03 ` Robert Olsson 2005-04-02 21:05 ` jamal 2 siblings, 0 replies; 44+ messages in thread From: Robert Olsson @ 2005-04-02 14:03 UTC (permalink / raw) To: Herbert Xu; +Cc: Eric Dumazet, davem, netdev, Robert.Olsson, hadi Herbert Xu writes: > Rather than changing the timer GC so that it runs more often to keep > up with the large routing cache, we should get out of this by reducing > the amount of work we have to do. Yeep. > Imagine an ideal balanced hash table with 2.6 million entries. That > is, all incoming/outgoing packets belong to flows that are already in > the hash table. Imagine also that there is no PMTU/link failure taking > place so all entries are valid forever. > > In this state there is absolutely no need to execute the timer GC. > Let's remove one of those assumptions and allow there to be entries > which need to expire after a set period. > > Instead of having the timer GC clean them up, we can move the expire > check to the place where the entries are used. That is, we make > ip_route_input/ip_route_output/ipv4_dst_check check whether the > entry has expired. > > On the face of it we're doing more work since every routing cache > hit will need to check the validity of the dst. However, because > it's a single subtraction it is actually pretty cheap. There is > also no additional cache miss compared to doing it in the timer > GC since we have to read the dst anyway. > > Let's go one step further and make the routing cache come to life. > Now there are new entries coming in and we need to remove old ones > in order to make room for them. > > That task is currently carried out by the timer GC in rt_check_expire > and on demand by rt_garbage_collect. Either way we have to walk the > entire routing cache looking for entries to get rid of. 
> > This is quite expensive when the routing cache is large. However, > there is a better way. > > The reason we keep a cap on the routing cache (for a given hash size) > is so that individual chains do not degenerate into long linked lists. > > In other words, we don't really care about how many entries there are > in the routing cache. But we do care about how long each hash chain > is. > > So instead of walking the entire routing cache to keep the number of > entries down, what we should do is keep each hash chain as short as > possible. > > Assuming that the hash function is good, this should achieve the > same end result. > > Here is how it can be done: Every time a routing entry is inserted into > a hash chain, we perform GC on that chain unconditionally. > > It might seem that we're doing more work again. However, as before > because we're traversing the chain anyway, it is very cheap to perform > the GC operations which mainly involve the checks in rt_may_expire. Agree... It's very interesting and worth to test something like this. also it could clean up the GC process and the need for tuning which would be very welcome. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: Get rid of rt_check_expire and rt_garbage_collect 2005-04-02 11:23 ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu 2005-04-02 13:58 ` Eric Dumazet 2005-04-02 14:03 ` Robert Olsson @ 2005-04-02 21:05 ` jamal 2005-04-03 7:38 ` Herbert Xu 2005-04-03 7:41 ` Herbert Xu 2 siblings, 2 replies; 44+ messages in thread From: jamal @ 2005-04-02 21:05 UTC (permalink / raw) To: Herbert Xu; +Cc: Eric Dumazet, David S. Miller, netdev, Robert Olsson On Sat, 2005-04-02 at 06:23, Herbert Xu wrote: > On Sat, Apr 02, 2005 at 11:21:30AM +0200, Eric Dumazet wrote: > > > > Well, I began my work because of the overflow bug in rt_check_expire()... > > Then I realize this function could not work as expected. On a loaded > > machine, one timer tick is 1 ms. > > During this time, number of chains that are scanned is ridiculous. > > With the standard timer of 60 second, fact is rt_check_expire() is useless. > > I see. What we've got here is a scalability problem with respect > to the number of hash buckets. As the number of buckets increases, > the amount of work the timer GC has to perform inreases proportionally. > Its classical incremental garbage collection algorithm thats being used i.e something along whats typically refered to as mark-and-sweep. Could the main issue be not the amount of routes in the cache but rather the locking when number of CPUs go up? Incrementing the timer frequency would certainly help but maybe have adverse effects if the frequency is too high because of the across system locking IMO. > Since the timer GC parameters are fixed, this will eventually break. > > Rather than changing the timer GC so that it runs more often to keep > up with the large routing cache, we should get out of this by reducing > the amount of work we have to do. > Refer to my hint above: perhaps per CPU caches? > Imagine an ideal balanced hash table with 2.6 million entries. That > is, all incoming/outgoing packets belong to flows that are already in > the hash table. 
Imagine also that there is no PMTU/link failure taking > place so all entries are valid forever. > > > In this state there is absolutely no need to execute the timer GC. > Yeah, but memory is finite friend. True, if you can imagine infinite memory we would not need gc ;-> > Let's remove one of those assumptions and allow there to be entries > which need to expire after a set period. > Instead of having the timer GC clean them up, we can move the expire > check to the place where the entries are used. That is, we make > ip_route_input/ip_route_output/ipv4_dst_check check whether the > entry has expired. > If you can show lock grabbing is the main contentious issue; i believe it is as CPUs go up. Then this is a valuable idea since you are already grabbing the locks anyways. > On the face of it we're doing more work since every routing cache > hit will need to check the validity of the dst. However, because > it's a single subtraction it is actually pretty cheap. There is > also no additional cache miss compared to doing it in the timer > GC since we have to read the dst anyway. > In the case of slower machine, the compute is also an issue. To be honest i feel like handwaving - experimenting and collecting profiles would help nail it. > Let's go one step further and make the routing cache come to life. > Now there are new entries coming in and we need to remove old ones > in order to make room for them. > > That task is currently carried out by the timer GC in rt_check_expire > and on demand by rt_garbage_collect. Either way we have to walk the > entire routing cache looking for entries to get rid of. > we dont really do the whole route cache everytime - I am sure you know that. > This is quite expensive when the routing cache is large. However, > there is a better way. > > The reason we keep a cap on the routing cache (for a given hash size) > is so that individual chains do not degenerate into long linked lists. 
> > In other words, we don't really care about how many entries there are > in the routing cache. But we do care about how long each hash chain > is. > > So instead of walking the entire routing cache to keep the number of > entries down, what we should do is keep each hash chain as short as > possible. > Thats certainly one solution .. reading on how you achieve this .. > Assuming that the hash function is good, this should achieve the > same end result. > > Here is how it can be done: Every time a routing entry is inserted into > a hash chain, we perform GC on that chain unconditionally. > May not be a good idea to do it unconditionally - in particular on SMP where another CPU may be spinning waiting for you to let go of bucket lock. In particular if a burst of packets accessing the same bucket show up on different processors, this would be aggravated. You may wanna kick in this algorithm only when things start going past a certain threshold. > It might seem that we're doing more work again. However, as before > because we're traversing the chain anyway, it is very cheap to perform > the GC operations which mainly involve the checks in rt_may_expire. > > OK that's enough thinking and it's time to write some code to see > whether this is all bullshit :) > I think there are some good ideas in there; the bottleneck could be perceived as one of either the locks are too expensive (clearly so in SMP as number of CPUs go up) or the compute is taking too long (clearly so in slower systems - but a general fact of life as well). For the first issue, amortizing the lock grabbing via compute as you suggest may be of value or make per cpu caches. cheers, jamal ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: Get rid of rt_check_expire and rt_garbage_collect 2005-04-02 21:05 ` jamal @ 2005-04-03 7:38 ` Herbert Xu 2005-04-03 7:41 ` Herbert Xu 1 sibling, 0 replies; 44+ messages in thread From: Herbert Xu @ 2005-04-03 7:38 UTC (permalink / raw) To: jamal; +Cc: Eric Dumazet, David S. Miller, netdev, Robert Olsson On Sat, Apr 02, 2005 at 04:05:55PM -0500, jamal wrote: > > > In this state there is absolutely no need to execute the timer GC. > > Yeah, but memory is finite friend. True, if you can imagine infinite > memory we would not need gc ;-> True. However running the GC when you can't free most of the entries is a waste of time. On a busy system where the routing cache is near capacity and new entries are coming in all the time, we should arrange it so that the old entries are expired when entries are inserted. Assuming the hash function is good, then as long as there is a steady stream of entries coming in, the old entries will be expired automatically. Of course, we should not leave the systems that have experienced a burst of flows at a disadvantage. Indeed there is a rather simple way of doing GC for them without having to do work that's proportional to the number of hash chains in the routing cache. The key is that the GC is only useful when the routing cache contains enough entries that can be freed. Let's say that if we can free more than 1/3 of the entries then the GC should be run. Of course you can define this to be whatever you want. So now the problem is to quickly determine whether there are enough entries in the cache that can be freed. What we can do is take a leaf out of the politicians' book :) We take a poll on a small sample of the routing cache. That is, we run the GC on a fixed number of chains, e.g., 256 chains. After that we tally the total number of entries and the number of entries freed. Since the hash function should be spreading entries throughout the chains evenly, the ratio here can be extrapolated out to the entire cache. 
Therefore once the ratio exceeds the defined threshold, we perform GC over the entire cache, preferably in a kernel thread. If not then we'll simply let the GC roam along at the constant pace of 256 chains. The advantage of this is that the GC will free entries in the entire table as soon as that becomes possible without having to do work proportional to the number of chains in each GC interval. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: Get rid of rt_check_expire and rt_garbage_collect 2005-04-02 21:05 ` jamal 2005-04-03 7:38 ` Herbert Xu @ 2005-04-03 7:41 ` Herbert Xu 1 sibling, 0 replies; 44+ messages in thread From: Herbert Xu @ 2005-04-03 7:41 UTC (permalink / raw) To: jamal; +Cc: Eric Dumazet, David S. Miller, netdev, Robert Olsson On Sat, Apr 02, 2005 at 04:05:55PM -0500, jamal wrote: > > > Here is how it can be done: Every time a routing entry is inserted into > > a hash chain, we perform GC on that chain unconditionally. > > May not be a good idea to do it unconditionally - in particular on SMP > where another CPU maybe spinning waiting for you to let go of bucket > lock. In particular if a burst of packets accessing the same bucket show > up on different processors, this would be aggravated. > You may wanna kick in this algorithm only when things start going past a > certain threshold. This isn't too bad because: 1. The fast path is lockless using RCU. 2. The number of locks exceeds the number of CPUs by some insane amount. 3. The cost of performing GC is really cheap, it's just a matter of calling rt_may_expire. Anyway, I agree that all of these ideas are simply fantasy until we have some code. So let me work on that and then we can let the benchmarks do the talking :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 9:21 ` Eric Dumazet 2005-04-02 11:23 ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu @ 2005-04-02 13:48 ` Robert Olsson 2005-04-02 14:10 ` Eric Dumazet 2005-04-02 19:32 ` Herbert Xu 1 sibling, 2 replies; 44+ messages in thread From: Robert Olsson @ 2005-04-02 13:48 UTC (permalink / raw) To: Eric Dumazet; +Cc: Herbert Xu, davem, netdev, Robert.Olsson Eric Dumazet writes: > > This patch is doing too many things. How about splitting it up? > > > > For instance the spin lock stuff is pretty straightforward and > > should be in its own patch. Yes a good idea so it can be tested separatly.... > > The benefits of the GC changes are not obvious to me. rt_check_expire > > is simply meant to kill off old entries. It's not really meant to be > > used to free up entries when the table gets full. Agree with Herbert... > entries| in_hit|in_slow_|in_slow_|in_no_ro| in_brd|in_marti|in_marti| > out_hit|out_slow|out_slow|gc_total|gc_ignor|gc_goal_|gc_dst_o|in_hlist|out_hlis| > | | tot| mc| ute| | an_dst| an_src| | _tot| _mc| | ed| miss| verflow| > _search|t_search| > 2618087| 28581| 7673| 0| 0| 0| 0| 0| 1800| 1450| 0| 0| 0| 0| 0| > Without serious tuning, this machine could not handle this load, or even half of it. Yes thats a pretty much load. Very short flows some reason? What's your ip_rt_gc_min_interval? GC should be allowed to run frequent to smoothen out the GC load. Also good idea to decrease gc_thresh and you hash is really huge. > Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die. A good idea to increase the secret_interval interval but it should survive. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 13:48 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson @ 2005-04-02 14:10 ` Eric Dumazet 2005-04-02 14:46 ` Robert Olsson 2005-04-02 20:47 ` jamal 2005-04-02 19:32 ` Herbert Xu 1 sibling, 2 replies; 44+ messages in thread From: Eric Dumazet @ 2005-04-02 14:10 UTC (permalink / raw) To: Robert Olsson; +Cc: Herbert Xu, davem, netdev Robert Olsson a écrit : > Eric Dumazet writes: > Yes thats a pretty much load. Very short flows some reason? Well... yes. This is a real server, not a DOS simulation. 1 million TCP flows, and about 3 million peers using UDP frames. > What's your ip_rt_gc_min_interval? GC should be allowed to > run frequent to smoothen out the GC load. Also good idea > to decrease gc_thresh and you hash is really huge. No. As soon as I lower gc_thresh (and let gc running), the machine starts to drop connections and crash some seconds later. I found I had to make the hash table very large (but lowering elasticity, ie chain length) . It needs lot of ram, but at least CPU usage of net/ipv4/route.c is close to 0. # grep . /proc/sys/net/ipv4/route/* /proc/sys/net/ipv4/route/error_burst:5000 /proc/sys/net/ipv4/route/error_cost:1000 /proc/sys/net/ipv4/route/gc_elasticity:2 /proc/sys/net/ipv4/route/gc_interval:1 /proc/sys/net/ipv4/route/gc_min_interval:0 /proc/sys/net/ipv4/route/gc_min_interval_ms:500 /proc/sys/net/ipv4/route/gc_thresh:2900000 /proc/sys/net/ipv4/route/gc_timeout:155 /proc/sys/net/ipv4/route/max_delay:10 /proc/sys/net/ipv4/route/max_size:16777216 /proc/sys/net/ipv4/route/min_adv_mss:256 /proc/sys/net/ipv4/route/min_delay:2 /proc/sys/net/ipv4/route/min_pmtu:552 /proc/sys/net/ipv4/route/mtu_expires:600 /proc/sys/net/ipv4/route/redirect_load:20 /proc/sys/net/ipv4/route/redirect_number:9 /proc/sys/net/ipv4/route/redirect_silence:20480 /proc/sys/net/ipv4/route/secret_interval:36000 ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 14:10 ` Eric Dumazet @ 2005-04-02 14:46 ` Robert Olsson 2005-04-02 20:47 ` jamal 1 sibling, 0 replies; 44+ messages in thread From: Robert Olsson @ 2005-04-02 14:46 UTC (permalink / raw) To: Eric Dumazet; +Cc: Robert Olsson, Herbert Xu, davem, netdev Eric Dumazet writes: > Well... yes. This is a real server, not a DOS simulation. > 1 million TCP flows, and about 3 million peers using UDP frames. I see. > > What's your ip_rt_gc_min_interval? GC should be allowed to > > run frequent to smoothen out the GC load. Also good idea > > to decrease gc_thresh and you hash is really huge. > No. As soon as I lower gc_thresh (and let gc running), the machine starts to drop connections and crash some seconds later. > I found I had to make the hash table very large (but lowering elasticity, ie chain length) . > It needs lot of ram, but at least CPU usage of net/ipv4/route.c is close to 0. OK! Not so bad. Most of your GC likely happens in rt_intern_hash chain pruning. This way you keep hash-chains short and get "datadriven" GC. But there must be bugs causing the crash... Maybe there should be an explicit control of hash lengths not via elasticity but adding even more tuning knobs hurts. :) --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 14:10 ` Eric Dumazet 2005-04-02 14:46 ` Robert Olsson @ 2005-04-02 20:47 ` jamal 1 sibling, 0 replies; 44+ messages in thread From: jamal @ 2005-04-02 20:47 UTC (permalink / raw) To: Eric Dumazet; +Cc: Robert Olsson, Herbert Xu, David S. Miller, netdev On Sat, 2005-04-02 at 09:10, Eric Dumazet wrote: > Robert Olsson a écrit : > > Eric Dumazet writes: > > > Yes thats a pretty much load. Very short flows some reason? > > Well... yes. This is a real server, not a DOS simulation. > 1 million TCP flows, and about 3 million peers using UDP frames. SMP? How many processors? cheers, jamal ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 13:48 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson 2005-04-02 14:10 ` Eric Dumazet @ 2005-04-02 19:32 ` Herbert Xu 2005-04-02 19:55 ` David S. Miller 2005-04-03 19:36 ` Robert Olsson 1 sibling, 2 replies; 44+ messages in thread From: Herbert Xu @ 2005-04-02 19:32 UTC (permalink / raw) To: Robert Olsson; +Cc: Eric Dumazet, davem, netdev On Sat, Apr 02, 2005 at 03:48:32PM +0200, Robert Olsson wrote: > > > Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die. > > A good idea to increase the secret_interval interval but it should survive. Incidentally we should change the way the rehashing is triggered. Instead of doing it regularly, we can do it when we notice that a specific hash chain grows beyond a certain size. The idea is that if someone is attacking our hash then they can only do so by lengthening the chains. If they're not doing that then even if they knew how to attack us we don't really care. Of course when it does happen it'll still kill your machine unless we can find a way to amortise this. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 19:32 ` Herbert Xu @ 2005-04-02 19:55 ` David S. Miller 2005-04-03 7:43 ` Herbert Xu 2005-04-03 19:36 ` Robert Olsson 1 sibling, 1 reply; 44+ messages in thread From: David S. Miller @ 2005-04-02 19:55 UTC (permalink / raw) To: Herbert Xu; +Cc: Robert.Olsson, dada1, netdev On Sun, 3 Apr 2005 05:32:24 +1000 Herbert Xu <herbert@gondor.apana.org.au> wrote: > On Sat, Apr 02, 2005 at 03:48:32PM +0200, Robert Olsson wrote: > > > > > Crashes usually occurs when secret_interval interval is elapsed : rt_cache_flush(0); is called, and the whole machine begins to die. > > > > A good idea to increase the secret_interval interval but it should survive. > > Incidentally we should change the way the rehashing is triggered. > Instead of doing it regularly, we can do it when we notice that a > specific hash chain grows beyond a certain size. > > The idea is that if someone is attacking our hash then they can > only do so by lengthening the chains. If they're not doing that > then even if they knew how to attack us we don't really care. Yes, the secret_interval is way too short. It is a very paranoid default value selected when initially fixing that DoS. I think we should, in the short term, increase the secret interval where it exists in the tree (netfilter conntrack is another instance for example). ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 19:55 ` David S. Miller @ 2005-04-03 7:43 ` Herbert Xu 2005-04-03 19:57 ` Robert Olsson 0 siblings, 1 reply; 44+ messages in thread From: Herbert Xu @ 2005-04-03 7:43 UTC (permalink / raw) To: David S. Miller; +Cc: Robert.Olsson, dada1, netdev On Sat, Apr 02, 2005 at 11:55:28AM -0800, David S. Miller wrote: > > I think we should, in the short term, increase the secret interval > where it exists in the tree (netfilter conntrack is another instance > for example). We could also move rt_cache_flush into a kernel thread. When the number of chains is large this function is really expensive for a softirq handler. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-03 7:43 ` Herbert Xu @ 2005-04-03 19:57 ` Robert Olsson 2005-04-03 21:45 ` Herbert Xu 0 siblings, 1 reply; 44+ messages in thread From: Robert Olsson @ 2005-04-03 19:57 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, Robert.Olsson, dada1, netdev Herbert Xu writes: > We could also move rt_cache_flush into a kernel thread. When the > number of chains is large this function is really expensive for a > softirq handler. It can also be done via /proc and left to administrators to find suitable policy. Kernel just provides the mechanism to rehash. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-03 19:57 ` Robert Olsson @ 2005-04-03 21:45 ` Herbert Xu 2005-04-04 10:27 ` Robert Olsson 0 siblings, 1 reply; 44+ messages in thread From: Herbert Xu @ 2005-04-03 21:45 UTC (permalink / raw) To: Robert Olsson; +Cc: David S. Miller, dada1, netdev On Sun, Apr 03, 2005 at 09:57:08PM +0200, Robert Olsson wrote: > > Herbert Xu writes: > > > We could also move rt_cache_flush into a kernel thread. When the > > number of chains is large this function is really expensive for a > > softirq handler. > > It can also be done via /proc and left to administrators to find > suitable policy. Kernel just provides the mechanism to rehash. The reason I'm suggesting the move to a kernel thread is because softirq context is not preemptible. So doing a large amount of work in it when your table is big means that a UP machine will freeze for a while. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-03 21:45 ` Herbert Xu @ 2005-04-04 10:27 ` Robert Olsson 2005-04-04 10:38 ` Herbert Xu 0 siblings, 1 reply; 44+ messages in thread From: Robert Olsson @ 2005-04-04 10:27 UTC (permalink / raw) To: Herbert Xu; +Cc: Robert Olsson, David S. Miller, dada1, netdev Herbert Xu writes: > The reason I'm suggesting the move to a kernel thread is because > softirq context is not preemptible. > > So doing a large amount of work in it when your table is big means > that a UP machine will freeze for a while. The flush transient will happen also on UP... as I understand this When we have changed the rt_hash_rnd and therefore invalidated all current entries it would be best to blackhole *all* traffic until all old entries are deleted this to avoid transients. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-04 10:27 ` Robert Olsson @ 2005-04-04 10:38 ` Herbert Xu 2005-04-04 12:29 ` Robert Olsson 0 siblings, 1 reply; 44+ messages in thread From: Herbert Xu @ 2005-04-04 10:38 UTC (permalink / raw) To: Robert Olsson; +Cc: David S. Miller, dada1, netdev On Mon, Apr 04, 2005 at 12:27:43PM +0200, Robert Olsson wrote: > > The flush transient will happen also on UP... as I understand this > When we have changed the rt_hash_rnd and therefore invalidated all current > entries it would be best to blackhole *all* traffic until all old entries > are deleted this to avoid transients. That's nasty because if you have a large cache like Eric, then you'll be dropping packets for quite a while :) Actually, what's so bad about seeing transients? One cost that I can see is that you'll be walking a chain only to conclude that none of the entries might match. But this is pretty cheap as long as we keep the chain lengths short. The other cost is that we might be creating an entry that gets flushed straight away. However, that's no worse than not using the cache at all since in that case we'll be creating one entry for each packet anyway. Both of these can be avoided too if we really cared. All we need is one bit per chain that indicated whether it's been flushed. So when ip_route_* hits a chain that hasn't been flushed, it could 1) Skip the lookup step. 2) Create the rt entry as usual. 3) Flush the chain while we insert the entry and set the bit. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-04 10:38 ` Herbert Xu @ 2005-04-04 12:29 ` Robert Olsson 0 siblings, 0 replies; 44+ messages in thread From: Robert Olsson @ 2005-04-04 12:29 UTC (permalink / raw) To: Herbert Xu; +Cc: Robert Olsson, David S. Miller, dada1, netdev Herbert Xu writes: > That's nasty because if you have a large cache like Eric, then you'll > be dropping packets for quite a while :) > > Actually, what's so bad about seeing transients? One cost that > I can see is that you'll be walking a chain only to conclude that > none of the entries might match. But this is pretty cheap as long as > we keep the chain lengths short. > > The other cost is that we might be creating an entry that gets flushed > straight away. However, that's no worse than not using the cache at > all since in that case we'll be creating one entry for each packet > anyway. Maybe you're right and systems seem to survive. But the transient period should be as short as possible. > Both of these can be avoided too if we really cared. All we need > is one bit per chain that indicated whether it's been flushed. So > when ip_route_* hits a chain that hasn't been flushed, it could > > 1) Skip the lookup step. > 2) Create the rt entry as usual. > 3) Flush the chain while we insert the entry and set the bit. Yes better was thinking of something like this too. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-02 19:32 ` Herbert Xu 2005-04-02 19:55 ` David S. Miller @ 2005-04-03 19:36 ` Robert Olsson 2005-04-03 21:43 ` Herbert Xu 1 sibling, 1 reply; 44+ messages in thread From: Robert Olsson @ 2005-04-03 19:36 UTC (permalink / raw) To: Herbert Xu; +Cc: Robert Olsson, Eric Dumazet, davem, netdev Herbert Xu writes: > Incidentally we should change the way the rehashing is triggered. > Instead of doing it regularly, we can do it when we notice that a > specific hash chain grows beyond a certain size. > The idea is that if someone is attacking our hash then they can > only do so by lengthening the chains. If they're not doing that > then even if they knew how to attack us we don't really care. Well I don't see how we detect the need for rehash just be looking at the hash chains. How does the the "lengthening" look like that are allowed to trigger a rehash? Agree with Dave that we can increase the interval to start with. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-03 19:36 ` Robert Olsson @ 2005-04-03 21:43 ` Herbert Xu 2005-04-04 10:38 ` Robert Olsson 0 siblings, 1 reply; 44+ messages in thread From: Herbert Xu @ 2005-04-03 21:43 UTC (permalink / raw) To: Robert Olsson; +Cc: Eric Dumazet, davem, netdev On Sun, Apr 03, 2005 at 09:36:52PM +0200, Robert Olsson wrote: > > Well I don't see how we detect the need for rehash just be looking > at the hash chains. How does the the "lengthening" look like that > are allowed to trigger a rehash? The only way to attack a hash is by exploiting collisions and create one or more excessively long chains. This can be detected as follows at each rt hash insertion. If (total number of entries in cache >> (hash length - user defined length)) < current bucket length is true, then we schedule a rehash/flush. Hash length is the number of bits in the hash, i.e., 1 << hash length == number of buckets I'd suggest a default shift length of 3. That is, if any individual chain is growing beyond 8 times the average chain length then we've got a problem. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-03 21:43 ` Herbert Xu @ 2005-04-04 10:38 ` Robert Olsson 2005-04-04 10:48 ` Herbert Xu 0 siblings, 1 reply; 44+ messages in thread From: Robert Olsson @ 2005-04-04 10:38 UTC (permalink / raw) To: Herbert Xu; +Cc: Robert Olsson, Eric Dumazet, davem, netdev Herbert Xu writes: > The only way to attack a hash is by exploiting collisions and > create one or more excessively long chains. > > This can be detected as follows at each rt hash insertion. If > > (total number of entries in cache >> (hash length - user defined length)) < > current bucket length > > is true, then we schedule a rehash/flush. > > Hash length is the number of bits in the hash, i.e., > > 1 << hash length == number of buckets > > I'd suggest a default shift length of 3. That is, if any individual > chain is growing beyond 8 times the average chain length then we've > got a problem. This is likely to happen in rt_intern_hash? I don't see how this can get along with chain-pruning there? IMO the thoughts of extending in-flow GC etc are interesting and can hopefully give us more robust performance. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-04 10:38 ` Robert Olsson @ 2005-04-04 10:48 ` Herbert Xu 2005-04-04 13:17 ` Robert Olsson 0 siblings, 1 reply; 44+ messages in thread From: Herbert Xu @ 2005-04-04 10:48 UTC (permalink / raw) To: Robert Olsson; +Cc: Eric Dumazet, davem, netdev On Mon, Apr 04, 2005 at 12:38:03PM +0200, Robert Olsson wrote: > > This is likely to happen in rt_intern_hash? I don't see how this can > get along with chain-pruning there? What I'm trying to catch is the case when you've got x number of entries in the table and a large fraction of them are all in one chain. This does not conflict with the goal of keeping the chains short. Even if you strictly allow only 8 entries per chain, it's trivial to exceed 8 times the average chain length. Remember the average chain length can be fractions like 0.1. Of course we need to set a minimum value that the chain needs to grow beyond before this check kicks in. > IMO the thoughts of extending in-flow GC etc are interesting and can > hopefully give us more robust performance. Indeed, it looks like Alexey has already put the code there. It just needs to be made more strict :) It needs to free entries even if they are in use. After all, freeing an entry in use can't be much worse than not having a cache at all. OTOH, having a very long chain is definitely much worse than not having a cache :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [BUG] overflow in net/ipv4/route.c rt_check_expire() 2005-04-04 10:48 ` Herbert Xu @ 2005-04-04 13:17 ` Robert Olsson 0 siblings, 0 replies; 44+ messages in thread From: Robert Olsson @ 2005-04-04 13:17 UTC (permalink / raw) To: Herbert Xu; +Cc: Robert Olsson, Eric Dumazet, davem, netdev Herbert Xu writes: > What I'm trying to catch is the case when you've got x number of > entries in the table and a large fraction of them are all in one > chain. > > This does not conflict with the goal of keeping the chains short. > > Even if you strictly allow only 8 entries per chain, it's trivial > to exceed 8 times the average chain length. OK! Since deletions don't happen instantly.. Try some code it can print a warning to start with. > > IMO the thoughts of extending in-flow GC etc are interesting and can > > hopefully give us more robust performance. > > Indeed, it looks like Alexey has already put the code there. It just > needs to be made more strict :) It needs to free entries even if they > are in use. > > After all, freeing an entry in use can't be much worse than not having > a cache at all. OTOH, having a very long chain is definitely much worse > than not having a cache :) FYI I'm experimenting with "new" routing algo that does 24-bit ipv4 lookup and routing without route hash to see if we can come close to route hash performance This needs memory. :) IP: FIB routing table of 16777216 buckets, 65536Kbytes for table id=255 Needs some more work before testing. --ro ^ permalink raw reply [flat|nested] 44+ messages in thread
end of thread, other threads:[~2005-04-04 13:17 UTC | newest] Thread overview: 44+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2005-03-15 16:13 [PATCH] no more rwlock_t inside tcp_ehash_bucket Eric Dumazet 2005-03-15 18:32 ` David S. Miller 2005-03-16 10:47 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet 2005-03-16 18:05 ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet 2005-03-16 22:10 ` David S. Miller 2005-03-16 22:09 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller 2005-03-17 19:52 ` Eric Dumazet 2005-04-01 6:13 ` David S. Miller 2005-04-01 14:39 ` Eric Dumazet 2005-04-01 15:53 ` Robert Olsson 2005-04-01 16:34 ` Eric Dumazet 2005-04-01 17:26 ` Robert Olsson 2005-04-01 20:28 ` David S. Miller 2005-04-01 21:05 ` Eric Dumazet 2005-04-01 21:08 ` David S. Miller 2005-04-01 21:43 ` Eric Dumazet 2005-04-01 22:34 ` David S. Miller 2005-04-01 23:21 ` Eric Dumazet 2005-04-01 23:54 ` David S. Miller 2005-04-02 8:21 ` Herbert Xu 2005-04-02 9:21 ` Eric Dumazet 2005-04-02 11:23 ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu 2005-04-02 13:58 ` Eric Dumazet 2005-04-02 14:03 ` Robert Olsson 2005-04-02 21:05 ` jamal 2005-04-03 7:38 ` Herbert Xu 2005-04-03 7:41 ` Herbert Xu 2005-04-02 13:48 ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson 2005-04-02 14:10 ` Eric Dumazet 2005-04-02 14:46 ` Robert Olsson 2005-04-02 20:47 ` jamal 2005-04-02 19:32 ` Herbert Xu 2005-04-02 19:55 ` David S. Miller 2005-04-03 7:43 ` Herbert Xu 2005-04-03 19:57 ` Robert Olsson 2005-04-03 21:45 ` Herbert Xu 2005-04-04 10:27 ` Robert Olsson 2005-04-04 10:38 ` Herbert Xu 2005-04-04 12:29 ` Robert Olsson 2005-04-03 19:36 ` Robert Olsson 2005-04-03 21:43 ` Herbert Xu 2005-04-04 10:38 ` Robert Olsson 2005-04-04 10:48 ` Herbert Xu 2005-04-04 13:17 ` Robert Olsson
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).