Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Eric Dumazet <dada1@cosmosbay.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: netdev@oss.sgi.com
Subject: Re: [BUG] overflow in net/ipv4/route.c rt_check_expire()
Date: Thu, 17 Mar 2005 20:52:44 +0100	[thread overview]
Message-ID: <4239E00C.4080309@cosmosbay.com> (raw)
In-Reply-To: <20050316140915.0f6b9528.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1147 bytes --]

David S. Miller a écrit :
> On Wed, 16 Mar 2005 11:47:34 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> 
>>The rt_check_expire() has a serious problem on machines with large
>>route caches, and a standard HZ value of 1000.
>>
>>With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ;
>>
>>the loop count :
>>
>>     for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
>>
>>
>>overflows (t is a 31 bit value) as soon rt_hash_log is >= 16  (65536
>>slots in route cache hash table)
> 
>  ...
> 
>>I am experimenting some changes that I will share when ready.
> 
> 
> Good catch, let us know when you have a patch for review.
> 
> 

Hi

This is the patch I'm currently running on several machines (with very high pressure on route cache)

If you want to comment it before I submit it with a nice looking mail :)

Description :

    - Makes rt_check_expire() do some real work, auto-adapting the timer interval (chosen between gc_interval/8 and gc_interval)

    - Move the spinlocks out of tr_hash_table[] to a fixed size table : Saves a lot of memory (particulary on UP)

Thank you

Eric Dumazet

[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 10920 bytes --]

diff -Nru linux-2.6.11/net/ipv4/route.c linux-2.6.11-ed/net/ipv4/route.c
--- linux-2.6.11/net/ipv4/route.c	2005-03-17 11:19:45.000000000 +0100
+++ linux-2.6.11-ed/net/ipv4/route.c	2005-03-17 18:33:20.000000000 +0100
@@ -54,6 +54,7 @@
  *		Marc Boucher	:	routing by fwmark
  *	Robert Olsson		:	Added rt_cache statistics
  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -107,12 +108,13 @@
 #define IP_MAX_MTU	0xFFF0
 
 #define RT_GC_TIMEOUT (300*HZ)
+#define RT_GC_INTERVAL (RT_GC_TIMEOUT/10) /* rt_check_expire() scans 1/10 of the table each round */
 
 static int ip_rt_min_delay		= 2 * HZ;
 static int ip_rt_max_delay		= 10 * HZ;
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
-static int ip_rt_gc_interval		= 60 * HZ;
+static int ip_rt_gc_interval		= RT_GC_INTERVAL;
 static int ip_rt_gc_min_interval	= HZ / 2;
 static int ip_rt_redirect_number	= 9;
 static int ip_rt_redirect_load		= HZ / 50;
@@ -124,6 +126,7 @@
 static int ip_rt_min_pmtu		= 512 + 20 + 20;
 static int ip_rt_min_advmss		= 256;
 static int ip_rt_secret_interval	= 10 * 60 * HZ;
+static int ip_rt_debug ;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
@@ -197,8 +200,24 @@
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of fixed size spinlocks
+ */
+# define RT_HASH_LOCK_SZ 256
+	static spinlock_t	rt_hash_lock[RT_HASH_LOCK_SZ];
+# define rt_hash_lock_addr(slot) &rt_hash_lock[slot & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init()	{ \
+		int i; \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			spin_lock_init(&rt_hash_lock[i]); \
+		}
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
 
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
@@ -470,7 +489,7 @@
 		rth->u.dst.expires;
 }
 
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 {
 	unsigned long age;
 	int ret = 0;
@@ -516,45 +535,84 @@
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	static unsigned int effective_interval = RT_GC_INTERVAL;
+	static unsigned int cached_gc_interval = RT_GC_INTERVAL;
+	unsigned int i = rover, t;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
+	unsigned int freed = 0 , t0;
+	u64 mult;
 
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
-		unsigned long tmo = ip_rt_gc_timeout;
-
+	if (cached_gc_interval != ip_rt_gc_interval) { /* ip_rt_gc_interval may have changed with sysctl */
+		cached_gc_interval = ip_rt_gc_interval;
+		effective_interval = cached_gc_interval;
+	}
+	/* computes the number of slots we should examin in this run */
+	mult = ((u64)effective_interval) << rt_hash_log;
+	do_div(mult, ip_rt_gc_timeout);
+	t = (unsigned int)mult;
+
+	if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/8) {
+		t <<= 1; /* be more aggressive */
+		if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/4) {
+			t <<= 1; /* be more aggressive */
+			if (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size/2) {
+				t <<= 1; /* be more aggressive */
+				now++; /* give us one more tick (time) to do our job */
+			}
+		}
+	}
+	if (t > rt_hash_mask) t = rt_hash_mask + 1;
+	t0 = t;
+	for ( ; t > 0; t -= 1) {
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
-
-		spin_lock(&rt_hash_table[i].lock);
-		while ((rth = *rthp) != NULL) {
-			if (rth->u.dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
+		if (*rthp) {
+			unsigned long tmo = ip_rt_gc_timeout;
+			spin_lock(rt_hash_lock_addr(i));
+			while ((rth = *rthp) != NULL) {
+				if (rth->u.dst.expires) {
+					/* Entry is expired even if it is in use */
+					if (time_before_eq(now, rth->u.dst.expires)) {
+						tmo >>= 1;
+						rthp = &rth->u.rt_next;
+						continue;
+					}
+				} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 					tmo >>= 1;
 					rthp = &rth->u.rt_next;
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.rt_next;
-				continue;
+	
+				/* Cleanup aged off entries. */
+				*rthp = rth->u.rt_next;
+				freed++;
+				rt_free(rth);
 			}
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->u.rt_next;
-			rt_free(rth);
+			spin_unlock(rt_hash_lock_addr(i));
 		}
-		spin_unlock(&rt_hash_table[i].lock);
-
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+	if (t) {
+		/* Not enough time to perform our job, try to adjust the timer.
+		 * Firing the timer sooner means less planned work.
+		 * We allow the timer to be 1/8 of the sysctl value.
+		 */
+		effective_interval = (effective_interval + cached_gc_interval/8)/2;
+	}
+	else {
+		/* We finished our job before time limit, try to increase the timer
+		 * The limit is the sysctl value.
+		 */
+		effective_interval = (effective_interval + cached_gc_interval)/2;
+	}
+	if (ip_rt_debug & 1)
+		printk(KERN_WARNING "rt_check_expire() : %u freed, t=%u/%u interval=%u ticks\n", freed, t, t0, effective_interval);
+	mod_timer(&rt_periodic_timer, jiffies + effective_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
@@ -570,11 +628,11 @@
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -704,7 +762,7 @@
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -715,7 +773,7 @@
 				rt_free(rth);
 				goal--;
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -792,7 +850,7 @@
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
 			/* Put it first */
@@ -813,7 +871,7 @@
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -854,7 +912,7 @@
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -895,7 +953,7 @@
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -962,7 +1020,7 @@
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -971,7 +1029,7 @@
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -2569,6 +2627,23 @@
 		.strategy	= &sysctl_jiffies,
 	},
 	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL_MS,
+		.procname	= "gc_interval_ms",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_ms_jiffies,
+		.strategy	= &sysctl_ms_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_DEBUG,
+		.procname	= "gc_debug",
+		.data		= &ip_rt_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
@@ -2718,7 +2793,7 @@
 
 int __init ip_rt_init(void)
 {
-	int i, order, goal, rc = 0;
+	int order, goal, rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
@@ -2766,14 +2841,12 @@
 	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
 		/* NOTHING */;
 
-	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
-		rt_hash_table[i].chain = NULL;
-	}
+	memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
 
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = rt_hash_mask;
+	ip_rt_max_size = rt_hash_mask * 16;
+	rt_hash_mask--;
 
 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
 	if (!rt_cache_stat)
diff -Nru linux-2.6.11/Documentation/filesystems/proc.txt linux-2.6.11-ed/Documentation/filesystems/proc.txt
--- linux-2.6.11/Documentation/filesystems/proc.txt	2005-03-17 18:42:54.000000000 +0100
+++ linux-2.6.11-ed/Documentation/filesystems/proc.txt	2005-03-17 18:42:14.000000000 +0100
@@ -1709,12 +1709,13 @@
 
 Writing to this file results in a flush of the routing cache.
 
-gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh
+gc_elasticity, gc_interval_ms, gc_min_interval_ms, gc_timeout, gc_thresh, gc_debug
 ---------------------------------------------------------------------
 
 Values to  control  the  frequency  and  behavior  of  the  garbage collection
 algorithm for the routing cache. gc_min_interval is deprecated and replaced
-by gc_min_interval_ms.
+by gc_min_interval_ms. gc_interval is deprecated and replaced by
+gc_interval_ms. gc_debug enables some printk()
 
 
 max_size
diff -Nru linux-2.6.11/include/linux/sysctl.h linux-2.6.11-ed/include/linux/sysctl.h
--- linux-2.6.11/include/linux/sysctl.h	2005-03-17 18:37:13.000000000 +0100
+++ linux-2.6.11-ed/include/linux/sysctl.h	2005-03-17 18:36:57.000000000 +0100
@@ -367,6 +367,8 @@
 	NET_IPV4_ROUTE_MIN_ADVMSS=17,
 	NET_IPV4_ROUTE_SECRET_INTERVAL=18,
 	NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19,
+	NET_IPV4_ROUTE_GC_INTERVAL_MS=20,
+	NET_IPV4_ROUTE_GC_DEBUG=21,
 };
 
 enum

next prev parent reply	other threads:[~2005-03-17 19:52 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-03-15 16:13 [PATCH] no more rwlock_t inside tcp_ehash_bucket Eric Dumazet
2005-03-15 18:32 ` David S. Miller
2005-03-16 10:47   ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Eric Dumazet
2005-03-16 18:05     ` [PATCH] reduce sizeof(struct inet_peer) from 128 to 64 bytes on 64bits architectures Eric Dumazet
2005-03-16 22:10       ` David S. Miller
2005-03-16 22:09     ` [BUG] overflow in net/ipv4/route.c rt_check_expire() David S. Miller
2005-03-17 19:52       ` Eric Dumazet [this message]
2005-04-01  6:13         ` David S. Miller
2005-04-01 14:39           ` Eric Dumazet
2005-04-01 15:53             ` Robert Olsson
2005-04-01 16:34               ` Eric Dumazet
2005-04-01 17:26                 ` Robert Olsson
2005-04-01 20:28             ` David S. Miller
2005-04-01 21:05               ` Eric Dumazet
2005-04-01 21:08                 ` David S. Miller
2005-04-01 21:43                   ` Eric Dumazet
2005-04-01 22:34                     ` David S. Miller
2005-04-01 23:21                       ` Eric Dumazet
2005-04-01 23:54                         ` David S. Miller
2005-04-02  8:21                         ` Herbert Xu
2005-04-02  9:21                           ` Eric Dumazet
2005-04-02 11:23                             ` Get rid of rt_check_expire and rt_garbage_collect Herbert Xu
2005-04-02 13:58                               ` Eric Dumazet
2005-04-02 14:03                               ` Robert Olsson
2005-04-02 21:05                               ` jamal
2005-04-03  7:38                                 ` Herbert Xu
2005-04-03  7:41                                 ` Herbert Xu
2005-04-02 13:48                             ` [BUG] overflow in net/ipv4/route.c rt_check_expire() Robert Olsson
2005-04-02 14:10                               ` Eric Dumazet
2005-04-02 14:46                                 ` Robert Olsson
2005-04-02 20:47                                 ` jamal
2005-04-02 19:32                               ` Herbert Xu
2005-04-02 19:55                                 ` David S. Miller
2005-04-03  7:43                                   ` Herbert Xu
2005-04-03 19:57                                     ` Robert Olsson
2005-04-03 21:45                                       ` Herbert Xu
2005-04-04 10:27                                         ` Robert Olsson
2005-04-04 10:38                                           ` Herbert Xu
2005-04-04 12:29                                             ` Robert Olsson
2005-04-03 19:36                                 ` Robert Olsson
2005-04-03 21:43                                   ` Herbert Xu
2005-04-04 10:38                                     ` Robert Olsson
2005-04-04 10:48                                       ` Herbert Xu
2005-04-04 13:17                                         ` Robert Olsson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4239E00C.4080309@cosmosbay.com \
    --to=dada1@cosmosbay.com \
    --cc=davem@davemloft.net \
    --cc=netdev@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.