[PATCH 2/2] [IPV4] route: Dynamic hash table sizing.

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: David Miller <davem@davemloft.net>
To: netdev@vger.kernel.org
Subject: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
Date: Wed, 09 Aug 2006 00:53:37 -0700 (PDT)	[thread overview]
Message-ID: <20060809.005337.104642177.davem@davemloft.net> (raw)
In-Reply-To: <20060809.004920.59469729.davem@davemloft.net>


[IPV4] route: Dynamic hash table sizing.

The algorithm is stupid; this changeset is about infrastructure.

Currently it starts at 16 entries (or whatever rhash_entries was
specified as), and allows growing up to 8MB.

The code can handle both growing and shrinking just fine; the only
tweaks necessary are to the rthash_new_size() function and the places
where rtcache_work is scheduled.

hashdist is now used at run-time so we need to drop the __initdata
tag.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 mm/page_alloc.c  |    2 -
 net/ipv4/route.c |  179 +++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 158 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f53..3b5358a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_hand
 	return 0;
 }
 
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a7b4ca2..897e67c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -94,6 +94,9 @@ #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/seqlock.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -120,6 +123,7 @@ #define RT_GC_TIMEOUT (300*HZ)
 static int ip_rt_min_delay		= 2 * HZ;
 static int ip_rt_max_delay		= 10 * HZ;
 static int ip_rt_max_size;
+static int ip_rt_hashsz_limit		= (8 * 1024 * 1024) / sizeof(void *);
 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
 static int ip_rt_gc_interval		= 60 * HZ;
 static int ip_rt_gc_min_interval	= HZ / 2;
@@ -308,6 +312,135 @@ static void rt_hash_snapshot(struct rt_h
 	} while (read_seqretry(&rt_hash_seq, seq));
 }
 
+static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
+{
+	struct rt_hash_bucket *n;
+
+	if (sz <= PAGE_SIZE)
+		n = kmalloc(sz, GFP_KERNEL);
+	else if (hashdist)
+		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+	else
+		n = (struct rt_hash_bucket *)
+			__get_free_pages(GFP_KERNEL, get_order(sz));
+
+	if (n)
+		memset(n, 0, sz);
+
+	return n;
+}
+
+static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
+{
+	if (sz <= PAGE_SIZE)
+		kfree(r);
+	else if (hashdist)
+		vfree(r);
+	else
+		free_pages((unsigned long)r, get_order(sz));
+}
+
+/* Rehash each entry of @list into @new_table using mask @nhashmask. */
+static void rtcache_transfer(struct rtable *list, struct rt_hash_bucket *new_table, unsigned int nhashmask)
+{
+	while (list) {
+		struct rtable *next = list->u.rt_next;
+		struct rt_hash_bucket *ent;
+		int iface = list->fl.iif;
+		unsigned int hash;
+
+		if (!iface)
+			iface = list->fl.oif;
+		/* XOR mixes the interface into saddr; AND would mask saddr away. */
+		hash = __rt_hash_code(list->fl.fl4_dst,
+				      list->fl.fl4_src ^ (iface << 5), nhashmask);
+		ent = &new_table[hash];
+		list->u.rt_next = ent->chain;
+		ent->chain = list;
+
+		list = next;
+	}
+}
+
+static unsigned long rthash_new_size(void)
+{
+	return ((__rt_hash_mask + 1) << 1) *
+		sizeof(struct rt_hash_bucket);
+}
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+	multipath_remove(rt);
+	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void rtcache_resize(void *__unused)
+{
+	struct rt_hash_bucket *new, *old;
+	unsigned long nsize;
+	unsigned int nhashmask, ohashmask;
+	int i;
+
+	mutex_lock(&hash_resize_mutex);
+
+	nsize = rthash_new_size();
+	new = rthash_alloc(nsize);
+	if (!new)
+		goto out_unlock;
+
+	write_seqlock_bh(&rt_hash_seq);
+
+	nhashmask = (nsize / sizeof(struct rt_hash_bucket)) - 1U;
+	for (i = __rt_hash_mask; i >= 0; i--) {
+		struct rtable *rth;
+
+		spin_lock_bh(rt_hash_lock_addr(i));
+		rth = __rt_hash_table[i].chain;
+		if (rth)
+			__rt_hash_table[i].chain = NULL;
+		spin_unlock_bh(rt_hash_lock_addr(i));
+
+		rtcache_transfer(rth, new, nhashmask);
+	}
+
+	old = __rt_hash_table;
+	ohashmask = __rt_hash_mask;
+
+	__rt_hash_table = new;
+	__rt_hash_mask = nhashmask;
+
+	/* XXX Do something more intelligent with these things.  */
+	ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+	ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+
+	write_sequnlock_bh(&rt_hash_seq);
+
+	synchronize_net();
+
+	/* It is possible that some entries got hashed into the old
+	 * table, free any such remnants.  No locking is necessary on
+	 * the chains as this table is no longer viewable by other
+	 * processors.
+	 */
+	for (i = ohashmask; i >= 0; i--) {
+		struct rtable *rth, *next;
+
+		for (rth = old[i].chain; rth; rth = next) {
+			next = rth->u.rt_next;
+			rt_free(rth);
+		}
+	}
+
+	rthash_free(old, (ohashmask + 1) * sizeof(struct rt_hash_bucket));
+
+out_unlock:
+	mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(rtcache_work, rtcache_resize, NULL);
+
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
 	struct rt_hash_bucket *table;
@@ -540,12 +673,6 @@ static struct file_operations rt_cpu_seq
 
 #endif /* CONFIG_PROC_FS */
   
-static __inline__ void rt_free(struct rtable *rt)
-{
-	multipath_remove(rt);
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
 static __inline__ void rt_drop(struct rtable *rt)
 {
 	multipath_remove(rt);
@@ -676,7 +803,7 @@ static void rt_check_expire(unsigned lon
 
 	rt_hash_snapshot(&table, &hmask);
 
-	mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
+	mult = ((u64)ip_rt_gc_interval) * (hmask + 1);
 	if (ip_rt_gc_timeout > 1)
 		do_div(mult, ip_rt_gc_timeout);
 	goal = (unsigned int)mult;
@@ -857,7 +984,7 @@ static int rt_garbage_collect(void)
 
 	/* Calculate number of entries, which we want to expire now. */
 	goal = atomic_read(&ipv4_dst_ops.entries) -
-		(ip_rt_gc_elasticity << long_log2(hmask + 1));
+		(ip_rt_gc_elasticity * (int)(hmask + 1));
 	if (goal <= 0) {
 		if (equilibrium < ipv4_dst_ops.gc_thresh)
 			equilibrium = ipv4_dst_ops.gc_thresh;
@@ -971,7 +1098,11 @@ #if RT_CACHE_DEBUG >= 2
 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
 #endif
-out:	return 0;
+out:
+	if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
+	    (hmask + 1) < ip_rt_hashsz_limit)
+		schedule_work(&rtcache_work);
+	return 0;
 }
 
 static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
@@ -3201,15 +3332,23 @@ #endif /* CONFIG_NET_CLS_ROUTE */
 static __initdata unsigned long rhash_entries;
 static int __init set_rhash_entries(char *str)
 {
+	unsigned long val;
+
 	if (!str)
 		return 0;
-	rhash_entries = simple_strtoul(str, &str, 0);
+	val = simple_strtoul(str, &str, 0);
+
+	/* Only use it if it's a power-of-2. */
+	if (!(val & (val - 1)))
+		rhash_entries = val;
+
 	return 1;
 }
 __setup("rhash_entries=", set_rhash_entries);
 
 int __init ip_rt_init(void)
 {
+	unsigned long sz;
 	int rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
@@ -3236,22 +3375,18 @@ #endif
 	if (!ipv4_dst_ops.kmem_cachep)
 		panic("IP: failed to allocate ip_dst_cache\n");
 
-	__rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(num_physpages >= 128 * 1024) ?
-					15 : 17,
-					HASH_HIGHMEM,
-					NULL,
-					&__rt_hash_mask,
-					0);
-	memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
+	sz = (rhash_entries ? rhash_entries : 16);
+	sz *= sizeof(struct rt_hash_bucket);
 
+	__rt_hash_table = rthash_alloc(sz);
+	if (!__rt_hash_table)
+		panic("IP: failed to allocate routing cache hash table");
+	__rt_hash_mask = (sz / sizeof(struct rt_hash_bucket)) - 1;
 	ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
 	ip_rt_max_size = (__rt_hash_mask + 1) * 16;
 
+	rt_hash_lock_init();
+
 	devinet_init();
 	ip_fib_init();
 
-- 
1.4.2.rc2.g3e042

next prev parent reply	other threads:[~2006-08-09  7:54 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-09  7:49 [PATCHSET]: Dymamic route cache hash sizing David Miller
2006-08-09  7:53 ` [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing David Miller
2006-08-09 11:31   ` Herbert Xu
2006-08-09 23:05     ` David Miller
2006-08-09  7:53 ` David Miller [this message]
2006-08-09  8:32   ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing Eric Dumazet
2006-08-09  9:09     ` David Miller
2006-08-09 10:09   ` Eric Dumazet
2006-08-09 10:12     ` Michael Tokarev
2006-08-09 10:18       ` Evgeniy Polyakov
2006-08-09 11:14     ` David Miller
2006-08-15  8:35       ` David Miller

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:54a4f53 dfblob:3b5358a dfblob:a7b4ca2 dfblob:897e67c )
 OR (
bs:"[PATCH 2/2] [IPV4] route: Dynamic hash table sizing." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060809.005337.104642177.davem@davemloft.net \
    --to=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).