From: David Miller <davem@davemloft.net>
To: netdev@vger.kernel.org
Subject: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
Date: Wed, 09 Aug 2006 00:53:37 -0700 (PDT)
Message-ID: <20060809.005337.104642177.davem@davemloft.net>
In-Reply-To: <20060809.004920.59469729.davem@davemloft.net>
[IPV4] route: Dynamic hash table sizing.
The growth algorithm is deliberately stupid for now; this changeset is about the infrastructure.

Currently the table starts at 16 entries (or whatever rhash_entries was specified as) and is allowed to grow up to 8MB.

The code can handle both growing and shrinking just fine; the only tweaks necessary are to the rthash_new_size() function and to the places where rtcache_work is scheduled.
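For illustration only (this is not part of the patch), here is a small
userspace sketch of the policy those two hooks implement right now:
start at 16 buckets, double whenever the entry count catches up with
the bucket count, and stop once the table would exceed 8MB worth of
buckets.  The helper names and constants below are invented for the
example.

#include <stdio.h>

#define START_BUCKETS	16UL
#define HASHSZ_LIMIT	((8UL * 1024 * 1024) / sizeof(void *))

/* Mirrors rthash_new_size(): always double the current table. */
static unsigned long next_size(unsigned long buckets)
{
	return buckets << 1;
}

/* Mirrors the check added at the end of rt_garbage_collect(). */
static int should_grow(unsigned long entries, unsigned long buckets)
{
	return entries >= buckets && buckets < HASHSZ_LIMIT;
}

int main(void)
{
	unsigned long buckets = START_BUCKETS;
	unsigned long entries;

	for (entries = 16; entries <= 2 * HASHSZ_LIMIT; entries <<= 1) {
		while (should_grow(entries, buckets))
			buckets = next_size(buckets);
		printf("%8lu entries -> %8lu buckets\n", entries, buckets);
	}
	return 0;
}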
hashdist is now consulted at run-time, so we need to drop its __initdata tag.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
mm/page_alloc.c | 2 -
net/ipv4/route.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 158 insertions(+), 23 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f53..3b5358a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_hand
return 0;
}
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a7b4ca2..897e67c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -94,6 +94,9 @@ #include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/seqlock.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -120,6 +123,7 @@ #define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_min_delay = 2 * HZ;
static int ip_rt_max_delay = 10 * HZ;
static int ip_rt_max_size;
+static int ip_rt_hashsz_limit = (8 * 1024 * 1024) / sizeof(void *);
static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
static int ip_rt_gc_interval = 60 * HZ;
static int ip_rt_gc_min_interval = HZ / 2;
@@ -308,6 +312,135 @@ static void rt_hash_snapshot(struct rt_h
} while (read_seqretry(&rt_hash_seq, seq));
}
+static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
+{
+ struct rt_hash_bucket *n;
+
+ if (sz <= PAGE_SIZE)
+ n = kmalloc(sz, GFP_KERNEL);
+ else if (hashdist)
+ n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+ else
+ n = (struct rt_hash_bucket *)
+ __get_free_pages(GFP_KERNEL, get_order(sz));
+
+ if (n)
+ memset(n, 0, sz);
+
+ return n;
+}
+
+static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
+{
+ if (sz <= PAGE_SIZE)
+ kfree(r);
+ else if (hashdist)
+ vfree(r);
+ else
+ free_pages((unsigned long)r, get_order(sz));
+}
+
+static void rtcache_transfer(struct rtable *list, struct rt_hash_bucket *new_table, unsigned int nhashmask)
+{
+ while (list) {
+ struct rtable *next = list->u.rt_next;
+ struct rt_hash_bucket *ent;
+ int iface = list->fl.iif;
+ unsigned int hash;
+
+ if (!iface)
+ iface = list->fl.oif;
+ hash = __rt_hash_code(list->fl.fl4_dst,
+ list->fl.fl4_src ^
+ (iface << 5),
+ nhashmask);
+ ent = &new_table[hash];
+ list->u.rt_next = ent->chain;
+ ent->chain = list;
+
+ list = next;
+ }
+}
+
+static unsigned long rthash_new_size(void)
+{
+ return ((__rt_hash_mask + 1) << 1) *
+ sizeof(struct rt_hash_bucket);
+}
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+ multipath_remove(rt);
+ call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void rtcache_resize(void *__unused)
+{
+ struct rt_hash_bucket *new, *old;
+ unsigned long nsize;
+ unsigned int nhashmask, ohashmask;
+ int i;
+
+ mutex_lock(&hash_resize_mutex);
+
+ nsize = rthash_new_size();
+ new = rthash_alloc(nsize);
+ if (!new)
+ goto out_unlock;
+
+ write_seqlock_bh(&rt_hash_seq);
+
+ nhashmask = (nsize / sizeof(struct rt_hash_bucket)) - 1U;
+ for (i = __rt_hash_mask; i >= 0; i--) {
+ struct rtable *rth;
+
+ spin_lock_bh(rt_hash_lock_addr(i));
+ rth = __rt_hash_table[i].chain;
+ if (rth)
+ __rt_hash_table[i].chain = NULL;
+ spin_unlock_bh(rt_hash_lock_addr(i));
+
+ rtcache_transfer(rth, new, nhashmask);
+ }
+
+ old = __rt_hash_table;
+ ohashmask = __rt_hash_mask;
+
+ __rt_hash_table = new;
+ __rt_hash_mask = nhashmask;
+
+ /* XXX Do something more intelligent with these things. */
+ ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+ ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+
+ write_sequnlock_bh(&rt_hash_seq);
+
+ synchronize_net();
+
+ /* It is possible that some entries got hashed into the old
+ * table, free any such remnants. No locking is necessary on
+ * the chains as this table is no longer viewable by other
+ * processors.
+ */
+ for (i = ohashmask; i >= 0; i--) {
+ struct rtable *rth, *next;
+
+ for (rth = old[i].chain; rth; rth = next) {
+ next = rth->u.rt_next;
+ rt_free(rth);
+ }
+ }
+
+ rthash_free(old, (ohashmask + 1) * sizeof(struct rt_hash_bucket));
+
+out_unlock:
+ mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(rtcache_work, rtcache_resize, NULL);
+
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
struct rt_hash_bucket *table;
@@ -540,12 +673,6 @@ static struct file_operations rt_cpu_seq
#endif /* CONFIG_PROC_FS */
-static __inline__ void rt_free(struct rtable *rt)
-{
- multipath_remove(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
static __inline__ void rt_drop(struct rtable *rt)
{
multipath_remove(rt);
@@ -676,7 +803,7 @@ static void rt_check_expire(unsigned lon
rt_hash_snapshot(&table, &hmask);
- mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
+ mult = ((u64)ip_rt_gc_interval) * (u64)(hmask + 1);
if (ip_rt_gc_timeout > 1)
do_div(mult, ip_rt_gc_timeout);
goal = (unsigned int)mult;
@@ -857,7 +984,7 @@ static int rt_garbage_collect(void)
/* Calculate number of entries, which we want to expire now. */
goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << long_log2(hmask + 1));
+ (ip_rt_gc_elasticity * (hmask + 1));
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
@@ -971,7 +1098,11 @@ #if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
-out: return 0;
+out:
+ if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
+ (hmask + 1) < ip_rt_hashsz_limit)
+ schedule_work(&rtcache_work);
+ return 0;
}
static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
@@ -3201,15 +3332,23 @@ #endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
+ unsigned long val;
+
if (!str)
return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
+ val = simple_strtoul(str, &str, 0);
+
+ /* Only use it if it's a power-of-2. */
+ if (!(val & (val - 1)))
+ rhash_entries = val;
+
return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
+ unsigned long sz;
int rc = 0;
rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
@@ -3236,22 +3375,18 @@ #endif
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- __rt_hash_table = (struct rt_hash_bucket *)
- alloc_large_system_hash("IP route cache",
- sizeof(struct rt_hash_bucket),
- rhash_entries,
- (num_physpages >= 128 * 1024) ?
- 15 : 17,
- HASH_HIGHMEM,
- NULL,
- &__rt_hash_mask,
- 0);
- memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
- rt_hash_lock_init();
+ sz = (rhash_entries ? rhash_entries : 16);
+ sz *= sizeof(struct rt_hash_bucket);
+ __rt_hash_table = rthash_alloc(sz);
+ if (!__rt_hash_table)
+ panic("IP: failed to allocate routing cache hash table");
+ __rt_hash_mask = (sz / sizeof(struct rt_hash_bucket)) - 1;
ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+ rt_hash_lock_init();
+
devinet_init();
ip_fib_init();
--
1.4.2.rc2.g3e042