* [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing.
2006-08-09 7:49 [PATCHSET]: Dynamic route cache hash sizing David Miller
@ 2006-08-09 7:53 ` David Miller
2006-08-09 11:31 ` Herbert Xu
2006-08-09 7:53 ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing David Miller
1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2006-08-09 7:53 UTC
To: netdev
[IPV4] route: Locking infrastructure for dynamic routing cache sizing.
The basic idea is to wrap every capture of the hash table base pointer
and hash mask inside a seqlock read sequence, so that a later resize can
publish a new table and mask atomically with respect to lookups.  The
rest of the locking remains unchanged.
Furthermore, rt_hash_table and rt_hash_mask are renamed with a
two-underscore prefix (__rt_hash_table, __rt_hash_mask) to indicate that
they must be accessed in a special way.  rt_hash_log is removed; its few
users now compute long_log2() of the snapshotted mask plus one instead.
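For readers unfamiliar with the pattern, here is a minimal sketch of the
seqlock discipline this patch introduces for the table pointer and mask
(the my_* names are placeholders for illustration, not code from the
patch):

#include <linux/seqlock.h>

static seqlock_t my_hash_seq = __SEQLOCK_UNLOCKED(my_hash_seq);
static struct rt_hash_bucket *my_table;
static unsigned int my_mask;

/* Reader: loop until a consistent (table, mask) pair has been read. */
static void my_snapshot(struct rt_hash_bucket **tp, unsigned int *mp)
{
	unsigned long seq;

	do {
		seq = read_seqbegin(&my_hash_seq);
		*tp = my_table;
		*mp = my_mask;
	} while (read_seqretry(&my_hash_seq, seq));
}

/* Writer (a future resizer): publish a new table and mask together. */
static void my_publish(struct rt_hash_bucket *new_table, unsigned int new_mask)
{
	write_seqlock_bh(&my_hash_seq);
	my_table = new_table;
	my_mask = new_mask;
	write_sequnlock_bh(&my_hash_seq);
}

Readers never block on a resize; they simply redo the two loads if the
sequence count changed underneath them, which is all rt_hash_snapshot()
and the rt_get_bucket*() helpers below do.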
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/route.c | 259 ++++++++++++++++++++++++++++++++++++------------------
1 files changed, 172 insertions(+), 87 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 19bd49d..a7b4ca2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -93,6 +93,7 @@ #include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
+#include <linux/seqlock.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -243,37 +244,86 @@ # define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
-static struct rt_hash_bucket *rt_hash_table;
-static unsigned rt_hash_mask;
-static int rt_hash_log;
-static unsigned int rt_hash_rnd;
+static seqlock_t rt_hash_seq __read_mostly =
+ __SEQLOCK_UNLOCKED(rt_hash_seq);
+static struct rt_hash_bucket *__rt_hash_table __read_mostly;
+static unsigned __rt_hash_mask __read_mostly;
+static unsigned int rt_hash_rnd __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
(__raw_get_cpu_var(rt_cache_stat).field++)
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
- struct rtable **res);
-
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
+static unsigned int __rt_hash_code(u32 daddr, u32 saddr, unsigned int hash_mask)
{
return (jhash_2words(daddr, saddr, rt_hash_rnd)
- & rt_hash_mask);
+ & hash_mask);
+}
+
+/* XXX hash table resizing will need to work in three phases
+ * XXX first do the initial transfer to the new table
+ * XXX then instantiate the new table and synchronize_net
+ * XXX finally purge any remnants that got inserted into the old table
+ */
+static struct rt_hash_bucket *rt_get_bucket(u32 daddr, u32 saddr, unsigned int *hp)
+{
+ struct rt_hash_bucket *r;
+ unsigned long seq;
+
+ do {
+ unsigned int hash;
+
+ seq = read_seqbegin(&rt_hash_seq);
+ *hp = hash = __rt_hash_code(daddr, saddr, __rt_hash_mask);
+ r = &__rt_hash_table[hash];
+ } while (read_seqretry(&rt_hash_seq, seq));
+
+ return r;
+}
+
+static struct rt_hash_bucket *rt_get_bucket_nohash(u32 daddr, u32 saddr)
+{
+ struct rt_hash_bucket *r;
+ unsigned long seq;
+
+ do {
+ unsigned int hash;
+
+ seq = read_seqbegin(&rt_hash_seq);
+ hash = __rt_hash_code(daddr, saddr, __rt_hash_mask);
+ r = &__rt_hash_table[hash];
+ } while (read_seqretry(&rt_hash_seq, seq));
+
+ return r;
+}
+
+static void rt_hash_snapshot(struct rt_hash_bucket **table, unsigned int *hmask)
+{
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&rt_hash_seq);
+ *hmask = __rt_hash_mask;
+ *table = __rt_hash_table;
+ } while (read_seqretry(&rt_hash_seq, seq));
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
+ struct rt_hash_bucket *table;
int bucket;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
- struct rtable *r = NULL;
struct rt_cache_iter_state *st = seq->private;
+ struct rtable *r = NULL;
+ unsigned int hmask;
- for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+ rt_hash_snapshot(&st->table, &hmask);
+ for (st->bucket = hmask; st->bucket >= 0; --st->bucket) {
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = st->table[st->bucket].chain;
if (r)
break;
rcu_read_unlock_bh();
@@ -291,7 +341,7 @@ static struct rtable *rt_cache_get_next(
if (--st->bucket < 0)
break;
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = st->table[st->bucket].chain;
}
return r;
}
@@ -620,18 +670,23 @@ static void rt_check_expire(unsigned lon
unsigned int i = rover, goal;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
+ struct rt_hash_bucket *table;
+ unsigned int hmask;
u64 mult;
- mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+ rt_hash_snapshot(&table, &hmask);
+
+ mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
if (ip_rt_gc_timeout > 1)
do_div(mult, ip_rt_gc_timeout);
goal = (unsigned int)mult;
- if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+ if (goal > hmask + 1)
+ goal = hmask + 1;
for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
+ i = (i + 1) & hmask;
+ rthp = &table[i].chain;
if (*rthp == 0)
continue;
@@ -655,7 +710,7 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/* remove all related balanced entries if necessary */
if (rth->u.dst.flags & DST_BALANCED) {
rthp = rt_remove_balanced_route(
- &rt_hash_table[i].chain,
+ &table[i].chain,
rth, NULL);
if (!rthp)
break;
@@ -683,18 +738,21 @@ #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACH
*/
static void rt_run_flush(unsigned long dummy)
{
- int i;
struct rtable *rth, *next;
+ struct rt_hash_bucket *table;
+ unsigned int hmask;
+ int i;
rt_deadline = 0;
get_random_bytes(&rt_hash_rnd, 4);
- for (i = rt_hash_mask; i >= 0; i--) {
+ rt_hash_snapshot(&table, &hmask);
+ for (i = hmask; i >= 0; i--) {
spin_lock_bh(rt_hash_lock_addr(i));
- rth = rt_hash_table[i].chain;
+ rth = table[i].chain;
if (rth)
- rt_hash_table[i].chain = NULL;
+ table[i].chain = NULL;
spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth; rth = next) {
@@ -777,7 +835,9 @@ static int rt_garbage_collect(void)
static int rover;
static int equilibrium;
struct rtable *rth, **rthp;
+ struct rt_hash_bucket *table;
unsigned long now = jiffies;
+ unsigned int hmask;
int goal;
/*
@@ -793,22 +853,24 @@ static int rt_garbage_collect(void)
goto out;
}
+ rt_hash_snapshot(&table, &hmask);
+
/* Calculate number of entries, which we want to expire now. */
goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << rt_hash_log);
+ (ip_rt_gc_elasticity << long_log2(hmask + 1));
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
if (goal > 0) {
- equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ equilibrium += min_t(unsigned int, goal / 2, hmask + 1);
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
- goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ goal = max_t(unsigned int, goal / 2, hmask + 1);
equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
}
@@ -823,11 +885,11 @@ static int rt_garbage_collect(void)
do {
int i, k;
- for (i = rt_hash_mask, k = rover; i >= 0; i--) {
+ for (i = hmask, k = rover; i >= 0; i--) {
unsigned long tmo = expire;
- k = (k + 1) & rt_hash_mask;
- rthp = &rt_hash_table[k].chain;
+ k = (k + 1) & hmask;
+ rthp = &table[k].chain;
spin_lock_bh(rt_hash_lock_addr(k));
while ((rth = *rthp) != NULL) {
if (!rt_may_expire(rth, tmo, expire)) {
@@ -843,7 +905,7 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
int r;
rthp = rt_remove_balanced_route(
- &rt_hash_table[k].chain,
+ &table[k].chain,
rth,
&r);
goal -= r;
@@ -912,7 +974,7 @@ #endif
out: return 0;
}
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
{
struct rtable *rth, **rthp;
unsigned long now;
@@ -928,7 +990,7 @@ restart:
candp = NULL;
now = jiffies;
- rthp = &rt_hash_table[hash].chain;
+ rthp = &r->chain;
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
@@ -945,13 +1007,12 @@ #endif
* must be visible to another weakly ordered CPU before
* the insertion at the start of the hash chain.
*/
- rcu_assign_pointer(rth->u.rt_next,
- rt_hash_table[hash].chain);
+ rcu_assign_pointer(rth->u.rt_next, r->chain);
/*
* Since lookup is lockfree, the update writes
* must be ordered for consistency on SMP.
*/
- rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+ rcu_assign_pointer(r->chain, rth);
rth->u.dst.__use++;
dst_hold(&rth->u.dst);
@@ -1026,7 +1087,7 @@ #endif
}
}
- rt->u.rt_next = rt_hash_table[hash].chain;
+ rt->u.rt_next = r->chain;
#if RT_CACHE_DEBUG >= 2
if (rt->u.rt_next) {
struct rtable *trt;
@@ -1037,7 +1098,7 @@ #if RT_CACHE_DEBUG >= 2
printk("\n");
}
#endif
- rt_hash_table[hash].chain = rt;
+ r->chain = rt;
spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt;
return 0;
@@ -1102,19 +1163,19 @@ void __ip_select_ident(struct iphdr *iph
ip_select_fb_ident(iph);
}
-static void rt_del(unsigned hash, struct rtable *rt)
+static void rt_del(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt)
{
struct rtable **rthp;
spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
- for (rthp = &rt_hash_table[hash].chain; *rthp;
- rthp = &(*rthp)->u.rt_next)
+ for (rthp = &r->chain; *rthp; rthp = &(*rthp)->u.rt_next) {
if (*rthp == rt) {
*rthp = rt->u.rt_next;
rt_free(rt);
break;
}
+ }
spin_unlock_bh(rt_hash_lock_addr(hash));
}
@@ -1147,10 +1208,14 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
for (i = 0; i < 2; i++) {
for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash_code(daddr,
- skeys[i] ^ (ikeys[k] << 5));
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+
+ r = rt_get_bucket(daddr,
+ skeys[i] ^ (ikeys[k] << 5),
+ &hash);
- rthp=&rt_hash_table[hash].chain;
+ rthp=&r->chain;
rcu_read_lock();
while ((rth = rcu_dereference(*rthp)) != NULL) {
@@ -1224,8 +1289,8 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
call_netevent_notifiers(NETEVENT_REDIRECT,
&netevent);
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt))
+ rt_del(r, hash, rth);
+ if (!rt_intern_hash(r, hash, rt, &rt))
ip_rt_put(rt);
goto do_next;
}
@@ -1260,15 +1325,18 @@ static struct dst_entry *ipv4_negative_a
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
rt->u.dst.expires) {
- unsigned hash = rt_hash_code(rt->fl.fl4_dst,
- rt->fl.fl4_src ^
- (rt->fl.oif << 5));
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+ r = rt_get_bucket(rt->fl.fl4_dst,
+ rt->fl.fl4_src ^
+ (rt->fl.oif << 5),
+ &hash);
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ip_rt_advice: redirect to "
"%u.%u.%u.%u/%02x dropped\n",
NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
- rt_del(hash, rt);
+ rt_del(r, hash, rt);
ret = NULL;
}
}
@@ -1405,10 +1473,12 @@ unsigned short ip_rt_frag_needed(struct
return 0;
for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash_code(daddr, skeys[i]);
+ struct rt_hash_bucket *r;
+
+ r = rt_get_bucket_nohash(daddr, skeys[i]);
rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ for (rth = rcu_dereference(r->chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == skeys[i] &&
@@ -1599,8 +1669,9 @@ #endif
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
u8 tos, struct net_device *dev, int our)
{
- unsigned hash;
struct rtable *rth;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
u32 spec_dst;
struct in_device *in_dev = in_dev_get(dev);
u32 itag = 0;
@@ -1665,8 +1736,8 @@ #endif
RT_CACHE_STAT_INC(in_slow_mc);
in_dev_put(in_dev);
- hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
- return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+ r = rt_get_bucket(daddr, saddr ^ (dev->ifindex << 5), &hash);
+ return rt_intern_hash(r, hash, rth, (struct rtable**) &skb->dst);
e_nobufs:
in_dev_put(in_dev);
@@ -1816,8 +1887,9 @@ static inline int ip_mkroute_input_def(s
u32 daddr, u32 saddr, u32 tos)
{
struct rtable* rth = NULL;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
int err;
- unsigned hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
@@ -1830,8 +1902,8 @@ #endif
return err;
/* put it into the cache */
- hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
- return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ r = rt_get_bucket(daddr, saddr ^ (fl->iif << 5), &hash);
+ return rt_intern_hash(r, hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
@@ -1844,7 +1916,6 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
struct rtable* rth = NULL, *rtres;
unsigned char hop, hopcount;
int err = -EINVAL;
- unsigned int hash;
if (res->fi)
hopcount = res->fi->fib_nhs;
@@ -1858,6 +1929,9 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/* add all alternatives to the routing cache */
for (hop = 0; hop < hopcount; hop++) {
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+
res->nh_sel = hop;
/* put reference to previous result */
@@ -1871,8 +1945,8 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
return err;
/* put it into the cache */
- hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
- err = rt_intern_hash(hash, rth, &rtres);
+ r = rt_get_bucket(daddr, saddr ^ (fl->iif << 5), &hash);
+ err = rt_intern_hash(r, hash, rth, &rtres);
if (err)
return err;
@@ -1919,7 +1993,8 @@ #endif
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
- unsigned hash;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
u32 spec_dst;
int err = -EINVAL;
int free_res = 0;
@@ -2048,8 +2123,8 @@ #endif
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
- err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ r = rt_get_bucket(daddr, saddr ^ (fl.iif << 5), &hash);
+ err = rt_intern_hash(r, hash, rth, (struct rtable**)&skb->dst);
goto done;
no_route:
@@ -2090,15 +2165,15 @@ martian_source:
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
u8 tos, struct net_device *dev)
{
- struct rtable * rth;
- unsigned hash;
+ struct rt_hash_bucket *r;
+ struct rtable *rth;
int iif = dev->ifindex;
tos &= IPTOS_RT_MASK;
- hash = rt_hash_code(daddr, saddr ^ (iif << 5));
+ r = rt_get_bucket_nohash(daddr, saddr ^ (iif << 5));
rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ for (rth = rcu_dereference(r->chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == saddr &&
@@ -2291,11 +2366,15 @@ static inline int ip_mkroute_output_def(
{
struct rtable *rth = NULL;
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
+
if (err == 0) {
- hash = rt_hash_code(oldflp->fl4_dst,
- oldflp->fl4_src ^ (oldflp->oif << 5));
- err = rt_intern_hash(hash, rth, rp);
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+
+ r = rt_get_bucket(oldflp->fl4_dst,
+ oldflp->fl4_src ^ (oldflp->oif << 5),
+ &hash);
+ err = rt_intern_hash(r, hash, rth, rp);
}
return err;
@@ -2310,7 +2389,6 @@ static inline int ip_mkroute_output(stru
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
unsigned char hop;
- unsigned hash;
int err = -EINVAL;
struct rtable *rth = NULL;
@@ -2319,6 +2397,8 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
for (hop = 0; hop < hopcount; hop++) {
struct net_device *dev2nexthop;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
res->nh_sel = hop;
@@ -2336,10 +2416,10 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (err != 0)
goto cleanup;
- hash = rt_hash_code(oldflp->fl4_dst,
- oldflp->fl4_src ^
- (oldflp->oif << 5));
- err = rt_intern_hash(hash, rth, rp);
+ r = rt_get_bucket(oldflp->fl4_dst,
+ oldflp->fl4_src ^
+ (oldflp->oif << 5), &hash);
+ err = rt_intern_hash(r, hash, rth, rp);
/* forward hop information to multipath impl. */
multipath_set_nhinfo(rth,
@@ -2564,13 +2644,13 @@ out: return err;
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
- unsigned hash;
+ struct rt_hash_bucket *r;
struct rtable *rth;
- hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
+ r = rt_get_bucket_nohash(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
rcu_read_lock_bh();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ for (rth = rcu_dereference(r->chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
@@ -2820,18 +2900,23 @@ out_free:
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct rt_hash_bucket *table;
struct rtable *rt;
+ unsigned int hmask;
int h, s_h;
int idx, s_idx;
+ rt_hash_snapshot(&table, &hmask);
+
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- for (h = 0; h <= rt_hash_mask; h++) {
- if (h < s_h) continue;
+ for (h = 0; h <= hmask; h++) {
+ if (h < s_h)
+ continue;
if (h > s_h)
s_idx = 0;
rcu_read_lock_bh();
- for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
+ for (rt = rcu_dereference(table[h].chain), idx = 0; rt;
rt = rcu_dereference(rt->u.rt_next), idx++) {
if (idx < s_idx)
continue;
@@ -3151,21 +3236,21 @@ #endif
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- rt_hash_table = (struct rt_hash_bucket *)
+ __rt_hash_table = (struct rt_hash_bucket *)
alloc_large_system_hash("IP route cache",
sizeof(struct rt_hash_bucket),
rhash_entries,
(num_physpages >= 128 * 1024) ?
15 : 17,
HASH_HIGHMEM,
- &rt_hash_log,
- &rt_hash_mask,
+ NULL,
+ &__rt_hash_mask,
0);
- memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+ memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
rt_hash_lock_init();
- ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
- ip_rt_max_size = (rt_hash_mask + 1) * 16;
+ ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+ ip_rt_max_size = (__rt_hash_mask + 1) * 16;
devinet_init();
ip_fib_init();
--
1.4.2.rc2.g3e042
* [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 7:49 [PATCHSET]: Dynamic route cache hash sizing David Miller
2006-08-09 7:53 ` [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing David Miller
@ 2006-08-09 7:53 ` David Miller
2006-08-09 8:32 ` Eric Dumazet
2006-08-09 10:09 ` Eric Dumazet
1 sibling, 2 replies; 12+ messages in thread
From: David Miller @ 2006-08-09 7:53 UTC
To: netdev
[IPV4] route: Dynamic hash table sizing.
The sizing algorithm is deliberately stupid; this changeset is about the
infrastructure.  Currently the table starts at 16 entries (or whatever
rhash_entries= was specified as) and is allowed to grow until the bucket
array reaches 8MB.
The code can handle both growing and shrinking just fine; the only
tweaks necessary are to the rthash_new_size() function and to the places
where rtcache_work is scheduled.
hashdist is now consulted at run time, so its __initdata tag has to be
dropped.
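As a concrete illustration of the kind of tweak meant above (a sketch
only, not part of this patch; the thresholds are arbitrary),
rthash_new_size() could be made occupancy-aware so that the same work
function shrinks the table as well:

/* Sketch: pick the next table size (in bytes) from current occupancy.
 * Double when there are more cached routes than buckets, halve when
 * less than a quarter full, never go below 16 buckets.
 */
static unsigned long rthash_new_size(void)
{
	unsigned int buckets = __rt_hash_mask + 1;
	unsigned int entries = atomic_read(&ipv4_dst_ops.entries);

	if (entries > buckets)
		buckets <<= 1;
	else if (entries < buckets / 4 && buckets > 16)
		buckets >>= 1;

	return (unsigned long)buckets * sizeof(struct rt_hash_bucket);
}

The sites that schedule rtcache_work would then also have to fire when
the table is badly underfull, not only when entries >= buckets as in the
rt_garbage_collect() hunk below.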
Signed-off-by: David S. Miller <davem@davemloft.net>
---
mm/page_alloc.c | 2 -
net/ipv4/route.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 158 insertions(+), 23 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f53..3b5358a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_hand
return 0;
}
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a7b4ca2..897e67c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -94,6 +94,9 @@ #include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/seqlock.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -120,6 +123,7 @@ #define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_min_delay = 2 * HZ;
static int ip_rt_max_delay = 10 * HZ;
static int ip_rt_max_size;
+static int ip_rt_hashsz_limit = (8 * 1024 * 1024) / sizeof(void *);
static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
static int ip_rt_gc_interval = 60 * HZ;
static int ip_rt_gc_min_interval = HZ / 2;
@@ -308,6 +312,135 @@ static void rt_hash_snapshot(struct rt_h
} while (read_seqretry(&rt_hash_seq, seq));
}
+static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
+{
+ struct rt_hash_bucket *n;
+
+ if (sz <= PAGE_SIZE)
+ n = kmalloc(sz, GFP_KERNEL);
+ else if (hashdist)
+ n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+ else
+ n = (struct rt_hash_bucket *)
+ __get_free_pages(GFP_KERNEL, get_order(sz));
+
+ if (n)
+ memset(n, 0, sz);
+
+ return n;
+}
+
+static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
+{
+ if (sz <= PAGE_SIZE)
+ kfree(r);
+ else if (hashdist)
+ vfree(r);
+ else
+ free_pages((unsigned long)r, get_order(sz));
+}
+
+static void rtcache_transfer(struct rtable *list, struct rt_hash_bucket *new_table, unsigned int nhashmask)
+{
+ while (list) {
+ struct rtable *next = list->u.rt_next;
+ struct rt_hash_bucket *ent;
+ int iface = list->fl.iif;
+ unsigned int hash;
+
+ if (!iface)
+ iface = list->fl.oif;
+ hash = __rt_hash_code(list->fl.fl4_dst,
+ list->fl.fl4_src ^
+ (iface << 5),
+ nhashmask);
+ ent = &new_table[hash];
+ list->u.rt_next = ent->chain;
+ ent->chain = list;
+
+ list = next;
+ }
+}
+
+static unsigned long rthash_new_size(void)
+{
+ return ((__rt_hash_mask + 1) << 1) *
+ sizeof(struct rt_hash_bucket);
+}
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+ multipath_remove(rt);
+ call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void rtcache_resize(void *__unused)
+{
+ struct rt_hash_bucket *new, *old;
+ unsigned long nsize;
+ unsigned int nhashmask, ohashmask;
+ int i;
+
+ mutex_lock(&hash_resize_mutex);
+
+ nsize = rthash_new_size();
+ new = rthash_alloc(nsize);
+ if (!new)
+ goto out_unlock;
+
+ write_seqlock_bh(&rt_hash_seq);
+
+ nhashmask = (nsize / sizeof(struct rt_hash_bucket)) - 1U;
+ for (i = __rt_hash_mask; i >= 0; i--) {
+ struct rtable *rth;
+
+ spin_lock_bh(rt_hash_lock_addr(i));
+ rth = __rt_hash_table[i].chain;
+ if (rth)
+ __rt_hash_table[i].chain = NULL;
+ spin_unlock_bh(rt_hash_lock_addr(i));
+
+ rtcache_transfer(rth, new, nhashmask);
+ }
+
+ old = __rt_hash_table;
+ ohashmask = __rt_hash_mask;
+
+ __rt_hash_table = new;
+ __rt_hash_mask = nhashmask;
+
+ /* XXX Do something more intelligent with these things. */
+ ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+ ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+
+ write_sequnlock_bh(&rt_hash_seq);
+
+ synchronize_net();
+
+ /* It is possible that some entries got hashed into the old
+ * table, free any such remnants. No locking is necessary on
+ * the chains as this table is no longer viewable by other
+ * processors.
+ */
+ for (i = ohashmask; i >= 0; i--) {
+ struct rtable *rth, *next;
+
+ for (rth = old[i].chain; rth; rth = next) {
+ next = rth->u.rt_next;
+ rt_free(rth);
+ }
+ }
+
+ rthash_free(old, (ohashmask + 1) * sizeof(struct rt_hash_bucket));
+
+out_unlock:
+ mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(rtcache_work, rtcache_resize, NULL);
+
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
struct rt_hash_bucket *table;
@@ -540,12 +673,6 @@ static struct file_operations rt_cpu_seq
#endif /* CONFIG_PROC_FS */
-static __inline__ void rt_free(struct rtable *rt)
-{
- multipath_remove(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
static __inline__ void rt_drop(struct rtable *rt)
{
multipath_remove(rt);
@@ -676,7 +803,7 @@ static void rt_check_expire(unsigned lon
rt_hash_snapshot(&table, &hmask);
- mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
+ mult = ((u64)(hmask + 1)) * ip_rt_gc_interval;
if (ip_rt_gc_timeout > 1)
do_div(mult, ip_rt_gc_timeout);
goal = (unsigned int)mult;
@@ -857,7 +984,7 @@ static int rt_garbage_collect(void)
/* Calculate number of entries, which we want to expire now. */
goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << long_log2(hmask + 1));
+ ((hmask + 1) * ip_rt_gc_elasticity);
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
@@ -971,7 +1098,11 @@ #if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
-out: return 0;
+out:
+ /* The early-exit path jumps straight to this label before the
+ * rt_hash_snapshot() above has run, so take a snapshot here before
+ * looking at hmask.
+ */
+ rt_hash_snapshot(&table, &hmask);
+ if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
+ (hmask + 1) < ip_rt_hashsz_limit)
+ schedule_work(&rtcache_work);
+ return 0;
}
static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
@@ -3201,15 +3332,23 @@ #endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
+ unsigned long val;
+
if (!str)
return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
+ val = simple_strtoul(str, &str, 0);
+
+ /* Only use it if it's a power-of-2. */
+ if (!(val & (val - 1)))
+ rhash_entries = val;
+
return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
+ unsigned long sz;
int rc = 0;
rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
@@ -3236,22 +3375,18 @@ #endif
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- __rt_hash_table = (struct rt_hash_bucket *)
- alloc_large_system_hash("IP route cache",
- sizeof(struct rt_hash_bucket),
- rhash_entries,
- (num_physpages >= 128 * 1024) ?
- 15 : 17,
- HASH_HIGHMEM,
- NULL,
- &__rt_hash_mask,
- 0);
- memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
- rt_hash_lock_init();
+ sz = (rhash_entries ? rhash_entries : 16);
+ sz *= sizeof(struct rt_hash_bucket);
+ __rt_hash_table = rthash_alloc(sz);
+ if (!__rt_hash_table)
+ panic("IP: failed to allocate routing cache hash table");
+ __rt_hash_mask = (sz / sizeof(struct rt_hash_bucket)) - 1;
ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+ rt_hash_lock_init();
+
devinet_init();
ip_fib_init();
--
1.4.2.rc2.g3e042