netdev.vger.kernel.org archive mirror
* [RFC PATCH]: Dynamically sized routing cache hash table.
@ 2007-03-06  4:26 David Miller
  2007-03-06  7:14 ` Eric Dumazet
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: David Miller @ 2007-03-06  4:26 UTC (permalink / raw)
  To: netdev; +Cc: dada1, robert.olsson, npiggin


This is essentially a "port" of Nick Piggin's dcache hash table
patches to the routing cache.  It solves the locking issues
during table grow/shrink that I couldn't handle properly last
time I tried to code up a patch like this.

But one of the core issues of this kind of change still remains.
There is a conflict between the desire of routing cache garbage
collection to reach a state of equilibrium and the hash table
grow code's desire to match the table size to the current state
of affairs.

Actually, more accurately, the conflict exists in how this GC
logic is implemented.  The core issue is that hash table size
guides the GC processing, and hash table growth therefore
modifies those GC goals.  So with the patch below we'll just
keep growing the hash table instead of giving GC some time to
try to keep the working set in equilibrium before doing the
hash grow.
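
(Concretely, in the patch below rt_garbage_collect() computes its goal as

	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash->log);

so every grow of the table raises the number of entries GC will tolerate
before it starts working, which in turn lets the table grow again.)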

One idea is to put the hash grow check in the garbage collector,
and put the hash shrink check in rt_del().
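
Something like this (untested sketch, reusing the thresholds from
check_nr_rthash() in the patch below):

	/* at the end of rt_garbage_collect(), once the goal is met: */
	if (atomic_read(&ipv4_dst_ops.entries) >
	    (rt_hash->mask + 1) + ((rt_hash->mask + 1) >> 1))
		rt_hash_resize(rt_hash->log + 1);

	/* and at the end of rt_del(): */
	if (atomic_read(&ipv4_dst_ops.entries) < ((rt_hash->mask + 1) >> 1))
		rt_hash_resize(rt_hash->log - 1);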

In fact, it would be a good time to perhaps hack up some entirely
new passive GC logic for the routing cache.

BTW, another thing that plays into this is that Robert's TRASH work
could make this patch not necessary :-)

Finally, I know (due to some of Nick's helpful comments the other
day) that I'm missing some rcu_assign_pointer()'s in here.
Fixes in this area are most welcome.

This patch passes basic testing on UP sparc64, but please handle
with care :)

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0b3d7bf..57e004a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -92,6 +92,9 @@
 #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -242,28 +245,195 @@ static spinlock_t	*rt_hash_locks;
 # define rt_hash_lock_init()
 #endif
 
-static struct rt_hash_bucket 	*rt_hash_table;
-static unsigned			rt_hash_mask;
-static int			rt_hash_log;
-static unsigned int		rt_hash_rnd;
+#define MIN_RTHASH_SHIFT 4
+#if BITS_PER_LONG == 32
+#define MAX_RTHASH_SHIFT 24
+#else
+#define MAX_RTHASH_SHIFT 30
+#endif
+
+struct rt_hash {
+	struct rt_hash_bucket	*table;
+	unsigned int		mask;
+	unsigned int		log;
+};
+
+struct rt_hash *rt_hash __read_mostly;
+struct rt_hash *old_rt_hash __read_mostly;
+static unsigned int rt_hash_rnd __read_mostly;
+static DEFINE_SEQLOCK(resize_transfer_lock);
+static DEFINE_MUTEX(resize_mutex);
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) \
 	(__raw_get_cpu_var(rt_cache_stat).field++)
 
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
-				struct rtable **res);
+static void rt_hash_resize(unsigned int new_shift);
+static void check_nr_rthash(void)
+{
+	unsigned int sz = rt_hash->mask + 1;
+	unsigned int nr = atomic_read(&ipv4_dst_ops.entries);
+
+	if (unlikely(nr > (sz + (sz >> 1))))
+		rt_hash_resize(rt_hash->log + 1);
+	else if (unlikely(nr < (sz >> 1)))
+		rt_hash_resize(rt_hash->log - 1);
+}
 
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
+static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
+{
+	struct rt_hash_bucket *n;
+
+	if (sz <= PAGE_SIZE)
+		n = kmalloc(sz, GFP_KERNEL);
+	else if (hashdist)
+		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+	else
+		n = (struct rt_hash_bucket *)
+			__get_free_pages(GFP_KERNEL, get_order(sz));
+
+	if (n)
+		memset(n, 0, sz);
+
+	return n;
+}
+
+static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
+{
+	if (sz <= PAGE_SIZE)
+		kfree(r);
+	else if (hashdist)
+		vfree(r);
+	else
+		free_pages((unsigned long)r, get_order(sz));
+}
+
+static unsigned int rt_hash_code(struct rt_hash *hashtable,
+				 u32 daddr, u32 saddr)
 {
 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
-		& rt_hash_mask);
+		& hashtable->mask);
 }
 
-#define rt_hash(daddr, saddr, idx) \
-	rt_hash_code((__force u32)(__be32)(daddr),\
+#define rt_hashfn(htab, daddr, saddr, idx) \
+	rt_hash_code(htab, (__force u32)(__be32)(daddr),\
 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 
+static unsigned int resize_new_shift;
+
+static void rt_hash_resize_work(struct work_struct *work)
+{
+	struct rt_hash *new_hash, *old_hash;
+	unsigned int new_size, old_size, transferred;
+	int i;
+
+	if (!mutex_trylock(&resize_mutex))
+		goto out;
+
+	new_hash = kmalloc(sizeof(struct rt_hash), GFP_KERNEL);
+	if (!new_hash)
+		goto out_unlock;
+
+	new_hash->log = resize_new_shift;
+	new_size = 1 << new_hash->log;
+	new_hash->mask = new_size - 1;
+	new_hash->table = rthash_alloc(new_size*sizeof(struct hlist_head));
+	if (!new_hash->table)
+		goto out_kfree;
+
+	old_rt_hash = rt_hash;
+	/*
+	 * ensure that if the reader sees the new dentry_hash,
+	 * then they will also see the old_dentry_hash assignment,
+	 * above.
+	 */
+	smp_wmb();
+	rt_hash = new_hash;
+	synchronize_rcu();
+
+	old_size = 1 << old_rt_hash->log;
+	transferred = 0;
+	for (i = 0; i < old_size; i++) {
+		struct rtable **head = &old_rt_hash->table[i].chain;
+
+		if (!*head)
+			continue;
+
+		spin_lock_bh(rt_hash_lock_addr(i));
+		write_seqlock(&resize_transfer_lock);
+		while (*head) {
+			struct rtable *rth = *head;
+			int iface = rth->fl.iif;
+			unsigned int hash;
+
+			if (!iface)
+				iface = rth->fl.oif;
+
+			*head = rth->u.dst.rt_next;
+
+			hash = rt_hashfn(rt_hash,
+					 rth->fl.fl4_dst,
+					 rth->fl.fl4_src,
+					 iface);
+			rth->u.dst.rt_next = rt_hash->table[hash].chain;
+			rt_hash->table[hash].chain = rth;
+
+			transferred++;
+		}
+		write_sequnlock(&resize_transfer_lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
+	}
+
+	printk("resize route hash from %u to %u, moved %u entries\n",
+	       old_size, new_size, transferred);
+
+	old_hash = old_rt_hash;
+	old_rt_hash = NULL;
+	mutex_unlock(&resize_mutex);
+	synchronize_rcu();
+	rthash_free(old_hash->table, old_size * sizeof(struct rt_hash_bucket));
+	kfree(old_hash);
+
+	resize_new_shift = 0;
+	return;
+
+out_kfree:
+	kfree(new_hash);
+out_unlock:
+	mutex_unlock(&resize_mutex);
+out:
+	resize_new_shift = 0;
+	return;
+}
+
+static DEFINE_SPINLOCK(resize_lock);
+
+static void rt_hash_resize(unsigned int new_shift)
+{
+	static DECLARE_WORK(resize_work, rt_hash_resize_work);
+
+	if (new_shift < MIN_RTHASH_SHIFT ||
+	    new_shift > MAX_RTHASH_SHIFT)
+		return;
+
+	if (resize_new_shift)
+		return;
+	spin_lock(&resize_lock);
+	if (resize_new_shift) {
+		spin_unlock(&resize_lock);
+		return;
+	}
+	resize_new_shift = new_shift;
+	spin_unlock(&resize_lock);
+
+	printk("rt_hash_resize: new_shift=%u\n", new_shift);
+
+	schedule_work(&resize_work);
+}
+
+static int rt_intern_hash(struct rt_hash *h, unsigned int hash,
+			  struct rtable *rth, struct rtable **res);
+
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
 	int bucket;
@@ -274,9 +444,9 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
 	struct rtable *r = NULL;
 	struct rt_cache_iter_state *st = seq->private;
 
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+	for (st->bucket = rt_hash->mask; st->bucket >= 0; --st->bucket) {
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
+		r = rt_hash->table[st->bucket].chain;
 		if (r)
 			break;
 		rcu_read_unlock_bh();
@@ -294,7 +464,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 		if (--st->bucket < 0)
 			break;
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
+		r = rt_hash->table[st->bucket].chain;
 	}
 	return r;
 }
@@ -629,16 +799,16 @@ static void rt_check_expire(unsigned long dummy)
 	unsigned long now = jiffies;
 	u64 mult;
 
-	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+	mult = ((u64)ip_rt_gc_interval) << rt_hash->log;
 	if (ip_rt_gc_timeout > 1)
 		do_div(mult, ip_rt_gc_timeout);
 	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	if (goal > rt_hash->mask) goal = rt_hash->mask + 1;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
+		i = (i + 1) & rt_hash->mask;
+		rthp = &rt_hash->table[i].chain;
 
 		if (*rthp == 0)
 			continue;
@@ -662,7 +832,7 @@ static void rt_check_expire(unsigned long dummy)
 			/* remove all related balanced entries if necessary */
 			if (rth->u.dst.flags & DST_BALANCED) {
 				rthp = rt_remove_balanced_route(
-					&rt_hash_table[i].chain,
+					&rt_hash->table[i].chain,
 					rth, NULL);
 				if (!rthp)
 					break;
@@ -697,11 +867,11 @@ static void rt_run_flush(unsigned long dummy)
 
 	get_random_bytes(&rt_hash_rnd, 4);
 
-	for (i = rt_hash_mask; i >= 0; i--) {
+	for (i = rt_hash->mask; i >= 0; i--) {
 		spin_lock_bh(rt_hash_lock_addr(i));
-		rth = rt_hash_table[i].chain;
+		rth = rt_hash->table[i].chain;
 		if (rth)
-			rt_hash_table[i].chain = NULL;
+			rt_hash->table[i].chain = NULL;
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
@@ -709,6 +879,7 @@ static void rt_run_flush(unsigned long dummy)
 			rt_free(rth);
 		}
 	}
+	check_nr_rthash();
 }
 
 static DEFINE_SPINLOCK(rt_flush_lock);
@@ -802,20 +973,20 @@ static int rt_garbage_collect(void)
 
 	/* Calculate number of entries, which we want to expire now. */
 	goal = atomic_read(&ipv4_dst_ops.entries) -
-		(ip_rt_gc_elasticity << rt_hash_log);
+		(ip_rt_gc_elasticity << rt_hash->log);
 	if (goal <= 0) {
 		if (equilibrium < ipv4_dst_ops.gc_thresh)
 			equilibrium = ipv4_dst_ops.gc_thresh;
 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+			equilibrium += min_t(unsigned int, goal / 2, rt_hash->mask + 1);
 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 		}
 	} else {
 		/* We are in dangerous area. Try to reduce cache really
 		 * aggressively.
 		 */
-		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+		goal = max_t(unsigned int, goal / 2, rt_hash->mask + 1);
 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 	}
 
@@ -830,11 +1001,11 @@ static int rt_garbage_collect(void)
 	do {
 		int i, k;
 
-		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
+		for (i = rt_hash->mask, k = rover; i >= 0; i--) {
 			unsigned long tmo = expire;
 
-			k = (k + 1) & rt_hash_mask;
-			rthp = &rt_hash_table[k].chain;
+			k = (k + 1) & rt_hash->mask;
+			rthp = &rt_hash->table[k].chain;
 			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
@@ -850,7 +1021,7 @@ static int rt_garbage_collect(void)
 					int r;
 
 					rthp = rt_remove_balanced_route(
-						&rt_hash_table[k].chain,
+						&rt_hash->table[k].chain,
 						rth,
 						&r);
 					goal -= r;
@@ -919,7 +1090,8 @@ work_done:
 out:	return 0;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static int rt_intern_hash(struct rt_hash *h, unsigned hash,
+			  struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
 	unsigned long	now;
@@ -935,7 +1107,7 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
-	rthp = &rt_hash_table[hash].chain;
+	rthp = &h->table[hash].chain;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -953,12 +1125,12 @@ restart:
 			 * the insertion at the start of the hash chain.
 			 */
 			rcu_assign_pointer(rth->u.dst.rt_next,
-					   rt_hash_table[hash].chain);
+					   h->table[hash].chain);
 			/*
 			 * Since lookup is lockfree, the update writes
 			 * must be ordered for consistency on SMP.
 			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+			rcu_assign_pointer(h->table[hash].chain, rth);
 
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
@@ -1033,7 +1205,7 @@ restart:
 		}
 	}
 
-	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	rt->u.dst.rt_next = h->table[hash].chain;
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
 		struct rtable *trt;
@@ -1044,9 +1216,10 @@ restart:
 		printk("\n");
 	}
 #endif
-	rt_hash_table[hash].chain = rt;
+	h->table[hash].chain = rt;
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
+	check_nr_rthash();
 	return 0;
 }
 
@@ -1109,13 +1282,13 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 	ip_select_fb_ident(iph);
 }
 
-static void rt_del(unsigned hash, struct rtable *rt)
+static void rt_del(struct rt_hash *h, unsigned hash, struct rtable *rt)
 {
 	struct rtable **rthp;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
-	for (rthp = &rt_hash_table[hash].chain; *rthp;
+	for (rthp = &h->table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.dst.rt_next)
 		if (*rthp == rt) {
 			*rthp = rt->u.dst.rt_next;
@@ -1123,6 +1296,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
 			break;
 		}
 	spin_unlock_bh(rt_hash_lock_addr(hash));
+	check_nr_rthash();
 }
 
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
@@ -1154,9 +1328,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
 	for (i = 0; i < 2; i++) {
 		for (k = 0; k < 2; k++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
+			struct rt_hash *h = rt_hash;
+			unsigned hash = rt_hashfn(h, daddr, skeys[i], ikeys[k]);
 
-			rthp=&rt_hash_table[hash].chain;
+			rthp=&h->table[hash].chain;
 
 			rcu_read_lock();
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
@@ -1230,8 +1405,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				call_netevent_notifiers(NETEVENT_REDIRECT,
 							&netevent);
 
-				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt))
+				rt_del(h, hash, rth);
+				if (!rt_intern_hash(h, hash, rt, &rt))
 					ip_rt_put(rt);
 				goto do_next;
 			}
@@ -1266,14 +1441,15 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   rt->u.dst.expires) {
-			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
-						rt->fl.oif);
+			struct rt_hash *h = rt_hash;
+			unsigned hash = rt_hashfn(h, rt->fl.fl4_dst,
+						  rt->fl.fl4_src, rt->fl.oif);
 #if RT_CACHE_DEBUG >= 1
 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
 					  "%u.%u.%u.%u/%02x dropped\n",
 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
 #endif
-			rt_del(hash, rt);
+			rt_del(h, hash, rt);
 			ret = NULL;
 		}
 	}
@@ -1411,10 +1587,11 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 		return 0;
 
 	for (i = 0; i < 2; i++) {
-		unsigned hash = rt_hash(daddr, skeys[i], 0);
+		struct rt_hash *h = rt_hash;
+		unsigned hash = rt_hashfn(h, daddr, skeys[i], 0);
 
 		rcu_read_lock();
-		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+		for (rth = rcu_dereference(h->table[hash].chain); rth;
 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
 			if (rth->fl.fl4_dst == daddr &&
 			    rth->fl.fl4_src == skeys[i] &&
@@ -1669,8 +1846,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	RT_CACHE_STAT_INC(in_slow_mc);
 
 	in_dev_put(in_dev);
-	hash = rt_hash(daddr, saddr, dev->ifindex);
-	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+	hash = rt_hashfn(rt_hash, daddr, saddr, dev->ifindex);
+	return rt_intern_hash(rt_hash, hash, rth, (struct rtable**) &skb->dst);
 
 e_nobufs:
 	in_dev_put(in_dev);
@@ -1833,8 +2010,8 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
 		return err;
 
 	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl->iif);
-	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+	hash = rt_hashfn(rt_hash, daddr, saddr, fl->iif);
+	return rt_intern_hash(rt_hash, hash, rth, (struct rtable**)&skb->dst);
 }
 
 static inline int ip_mkroute_input(struct sk_buff *skb,
@@ -1874,8 +2051,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
 			return err;
 
 		/* put it into the cache */
-		hash = rt_hash(daddr, saddr, fl->iif);
-		err = rt_intern_hash(hash, rth, &rtres);
+		hash = rt_hashfn(rt_hash, daddr, saddr, fl->iif);
+		err = rt_intern_hash(rt_hash, hash, rth, &rtres);
 		if (err)
 			return err;
 
@@ -2047,8 +2224,8 @@ local_input:
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
-	hash = rt_hash(daddr, saddr, fl.iif);
-	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+	hash = rt_hashfn(rt_hash, daddr, saddr, fl.iif);
+	err = rt_intern_hash(rt_hash, hash, rth, (struct rtable**)&skb->dst);
 	goto done;
 
 no_route:
@@ -2086,18 +2263,13 @@ martian_source:
 	goto e_inval;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+static int __input_find(struct rt_hash *h, struct sk_buff *skb,
+			__be32 daddr, __be32 saddr, u8 tos, int iif)
 {
-	struct rtable * rth;
-	unsigned	hash;
-	int iif = dev->ifindex;
-
-	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif);
+	unsigned int hash = rt_hashfn(h, daddr, saddr, iif);
+	struct rtable *rth;
 
-	rcu_read_lock();
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+	for (rth = rcu_dereference(h->table[hash].chain); rth;
 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 		if (rth->fl.fl4_dst == daddr &&
 		    rth->fl.fl4_src == saddr &&
@@ -2109,14 +2281,50 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
 			skb->dst = (struct dst_entry*)rth;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
 	}
+
+	return 1;
+}
+
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
+{
+	struct rt_hash *htab, *old_htab;
+	int iif = dev->ifindex;
+	int ret;
+
+	tos &= IPTOS_RT_MASK;
+
+	rcu_read_lock();
+	htab = rt_hash;
+	smp_rmb();
+	old_htab = old_rt_hash;
+	if (unlikely(old_htab)) {
+		unsigned long seq;
+		do {
+			seq = read_seqbegin(&resize_transfer_lock);
+			ret = __input_find(old_htab, skb, daddr,
+					   saddr, tos, iif);
+			if (!ret)
+				goto out_rcu;
+			ret = __input_find(htab, skb, daddr,
+					   saddr, tos, iif);
+			if (!ret)
+				goto out_rcu;
+		} while (read_seqretry(&resize_transfer_lock, seq));
+	} else {
+		ret = __input_find(htab, skb, daddr, saddr, tos, iif);
+	}
+out_rcu:
 	rcu_read_unlock();
 
+	if (!ret)
+		return ret;
+
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2288,8 +2496,9 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
 	unsigned hash;
 	if (err == 0) {
-		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
-		err = rt_intern_hash(hash, rth, rp);
+		hash = rt_hashfn(rt_hash,
+				 oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
+		err = rt_intern_hash(rt_hash, hash, rth, rp);
 	}
 
 	return err;
@@ -2330,9 +2539,9 @@ static inline int ip_mkroute_output(struct rtable** rp,
 			if (err != 0)
 				goto cleanup;
 
-			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
-					oldflp->oif);
-			err = rt_intern_hash(hash, rth, rp);
+			hash = rt_hashfn(rt_hash, oldflp->fl4_dst,
+					 oldflp->fl4_src,	oldflp->oif);
+			err = rt_intern_hash(rt_hash, hash, rth, rp);
 
 			/* forward hop information to multipath impl. */
 			multipath_set_nhinfo(rth,
@@ -2553,15 +2762,13 @@ make_route:
 out:	return err;
 }
 
-int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+static int __output_find(struct rt_hash *h, struct rtable **rp,
+			 const struct flowi *flp)
 {
-	unsigned hash;
+	unsigned int hash = rt_hashfn(h, flp->fl4_dst, flp->fl4_src, flp->oif);
 	struct rtable *rth;
 
-	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
-
-	rcu_read_lock_bh();
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+	for (rth = rcu_dereference(h->table[hash].chain); rth;
 		rth = rcu_dereference(rth->u.dst.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
 		    rth->fl.fl4_src == flp->fl4_src &&
@@ -2577,7 +2784,6 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 			if (multipath_select_route(flp, rth, rp)) {
 				dst_hold(&(*rp)->u.dst);
 				RT_CACHE_STAT_INC(out_hit);
-				rcu_read_unlock_bh();
 				return 0;
 			}
 
@@ -2585,14 +2791,44 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
 			*rp = rth;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(out_hlist_search);
 	}
+
+	return 1;
+}
+
+int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+{
+	struct rt_hash *htab, *old_htab;
+	int ret;
+
+	rcu_read_lock_bh();
+	htab = rt_hash;
+	smp_rmb();
+	old_htab = old_rt_hash;
+	if (unlikely(old_htab)) {
+		unsigned long seq;
+		do {
+			seq = read_seqbegin(&resize_transfer_lock);
+			ret = __output_find(old_htab, rp, flp);
+			if (!ret)
+				goto out_rcu;
+			ret = __output_find(htab, rp, flp);
+			if (!ret)
+				goto out_rcu;
+		} while (read_seqretry(&resize_transfer_lock, seq));
+	} else {
+		ret = __output_find(htab, rp, flp);
+	}
+out_rcu:
 	rcu_read_unlock_bh();
 
+	if (!ret)
+		return 0;
+
 	return ip_route_output_slow(rp, flp);
 }
 
@@ -2810,20 +3046,21 @@ errout_free:
 	goto errout;
 }
 
-int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
+int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct rt_hash *htab = rt_hash;
 	struct rtable *rt;
 	int h, s_h;
 	int idx, s_idx;
 
 	s_h = cb->args[0];
 	s_idx = idx = cb->args[1];
-	for (h = 0; h <= rt_hash_mask; h++) {
+	for (h = 0; h <= htab->mask; h++) {
 		if (h < s_h) continue;
 		if (h > s_h)
 			s_idx = 0;
 		rcu_read_lock_bh();
-		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
+		for (rt = rcu_dereference(htab->table[h].chain), idx = 0; rt;
 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
 			if (idx < s_idx)
 				continue;
@@ -3116,6 +3353,7 @@ __setup("rhash_entries=", set_rhash_entries);
 
 int __init ip_rt_init(void)
 {
+	unsigned int hash_size;
 	int rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
@@ -3138,21 +3376,21 @@ int __init ip_rt_init(void)
 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(num_physpages >= 128 * 1024) ?
-					15 : 17,
-					0,
-					&rt_hash_log,
-					&rt_hash_mask,
-					0);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+	rt_hash = kmalloc(sizeof(struct rt_hash), GFP_ATOMIC);
+	if (!rt_hash)
+		panic("Failed to allocate rt_hash\n");
+	rt_hash->log = MIN_RTHASH_SHIFT;
+	hash_size = 1 << rt_hash->log;
+	rt_hash->mask = hash_size - 1;
+	rt_hash->table = rthash_alloc(hash_size *
+				      sizeof(struct rt_hash_bucket));
+	if (!rt_hash->table)
+		panic("Failed to allocate rt_hash->table\n");
+
 	rt_hash_lock_init();
 
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = (rt_hash->mask + 1);
+	ip_rt_max_size = (rt_hash->mask + 1) * 16;
 
 	devinet_init();
 	ip_fib_init();


* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  4:26 [RFC PATCH]: Dynamically sized routing cache hash table David Miller
@ 2007-03-06  7:14 ` Eric Dumazet
  2007-03-06  7:23   ` David Miller
  2007-03-06 13:42   ` [RFC PATCH]: Dynamically sized routing cache hash table Robert Olsson
  2007-03-06  9:11 ` Nick Piggin
  2007-03-06 13:26 ` Robert Olsson
  2 siblings, 2 replies; 21+ messages in thread
From: Eric Dumazet @ 2007-03-06  7:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, robert.olsson, npiggin

David Miller wrote:
> This is essentially a "port" of Nick Piggin's dcache hash table
> patches to the routing cache.  It solves the locking issues
> during table grow/shrink that I couldn't handle properly last
> time I tried to code up a patch like this.
> 
> But one of the core issues of this kind of change still remains.
> There is a conflict between the desire of routing cache garbage
> collection to reach a state of equilibrium and the hash table
> grow code's desire to match the table size to the current state
> of affairs.
> 
> Actually, more accurately, the conflict exists in how this GC
> logic is implemented.  The core issue is that hash table size
> guides the GC processing, and hash table growth therefore
> modifies those GC goals.  So with the patch below we'll just
> keep growing the hash table instead of giving GC some time to
> try to keep the working set in equilibrium before doing the
> hash grow.
> 
> One idea is to put the hash grow check in the garbage collector,
> and put the hash shrink check in rt_del().
> 
> In fact, it would be a good time to perhaps hack up some entirely
> new passive GC logic for the routing cache.
> 
> BTW, another thing that plays into this is that Robert's TRASH work
> could make this patch not necessary :-)

Well, maybe... but after looking at Robert's trash, I discovered its model is 
essentially a big (2^18 slots) root node (our hash table), and very few 
order:1,2,3 nodes.

Almost all leaves... work in progress anyway.

Please find my comments in your patch
> 
> Finally, I know that (due to some of Nick's helpful comments the
> other day) that I'm missing some rcu_assign_pointer()'s in here.
> Fixes in this area are most welcome.
> 
> This patch passes basic testing on UP sparc64, but please handle
> with care :)
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 0b3d7bf..57e004a 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -92,6 +92,9 @@
>  #include <linux/jhash.h>
>  #include <linux/rcupdate.h>
>  #include <linux/times.h>
> +#include <linux/workqueue.h>
> +#include <linux/vmalloc.h>
> +#include <linux/mutex.h>
>  #include <net/protocol.h>
>  #include <net/ip.h>
>  #include <net/route.h>
> @@ -242,28 +245,195 @@ static spinlock_t	*rt_hash_locks;
>  # define rt_hash_lock_init()
>  #endif
>  
> -static struct rt_hash_bucket 	*rt_hash_table;
> -static unsigned			rt_hash_mask;
> -static int			rt_hash_log;
> -static unsigned int		rt_hash_rnd;
> +#define MIN_RTHASH_SHIFT 4

I wonder... are you sure this has no relation to the size of rt_hash_locks / 
RT_HASH_LOCK_SZ?
One entry must have the same lock in the two tables when resizing is in flight.
#define MIN_RTHASH_SHIFT LOG2(RT_HASH_LOCK_SZ)
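Something along these lines, with ilog2() from <linux/log2.h> standing in
for the LOG2 above:

	#define MIN_RTHASH_SHIFT	ilog2(RT_HASH_LOCK_SZ)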

> +#if BITS_PER_LONG == 32
> +#define MAX_RTHASH_SHIFT 24
> +#else
> +#define MAX_RTHASH_SHIFT 30
> +#endif
> +
> +struct rt_hash {
> +	struct rt_hash_bucket	*table;
> +	unsigned int		mask;
> +	unsigned int		log;
> +};
> +
> +struct rt_hash *rt_hash __read_mostly;
> +struct rt_hash *old_rt_hash __read_mostly;
> +static unsigned int rt_hash_rnd __read_mostly;
> +static DEFINE_SEQLOCK(resize_transfer_lock);
> +static DEFINE_MUTEX(resize_mutex);

I think a better model would be a structure, with one part containing 'read 
mostly' data, and another part with 'highly modified' data, with appropriate 
cacheline alignment between the two.

For example, resize_transfer_lock should be in the first part, like rt_hash 
and old_rt_hash, don't you think?

All the static data of this file should be placed in this single structure so 
that we can easily avoid false sharing and get optimal placement.
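
For example (just a sketch, the struct and field names are invented):

	struct rt_hash_state {
		/* read-mostly part, shared by all lookups */
		struct rt_hash		*cur;
		struct rt_hash		*old;
		unsigned int		rnd;
		seqlock_t		transfer_lock;
		/* write-heavy part, kept on its own cache line */
		spinlock_t		resize_lock ____cacheline_aligned_in_smp;
		unsigned int		resize_new_shift;
	} ____cacheline_aligned_in_smp;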

>  
>  static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
>  #define RT_CACHE_STAT_INC(field) \
>  	(__raw_get_cpu_var(rt_cache_stat).field++)
>  
> -static int rt_intern_hash(unsigned hash, struct rtable *rth,
> -				struct rtable **res);
> +static void rt_hash_resize(unsigned int new_shift);
> +static void check_nr_rthash(void)
> +{
> +	unsigned int sz = rt_hash->mask + 1;
> +	unsigned int nr = atomic_read(&ipv4_dst_ops.entries);
> +
> +	if (unlikely(nr > (sz + (sz >> 1))))
> +		rt_hash_resize(rt_hash->log + 1);
> +	else if (unlikely(nr < (sz >> 1)))
> +		rt_hash_resize(rt_hash->log - 1);
> +}
>  
> -static unsigned int rt_hash_code(u32 daddr, u32 saddr)
> +static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
> +{
> +	struct rt_hash_bucket *n;
> +
> +	if (sz <= PAGE_SIZE)
> +		n = kmalloc(sz, GFP_KERNEL);
> +	else if (hashdist)
> +		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
> +	else
> +		n = (struct rt_hash_bucket *)
> +			__get_free_pages(GFP_KERNEL, get_order(sz));

I don't feel comfortable with this.
Maybe we could try __get_free_pages() first and, in case of failure, fall back 
to vmalloc(), keeping a flag so the memory can be freed correctly. Anyway, if 
(get_order(sz) >= MAX_ORDER) we know __get_free_pages() will fail.



> +
> +	if (n)
> +		memset(n, 0, sz);
> +
> +	return n;
> +}
> +
> +static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
> +{
> +	if (sz <= PAGE_SIZE)
> +		kfree(r);
> +	else if (hashdist)
> +		vfree(r);
> +	else
> +		free_pages((unsigned long)r, get_order(sz));
> +}
> +
> +static unsigned int rt_hash_code(struct rt_hash *hashtable,
> +				 u32 daddr, u32 saddr)

Could you add const qualifiers to 'struct rt_hash *' in prototypes where 
appropriate ?

>  {
>  	return (jhash_2words(daddr, saddr, rt_hash_rnd)
> -		& rt_hash_mask);
> +		& hashtable->mask);
>  }
>  
> -#define rt_hash(daddr, saddr, idx) \
> -	rt_hash_code((__force u32)(__be32)(daddr),\
> +#define rt_hashfn(htab, daddr, saddr, idx) \
> +	rt_hash_code(htab, (__force u32)(__be32)(daddr),\
>  		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
>  
> +static unsigned int resize_new_shift;
> +
> +static void rt_hash_resize_work(struct work_struct *work)
> +{
> +	struct rt_hash *new_hash, *old_hash;
> +	unsigned int new_size, old_size, transferred;
> +	int i;
> +
> +	if (!mutex_trylock(&resize_mutex))
> +		goto out;
> +
> +	new_hash = kmalloc(sizeof(struct rt_hash), GFP_KERNEL);
> +	if (!new_hash)
> +		goto out_unlock;
> +
> +	new_hash->log = resize_new_shift;
> +	new_size = 1 << new_hash->log;
> +	new_hash->mask = new_size - 1;
> +	new_hash->table = rthash_alloc(new_size*sizeof(struct hlist_head));

Maybe for small tables (less than PAGE_SIZE/2) we could embed the bucket array 
directly in 'struct rt_hash'.

> +	if (!new_hash->table)
> +		goto out_kfree;
> +
> +	old_rt_hash = rt_hash;
> +	/*
> +	 * ensure that if the reader sees the new dentry_hash,
> +	 * then they will also see the old_dentry_hash assignment,
> +	 * above.
> +	 */
> +	smp_wmb();
> +	rt_hash = new_hash;
> +	synchronize_rcu();
> +
> +	old_size = 1 << old_rt_hash->log;
> +	transferred = 0;
> +	for (i = 0; i < old_size; i++) {
> +		struct rtable **head = &old_rt_hash->table[i].chain;
> +
> +		if (!*head)
> +			continue;
> +
> +		spin_lock_bh(rt_hash_lock_addr(i));
> +		write_seqlock(&resize_transfer_lock);
> +		while (*head) {
> +			struct rtable *rth = *head;
> +			int iface = rth->fl.iif;
> +			unsigned int hash;
> +
> +			if (!iface)
> +				iface = rth->fl.oif;
> +
> +			*head = rth->u.dst.rt_next;
> +
> +			hash = rt_hashfn(rt_hash,
> +					 rth->fl.fl4_dst,
> +					 rth->fl.fl4_src,
> +					 iface);
> +			rth->u.dst.rt_next = rt_hash->table[hash].chain;
> +			rt_hash->table[hash].chain = rth;
> +
> +			transferred++;
> +		}
> +		write_sequnlock(&resize_transfer_lock);
> +		spin_unlock_bh(rt_hash_lock_addr(i));
> +	}
> +
> +	printk("resize route hash from %u to %u, moved %u entries\n",
> +	       old_size, new_size, transferred);
> +
> +	old_hash = old_rt_hash;
> +	old_rt_hash = NULL;
> +	mutex_unlock(&resize_mutex);
> +	synchronize_rcu();
> +	rthash_free(old_hash->table, old_size * sizeof(struct rt_hash_bucket));
> +	kfree(old_hash);
> +
> +	resize_new_shift = 0;
> +	return;
> +
> +out_kfree:
> +	kfree(new_hash);
> +out_unlock:
> +	mutex_unlock(&resize_mutex);
> +out:
> +	resize_new_shift = 0;
> +	return;
> +}
> +
> +static DEFINE_SPINLOCK(resize_lock);

Could we group all the static vars at the beginning of this file, so that we 
can clearly see where we should place them to avoid false sharing?

> [... remainder of the patch quoted unchanged, with no further comments ...]



* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  7:14 ` Eric Dumazet
@ 2007-03-06  7:23   ` David Miller
  2007-03-06  7:58     ` Eric Dumazet
  2007-03-06 13:42   ` [RFC PATCH]: Dynamically sized routing cache hash table Robert Olsson
  1 sibling, 1 reply; 21+ messages in thread
From: David Miller @ 2007-03-06  7:23 UTC (permalink / raw)
  To: dada1; +Cc: netdev, robert.olsson, npiggin

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 06 Mar 2007 08:14:46 +0100

> I wonder... are you sure this has no relation with the size of rt_hash_locks / 
> RT_HASH_LOCK_SZ ?
> One entry must have the same lock in the two tables when resizing is in flight.
> #define MIN_RTHASH_SHIFT LOG2(RT_HASH_LOCK_SZ)

Good point.

> > +static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
> > +{
> > +	struct rt_hash_bucket *n;
> > +
> > +	if (sz <= PAGE_SIZE)
> > +		n = kmalloc(sz, GFP_KERNEL);
> > +	else if (hashdist)
> > +		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
> > +	else
> > +		n = (struct rt_hash_bucket *)
> > +			__get_free_pages(GFP_KERNEL, get_order(sz));
> 
> I dont feel well with this.
> Maybe we could try a __get_free_pages(), and in case of failure, fallback to 
> vmalloc(). Then keep a flag to be able to free memory correctly. Anyway, if 
> (get_order(sz)>=MAX_ORDER) we know __get_free_pages() will fail.

We have to use vmalloc() for the hashdist case so that the pages
are spread out properly on NUMA systems.  That's exactly what the
large system hash allocator is going to do on bootup anyways.

Look, either both are right or both are wrong.  I'm just following
protocol above and you'll note the PRECISE same logic exists in other
dynamically growing hash table implementations such as
net/xfrm/xfrm_hash.c
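
For reference, the allocator there looks roughly like this (quoting from
memory, so the exact zeroing flags may be off):

	void *xfrm_hash_alloc(unsigned int sz)
	{
		void *n;

		if (sz <= PAGE_SIZE)
			n = kzalloc(sz, GFP_KERNEL);
		else if (hashdist)
			n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
		else
			n = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
						     get_order(sz));
		return n;
	}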

> Could you add const qualifiers to 'struct rt_hash *' in prototypes where 
> appropriate ?

Sure, no problem.

> Maybe that for small tables (less than PAGE_SIZE/2), we could embed them in 
> 'struct rt_hash'

Not worth the pain nor the in-kernel-image-space it would chew up,
in my opinion.

After you visit a handful of web sites you'll get beyond that
threshold.

> Could we group all static vars at the begining of this file, so that we 
> clearly see where we should place them, to avoid false sharing.

Sure.

> > +
> > +static void rt_hash_resize(unsigned int new_shift)

Damn, please don't quote such huge portions of a patch without any
comments, this has to go out to several thousand recipients you know
:-/


* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  7:23   ` David Miller
@ 2007-03-06  7:58     ` Eric Dumazet
  2007-03-06  9:05       ` David Miller
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2007-03-06  7:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, robert.olsson, npiggin

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 06 Mar 2007 08:14:46 +0100
> 
>> I wonder... are you sure this has no relation with the size of rt_hash_locks / 
>> RT_HASH_LOCK_SZ ?
>> One entry must have the same lock in the two tables when resizing is in flight.
>> #define MIN_RTHASH_SHIFT LOG2(RT_HASH_LOCK_SZ)
> 
> Good point.
> 
>>> +static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
>>> +{
>>> +	struct rt_hash_bucket *n;
>>> +
>>> +	if (sz <= PAGE_SIZE)
>>> +		n = kmalloc(sz, GFP_KERNEL);
>>> +	else if (hashdist)
>>> +		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
>>> +	else
>>> +		n = (struct rt_hash_bucket *)
>>> +			__get_free_pages(GFP_KERNEL, get_order(sz));
>> I dont feel well with this.
>> Maybe we could try a __get_free_pages(), and in case of failure, fallback to 
>> vmalloc(). Then keep a flag to be able to free memory correctly. Anyway, if 
>> (get_order(sz)>=MAX_ORDER) we know __get_free_pages() will fail.
> 
> We have to use vmalloc() for the hashdist case so that the pages
> are spread out properly on NUMA systems.  That's exactly what the
> large system hash allocator is going to do on bootup anyways.

Yes, but at bootup you have an appropriate NUMA allocation policy active 
(well... we hope so, but it has broken several times in the past).
I am not sure what kind of mm policy is active for scheduled work items.

Anyway I have some XX GB machines, non NUMA, and I would love to be able to 
have a 2^20 slots hash table, without having to increase MAX_ORDER.
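
(To put numbers on it: 2^20 buckets at one chain pointer each is 8 MB on
64-bit, i.e. an order-11 allocation with 4K pages, while the default
MAX_ORDER of 11 caps __get_free_pages() at order 10, so only the vmalloc()
path can satisfy it.)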

> 
> Look, either both are right or both are wrong.  I'm just following
> protocol above and you'll note the PRECISE same logic exists in other
> dynamically growing hash table implementations such as
> net/xfrm/xfrm_hash.c
> 


Yes, they are both wrong/dumb :)

Can we be smarter, or do we have to stay dumb? :)

enum rthash_alloc_kind {
	allocated_by_kmalloc,
	allocated_by_get_free_pages,
	allocated_by_vmalloc,
};

static struct rt_hash_bucket *rthash_alloc(unsigned int sz,
					   enum rthash_alloc_kind *kind)
{
	struct rt_hash_bucket *n = NULL;

	if (sz <= PAGE_SIZE) {
		n = kmalloc(sz, GFP_KERNEL);
		*kind = allocated_by_kmalloc;
	} else if (!hashdist) {
		n = (struct rt_hash_bucket *)
			__get_free_pages(GFP_KERNEL, get_order(sz));
		*kind = allocated_by_get_free_pages;
	}
	if (!n) {
		/* small kmalloc or high-order allocation failed
		 * (or hashdist): fall back to vmalloc */
		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
		*kind = allocated_by_vmalloc;
	}
	if (n)
		memset(n, 0, sz);
	return n;
}



* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  7:58     ` Eric Dumazet
@ 2007-03-06  9:05       ` David Miller
  2007-03-06 10:33         ` [PATCH] NET : Optimizes inet_getpeer() Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: David Miller @ 2007-03-06  9:05 UTC (permalink / raw)
  To: dada1; +Cc: netdev, robert.olsson, npiggin

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 06 Mar 2007 08:58:45 +0100

> Yes, but on bootup you have an appropriate NUMA active policy. (Well... we 
> hope so, but it broke several time in the past)
> I am not sure what kind of mm policy is active for scheduled works.

Good point, that definitely needs investigation.

Thanks for all of your comments so far Eric, very useful :)


* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  4:26 [RFC PATCH]: Dynamically sized routing cache hash table David Miller
  2007-03-06  7:14 ` Eric Dumazet
@ 2007-03-06  9:11 ` Nick Piggin
  2007-03-06  9:17   ` David Miller
  2007-03-06  9:23   ` Eric Dumazet
  2007-03-06 13:26 ` Robert Olsson
  2 siblings, 2 replies; 21+ messages in thread
From: Nick Piggin @ 2007-03-06  9:11 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, dada1, robert.olsson, Paul McKenney

On Mon, Mar 05, 2007 at 08:26:32PM -0800, David Miller wrote:
> 
> This is essentially a "port" of Nick Piggin's dcache hash table
> patches to the routing cache.  It solves the locking issues
> during table grow/shrink that I couldn't handle properly last
> time I tried to code up a patch like this.
> 
> But one of the core issues of this kind of change still remains.
> There is a conflict between the desire of routing cache garbage
> collection to reach a state of equilibrium and the hash table
> grow code's desire to match the table size to the current state
> of affairs.
> 
> Actually, more accurately, the conflict exists in how this GC
> logic is implemented.  The core issue is that hash table size
> guides the GC processing, and hash table growth therefore
> modifies those GC goals.  So with the patch below we'll just
> keep growing the hash table instead of giving GC some time to
> try to keep the working set in equilibrium before doing the
> hash grow.
> 
> One idea is to put the hash grow check in the garbage collector,
> and put the hash shrink check in rt_del().
> 
> In fact, it would be a good time to perhaps hack up some entirely
> new passive GC logic for the routing cache.
> 
> BTW, another thing that plays into this is that Robert's TRASH work
> could make this patch not necessary :-)
> 
> Finally, I know that (due to some of Nick's helpful comments the
> other day) that I'm missing some rcu_assign_pointer()'s in here.
> Fixes in this area are most welcome.

Cool! I have some fixes for the rcu barrier issues, with some C-style
comments and questions :)

I was going to send you a fix first for the rcu barriers, then a
second to convert the read-side to a barrier-less one that I described.
However, considering that your patch is a work in progress anyway, I
won't worry too much about the normal protocol.

I _think_ my reasoning regarding the rcu barriers and grace periods
is correct. I'll keep thinking about it though. (Paul cc'ed).

I'm not so familiar with this code, so I have sprinkled around a lot
of comments that could be pure crap ;) They are mainly just to help
you ensure that you cover all bases... compile tested only at this
stage.

--
Index: linux-2.6/net/ipv4/route.c
===================================================================
--- linux-2.6.orig/net/ipv4/route.c
+++ linux-2.6/net/ipv4/route.c
@@ -311,6 +311,8 @@ static void rthash_free(struct rt_hash_b
 static unsigned int rt_hash_code(struct rt_hash *hashtable,
 				 u32 daddr, u32 saddr)
 {
+	/* BUG_ON(!rcu_read_protected()) */
+
 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
 		& hashtable->mask);
 }
@@ -343,11 +345,16 @@ static void rt_hash_resize_work(struct w
 
 	old_rt_hash = rt_hash;
 	/*
-	 * ensure that if the reader sees the new dentry_hash,
-	 * then they will also see the old_dentry_hash assignment,
-	 * above.
+	 * ensure that if the reader sees the new rt_hash, then they will also
+	 * see the old_rt_hash assignment, above. synchronize_rcu() is used
+	 * rather than smp_wmb(), in order to avoid the smp_rmb() in the
+	 * read-side. However, synchronize_rcu() also implies a smp_wmb(), so
+	 * that also means we can skip rcu_assign_pointer().
+	 *
+	 * The readers can then also skip rcu_dereference, because a grace
+	 * period implies that all readers have performed memory barriers.
 	 */
-	smp_wmb();
+	synchronize_rcu();
 	rt_hash = new_hash;
 	synchronize_rcu();
 
@@ -1100,6 +1107,8 @@ static int rt_intern_hash(struct rt_hash
 	int		chain_length;
 	int attempts = !in_softirq();
 
+	/* BUG_ON(!rcu_read_protected()) */
+
 restart:
 	chain_length = 0;
 	min_score = ~(u32)0;
@@ -1286,6 +1295,8 @@ static void rt_del(struct rt_hash *h, un
 {
 	struct rtable **rthp;
 
+	/* BUG_ON(!rcu_read_protected()) */
+
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &h->table[hash].chain; *rthp;
@@ -1328,12 +1339,24 @@ void ip_rt_redirect(__be32 old_gw, __be3
 
 	for (i = 0; i < 2; i++) {
 		for (k = 0; k < 2; k++) {
-			struct rt_hash *h = rt_hash;
-			unsigned hash = rt_hashfn(h, daddr, skeys[i], ikeys[k]);
+			struct rt_hash *h;
+			unsigned hash;
+
+			/*
+			 * rcu_read_lock() must cover the load of rt_hash, in
+			 * order to satisfy our RCU protected dynamic hash
+			 * sizing scheme; and it must also cover the hash list
+			 * traversal, to satisfy our RCU protected lockless
+			 * hash entry lookups.
+			 *
+			 * This note applies throughout the file.
+			 */
+			rcu_read_lock();
+			h = rt_hash;
+			hash = rt_hashfn(h, daddr, skeys[i], ikeys[k]);
 
 			rthp=&h->table[hash].chain;
 
-			rcu_read_lock();
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
 
@@ -1449,6 +1472,12 @@ static struct dst_entry *ipv4_negative_a
 					  "%u.%u.%u.%u/%02x dropped\n",
 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
 #endif
+			/* XXX:
+			 * What if rt does not exist in rt_hash, but is in
+			 * old_rt_hash? Don't we have to also check there?
+			 * Similar questions for a couple of other places that
+			 * look at rt_hash, but not old_rt_hash.
+			 */
 			rt_del(h, hash, rt);
 			ret = NULL;
 		}
@@ -1587,10 +1616,13 @@ unsigned short ip_rt_frag_needed(struct 
 		return 0;
 
 	for (i = 0; i < 2; i++) {
-		struct rt_hash *h = rt_hash;
-		unsigned hash = rt_hashfn(h, daddr, skeys[i], 0);
+		struct rt_hash *h;
+		unsigned hash;
 
 		rcu_read_lock();
+		h = rt_hash;
+		hash = rt_hashfn(h, daddr, skeys[i], 0);
+
 		for (rth = rcu_dereference(h->table[hash].chain); rth;
 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
 			if (rth->fl.fl4_dst == daddr &&
@@ -2269,6 +2301,8 @@ static int __input_find(struct rt_hash *
 	unsigned int hash = rt_hashfn(h, daddr, saddr, iif);
 	struct rtable *rth;
 
+	/* BUG_ON(!rcu_read_protected()) */
+
 	for (rth = rcu_dereference(h->table[hash].chain); rth;
 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 		if (rth->fl.fl4_dst == daddr &&
@@ -2301,7 +2335,6 @@ int ip_route_input(struct sk_buff *skb, 
 
 	rcu_read_lock();
 	htab = rt_hash;
-	smp_rmb();
 	old_htab = old_rt_hash;
 	if (unlikely(old_htab)) {
 		unsigned long seq;
@@ -2768,6 +2801,8 @@ static int __output_find(struct rt_hash 
 	unsigned int hash = rt_hashfn(h, flp->fl4_dst, flp->fl4_src, flp->oif);
 	struct rtable *rth;
 
+	/* BUG_ON(!rcu_read_protected()) */
+
 	for (rth = rcu_dereference(h->table[hash].chain); rth;
 		rth = rcu_dereference(rth->u.dst.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
@@ -2807,7 +2842,6 @@ int __ip_route_output_key(struct rtable 
 
 	rcu_read_lock_bh();
 	htab = rt_hash;
-	smp_rmb();
 	old_htab = old_rt_hash;
 	if (unlikely(old_htab)) {
 		unsigned long seq;
@@ -3048,18 +3082,20 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct rt_hash *htab = rt_hash;
+	struct rt_hash *htab;
 	struct rtable *rt;
 	int h, s_h;
 	int idx, s_idx;
 
 	s_h = cb->args[0];
 	s_idx = idx = cb->args[1];
+
+	rcu_read_lock_bh();
+	htab = rt_hash;
 	for (h = 0; h <= htab->mask; h++) {
 		if (h < s_h) continue;
 		if (h > s_h)
 			s_idx = 0;
-		rcu_read_lock_bh();
 		for (rt = rcu_dereference(htab->table[h].chain), idx = 0; rt;
 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
 			if (idx < s_idx)
@@ -3069,15 +3105,14 @@ int ip_rt_dump(struct sk_buff *skb, stru
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
-				rcu_read_unlock_bh();
 				goto done;
 			}
 			dst_release(xchg(&skb->dst, NULL));
 		}
-		rcu_read_unlock_bh();
 	}
-
 done:
+	rcu_read_unlock_bh();
+
 	cb->args[0] = h;
 	cb->args[1] = idx;
 	return skb->len;

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  9:11 ` Nick Piggin
@ 2007-03-06  9:17   ` David Miller
  2007-03-06  9:22     ` Nick Piggin
  2007-03-06  9:23   ` Eric Dumazet
  1 sibling, 1 reply; 21+ messages in thread
From: David Miller @ 2007-03-06  9:17 UTC (permalink / raw)
  To: npiggin; +Cc: netdev, dada1, robert.olsson, paulmck

From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Mar 2007 10:11:12 +0100

> @@ -1449,6 +1472,12 @@ static struct dst_entry *ipv4_negative_a
>  					  "%u.%u.%u.%u/%02x dropped\n",
>  				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
>  #endif
> +			/* XXX:
> +			 * What if rt does not exist in rt_hash, but is in
> +			 * old_rt_hash? Don't we have to also check there?
> +			 * Similar questions for a couple of other places that
> +			 * look at rt_hash, but not old_rt_hash.
> +			 */
>  			rt_del(h, hash, rt);
>  			ret = NULL;
>  		}

For the cases like ip_rt_redirect() I made the decision that we'll
just not add the complexity of having to look in the old_rt_hash
table.

In these kinds of cases it's OK to miss events; they will just happen
again.

It's one of the nice things about the routing cache: if you lose
information it's OK, because we'll just cook up a new entry from
the persistent backing store that is the real routing tables.
And for events like redirects, if we miss one, we'll just send the
packet to the wrong next-hop again and receive another redirect
message, which we'll (hopefully) propagate to a routing cache
entry.

Thanks for your feedback patch, Nick. I'll process it tomorrow,
hopefully after getting some sleep.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  9:17   ` David Miller
@ 2007-03-06  9:22     ` Nick Piggin
  0 siblings, 0 replies; 21+ messages in thread
From: Nick Piggin @ 2007-03-06  9:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, dada1, robert.olsson, paulmck

On Tue, Mar 06, 2007 at 01:17:06AM -0800, David Miller wrote:
> From: Nick Piggin <npiggin@suse.de>
> Date: Tue, 6 Mar 2007 10:11:12 +0100
> 
> > @@ -1449,6 +1472,12 @@ static struct dst_entry *ipv4_negative_a
> >  					  "%u.%u.%u.%u/%02x dropped\n",
> >  				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
> >  #endif
> > +			/* XXX:
> > +			 * What if rt does not exist in rt_hash, but is in
> > +			 * old_rt_hash? Don't we have to also check there?
> > +			 * Similar questions for a couple of other places that
> > +			 * look at rt_hash, but not old_rt_hash.
> > +			 */
> >  			rt_del(h, hash, rt);
> >  			ret = NULL;
> >  		}
> 
> For the cases like ip_rt_redirect() I made the decision that we'll
> just not add the complexity of having to look in the old_rt_hash
> table.
> 
> In these kinds of cases it's OK to miss events; they will just happen
> again.
> 
> It's one of the nice things about the routing cache: if you lose
> information it's OK, because we'll just cook up a new entry from
> the persistent backing store that is the real routing tables.
> And for events like redirects, if we miss one, we'll just send the
> packet to the wrong next-hop again and receive another redirect
> message, which we'll (hopefully) propagate to a routing cache
> entry.

Ah, that's a very neat trick. OK, so with that question out of the
way, there _may_ just be a few other places where you're working
with an rt_hash table outside an RCU read-side critical section.

I tried to fix up a couple of obvious ones, and I've just put in
the BUG_ON assertions for you to verify the rest.

> Thanks for your feedback patch, Nick. I'll process it tomorrow,
> hopefully after getting some sleep.

No problem. Thanks.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  9:11 ` Nick Piggin
  2007-03-06  9:17   ` David Miller
@ 2007-03-06  9:23   ` Eric Dumazet
  2007-03-06  9:41     ` Nick Piggin
  1 sibling, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2007-03-06  9:23 UTC (permalink / raw)
  To: Nick Piggin; +Cc: David Miller, netdev, robert.olsson, Paul McKenney

On Tuesday 06 March 2007 10:11, Nick Piggin wrote:

> Cool! I have some fixes for the rcu barrier issues, with some C-style
> comments and questions :)
>
> I was going to send you a fix first for the rcu barriers, then a
> second to convert the read-side to a barrier-less one that I described.
> However, considering that your patch is a work in progress anyway, I
> won't worry too much about the normal protocol.
>
> I _think_ my reasoning regarding the rcu barriers and grace periods
> is correct. I'll keep thinking about it though. (Paul cc'ed).
>
> I'm not so familiar with this code, so I have sprinkled around a lot
> of comments that could be pure crap ;) They are mainly just to help
> you ensure that you cover all bases... compile tested only at this
> stage.

I think we missed:

+static void rt_hash_resize_work(struct work_struct *work)

+
+			*head = rth->u.dst.rt_next;
+
+			hash = rt_hashfn(rt_hash,
+					 rth->fl.fl4_dst,
+					 rth->fl.fl4_src,
+					 iface);
+			rth->u.dst.rt_next = rt_hash->table[hash].chain;
+			rt_hash->table[hash].chain = rth;

This really needs some ..._del_rcu() / ..._add_rcu() primitives, no?
Or else a reader might be very confused...
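
Concretely, the relink would then publish the new chain head through
rcu_assign_pointer() (or an ..._add_rcu()-style helper), roughly like the
sketch below. This is illustrative only, not a posted patch, and whether the
extra barrier is actually needed is questioned in the follow-up:

			/* unlink from the old table's chain; the writer holds
			 * the hash lock, readers run under RCU */
			*head = rth->u.dst.rt_next;

			hash = rt_hashfn(rt_hash,
					 rth->fl.fl4_dst,
					 rth->fl.fl4_src,
					 iface);
			rth->u.dst.rt_next = rt_hash->table[hash].chain;
			/* publish with the write barrier rcu_assign_pointer()
			 * provides, so a reader that sees the new chain head
			 * also sees the rt_next store above */
			rcu_assign_pointer(rt_hash->table[hash].chain, rth);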


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  9:23   ` Eric Dumazet
@ 2007-03-06  9:41     ` Nick Piggin
  0 siblings, 0 replies; 21+ messages in thread
From: Nick Piggin @ 2007-03-06  9:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, robert.olsson, Paul McKenney

On Tue, Mar 06, 2007 at 10:23:44AM +0100, Eric Dumazet wrote:
> On Tuesday 06 March 2007 10:11, Nick Piggin wrote:
> 
> > Cool! I have some fixes for the rcu barrier issues, with some C-style
> > comments and questions :)
> >
> > I was going to send you a fix first for the rcu barriers, then a
> > second to convert the read-side to a barrier-less one that I described.
> > However, considering that your patch is a work in progress anyway, I
> > won't worry too much about the normal protocol.
> >
> > I _think_ my reasoning regarding the rcu barriers and grace periods
> > is correct. I'll keep thinking about it though. (Paul cc'ed).
> >
> > I'm not so familiar with this code, so I have sprinkled around a lot
> > of comments that could be pure crap ;) They are mainly just to help
> > you ensure that you cover all bases... compile tested only at this
> > stage.
> 
> I think we missed :
> 
> +static void rt_hash_resize_work(struct work_struct *work)
> 
> +
> +			*head = rth->u.dst.rt_next;
> +
> +			hash = rt_hashfn(rt_hash,
> +					 rth->fl.fl4_dst,
> +					 rth->fl.fl4_src,
> +					 iface);
> +			rth->u.dst.rt_next = rt_hash->table[hash].chain;
> +			rt_hash->table[hash].chain = rth;
> 
> This really needs some ..._del_rcu()/..._add_rcu()_ ... primitives, no ?
> Or else a reader might be very confused...

I'm not sure... this code really depends on the hash table management,
rather than the management of the hash tables, if you understand me ;)

From what I can _see_, this is similar to how rt_intern_hash does it.
I don't know exactly why rt_intern_hash can get away without using
rcu_assign_pointer in some cases, however:

Note that we don't need an rcu_assign_pointer for this, because the
memory operations that initialized the entry have already been ordered
when it was first inserted.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] NET : Optimizes inet_getpeer()
  2007-03-06  9:05       ` David Miller
@ 2007-03-06 10:33         ` Eric Dumazet
  2007-03-07  4:23           ` David Miller
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2007-03-06 10:33 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 588 bytes --]

Hi David

Please find this patch against net-2.6.22

Thank you

[PATCH] NET : Optimizes inet_getpeer()

1) Some sysctl vars are declared __read_mostly

2) We can avoid updating stack[] when doing an AVL lookup only.

    The lookup() macro is extended to receive a second parameter, which may be
NULL in the case of a pure lookup (no need to save the AVL path). This removes
unnecessary instructions, because the compiler knows whether the _stack
parameter is NULL or not.

    text size of net/ipv4/inetpeer.o is 2063 bytes instead of 2107 on x86_64

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: inetpeer.patch --]
[-- Type: text/plain, Size: 3282 bytes --]

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index db3ef96..2f44e61 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -87,10 +87,12 @@ #define PEER_MAXDEPTH 40 /* sufficient f
 
 static int peer_total;
 /* Exported for sysctl_net_ipv4.  */
-int inet_peer_threshold = 65536 + 128;	/* start to throw entries more
+int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
 					 * aggressively at this stage */
-int inet_peer_minttl = 120 * HZ;	/* TTL under high load: 120 sec */
-int inet_peer_maxttl = 10 * 60 * HZ;	/* usual time to live: 10 min */
+int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
+int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
+int inet_peer_gc_mintime __read_mostly = 10 * HZ;
+int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
 
 static struct inet_peer *inet_peer_unused_head;
 static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
@@ -99,9 +101,6 @@ static DEFINE_SPINLOCK(inet_peer_unused_
 static void peer_check_expire(unsigned long dummy);
 static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
 
-/* Exported for sysctl_net_ipv4.  */
-int inet_peer_gc_mintime = 10 * HZ,
-    inet_peer_gc_maxtime = 120 * HZ;
 
 /* Called from ip_output.c:ip_init  */
 void __init inet_initpeers(void)
@@ -151,20 +150,27 @@ static void unlink_from_unused(struct in
 	spin_unlock_bh(&inet_peer_unused_lock);
 }
 
-/* Called with local BH disabled and the pool lock held. */
-#define lookup(daddr) 						\
+/*
+ * Called with local BH disabled and the pool lock held.
+ * _stack is known to be NULL or not at compile time,
+ * so compiler will optimize the if (_stack) tests.
+ */
+#define lookup(_daddr,_stack) 					\
 ({								\
 	struct inet_peer *u, **v;				\
-	stackptr = stack;					\
-	*stackptr++ = &peer_root;				\
+	if (_stack) {						\
+		stackptr = _stack;				\
+		*stackptr++ = &peer_root;			\
+	}							\
 	for (u = peer_root; u != peer_avl_empty; ) {		\
-		if (daddr == u->v4daddr)			\
+		if (_daddr == u->v4daddr)			\
 			break;					\
-		if ((__force __u32)daddr < (__force __u32)u->v4daddr)	\
+		if ((__force __u32)_daddr < (__force __u32)u->v4daddr)	\
 			v = &u->avl_left;			\
 		else						\
 			v = &u->avl_right;			\
-		*stackptr++ = v;				\
+		if (_stack)					\
+			*stackptr++ = v;			\
 		u = *v;						\
 	}							\
 	u;							\
@@ -288,7 +294,7 @@ static void unlink_from_pool(struct inet
 	if (atomic_read(&p->refcnt) == 1) {
 		struct inet_peer **stack[PEER_MAXDEPTH];
 		struct inet_peer ***stackptr, ***delp;
-		if (lookup(p->v4daddr) != p)
+		if (lookup(p->v4daddr, stack) != p)
 			BUG();
 		delp = stackptr - 1; /* *delp[0] == p */
 		if (p->avl_left == peer_avl_empty) {
@@ -373,7 +379,7 @@ struct inet_peer *inet_getpeer(__be32 da
 
 	/* Look up for the address quickly. */
 	read_lock_bh(&peer_pool_lock);
-	p = lookup(daddr);
+	p = lookup(daddr, NULL);
 	if (p != peer_avl_empty)
 		atomic_inc(&p->refcnt);
 	read_unlock_bh(&peer_pool_lock);
@@ -400,7 +406,7 @@ struct inet_peer *inet_getpeer(__be32 da
 
 	write_lock_bh(&peer_pool_lock);
 	/* Check if an entry has suddenly appeared. */
-	p = lookup(daddr);
+	p = lookup(daddr, stack);
 	if (p != peer_avl_empty)
 		goto out_free;
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  4:26 [RFC PATCH]: Dynamically sized routing cache hash table David Miller
  2007-03-06  7:14 ` Eric Dumazet
  2007-03-06  9:11 ` Nick Piggin
@ 2007-03-06 13:26 ` Robert Olsson
  2007-03-06 22:20   ` David Miller
  2 siblings, 1 reply; 21+ messages in thread
From: Robert Olsson @ 2007-03-06 13:26 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, dada1, robert.olsson, npiggin


David Miller writes:
 
 Interesting.
 
 > Actually, more accurately, the conflict exists in how this GC
 > logic is implemented.  The core issue is that hash table size
 > guides the GC processing, and hash table growth therefore
 > modifies those GC goals.  So with the patch below we'll just
 > keep growing the hash table instead of giving GC some time to
 > try to keep the working set in equilibrium before doing the
 > hash grow.
 
 AFAIK the equilibrium logic is a resizing function as well, just using a
 fixed hash table. So can we do without equilibrium resizing if tables
 are dynamic?  I think so....

 With the hash data structure we could monitor the average chain
 length, or just the size, and resize the hash based on that.
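
 As a rough sketch of that (purely illustrative, not from any posted patch),
 the insert path could watch the chain it just walked and ask for one more
 bit of table when chains get long; ip_rt_gc_elasticity is only used here as
 a convenient existing knob:

	/* in rt_intern_hash(), after walking this bucket's chain */
	if (chain_length > ip_rt_gc_elasticity &&
	    rt_hash->log < MAX_RTHASH_SHIFT)
		rt_hash_resize(rt_hash->log + 1);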

 > One idea is to put the hash grow check in the garbage collector,
 > and put the hash shrink check in rt_del().
 > 
 > In fact, it would be a good time to perhaps hack up some entirely
 > new passive GC logic for the routing cache.

 Could be; remember there is also the GC in the hash chain, which was added
 later. Although it doesn't decrease the number of entries, it gives
 an upper limit. Also, the gc-goal must be picked so it does not force
 unwanted resizing.

 > BTW, another thing that plays into this is that Robert's TRASH work
 > could make this patch not necessary :-)

 It has "built-in" resize and chain control and the gc-goal is chosen not 
 to unnecessary resize the root node. 

 Cheers.
					--ro

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06  7:14 ` Eric Dumazet
  2007-03-06  7:23   ` David Miller
@ 2007-03-06 13:42   ` Robert Olsson
  2007-03-06 14:18     ` Eric Dumazet
  1 sibling, 1 reply; 21+ messages in thread
From: Robert Olsson @ 2007-03-06 13:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, robert.olsson, npiggin


Eric Dumazet writes:

 > Well, maybe... but after looking at Robert's trash, I discovered its model is
 > essentially a big (2^18 slots) root node (our hash table), and very few 
 > order:1,2,3 nodes.
 
 It's getting "hashlike", yes. I guess all effective algorithms today do
 some sort of "index" lookup, and for a large number of entries we cannot expect
 to find the next node in the same cache line, so the "tree depth" becomes a
 crucial performance factor. IMO nothing can beat a perfectly distributed and
 perfectly sized hash. The trash work is an effort to get close to that with a
 dynamic data structure.

 Cheers
						--ro

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 13:42   ` [RFC PATCH]: Dynamically sized routing cache hash table Robert Olsson
@ 2007-03-06 14:18     ` Eric Dumazet
  2007-03-06 17:05       ` Robert Olsson
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2007-03-06 14:18 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David Miller, netdev, robert.olsson, npiggin

On Tuesday 06 March 2007 14:42, Robert Olsson wrote:
> Eric Dumazet writes:
>  > Well, maybe... but after looking at Robert's trash, I discovered its model
>  > is essentially a big (2^18 slots) root node (our hash table), and very
>  > few order:1,2,3 nodes.
>
>  It's getting "hashlike", yes. I guess all effective algorithms today do
> some sort of "index" lookup, and for a large number of entries we cannot
> expect to find the next node in the same cache line, so the "tree depth"
> becomes a crucial performance factor. IMO nothing can beat a perfectly
> distributed and perfectly sized hash. The trash work is an effort to get
> close to that with a dynamic data structure.

Indeed. It would be nice to see how it performs with say 2^20 elements...
Because with your data, I wonder if the extra complexity of the trash is worth 
it (since most lookups are going to only hit the hash and give the answer 
without intermediate nodes)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 14:18     ` Eric Dumazet
@ 2007-03-06 17:05       ` Robert Olsson
  2007-03-06 17:20         ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Robert Olsson @ 2007-03-06 17:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Robert Olsson, David Miller, netdev, robert.olsson, npiggin


Eric Dumazet writes:

 > Indeed. It would be nice to see how it performs with say 2^20 elements...
 > Because with your data, I wonder if the extra complexity of the trash is worth 
 > it (since most lookups are going to only hit the hash and give the answer 
 > without intermediate nodes)

 I don't know if I understand you fully. Yes, in most cases the first lookup via
 the "hash-header" will take us directly to the correct leaf. If there are "collisions"
 we have to sort them out by adding intermediate nodes.

 Something like a resizable hash where each bucket is in turn a
 resizable hash, etc.

 Cheers
						--ro

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 17:05       ` Robert Olsson
@ 2007-03-06 17:20         ` Eric Dumazet
  2007-03-06 18:55           ` Robert Olsson
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2007-03-06 17:20 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David Miller, netdev, robert.olsson, npiggin

On Tuesday 06 March 2007 18:05, Robert Olsson wrote:
> Eric Dumazet writes:
>  > Indeed. It would be nice to see how it performs with say 2^20
>  > elements... Because with your data, I wonder if the extra complexity of
>  > the trash is worth it (since most lookups are going to only hit the hash
>  > and give the answer without intermediate nodes)
>
>  I don't know if I understand you fully. Yes, in most cases the first lookup
> via the "hash-header" will take us directly to the correct leaf. If there are
> "collisions" we have to sort them out by adding intermediate nodes.

With 2^20 entries, your actual limit of 2^19 entries in root node will 
probably show us quite different numbers for order-1,2,3,4... tnodes

>
>  Something like a resizable hash where each bucket is in turn
> a resizable hash, etc.

Yes, numbers you gave us basically showed a big root node, and mainly leaves 
and very few tnodes.

I was interested to see the distribution in case the root-node limit is hit, 
and we load into the table a *lot* of entries.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 17:20         ` Eric Dumazet
@ 2007-03-06 18:55           ` Robert Olsson
  0 siblings, 0 replies; 21+ messages in thread
From: Robert Olsson @ 2007-03-06 18:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Robert Olsson, David Miller, netdev, robert.olsson, npiggin


Eric Dumazet writes:

 > With 2^20 entries, your actual limit of 2^19 entries in root node will 
 > probably show us quite different numbers for order-1,2,3,4... tnodes

 Yep, the trie will get deeper and lookups more costly, as will inserts and
 deletes. The 2^19 limit was because I was getting a memory allocation
 problem that I never sorted out.

 > Yes, numbers you gave us basically showed a big root node, and mainly leaves 
 > and very few tnodes.
 > 
 > I was interested to see the distribution in case the root-node limit is hit, 
 > and we load into the table a *lot* of entries.

 Max length etc... well, maybe the root restriction should be removed and just
 have a max size instead.

 Cheers
						--ro

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 13:26 ` Robert Olsson
@ 2007-03-06 22:20   ` David Miller
  2007-03-08  6:26     ` Nick Piggin
  2007-03-08 13:35     ` Robert Olsson
  0 siblings, 2 replies; 21+ messages in thread
From: David Miller @ 2007-03-06 22:20 UTC (permalink / raw)
  To: Robert.Olsson; +Cc: netdev, dada1, robert.olsson, npiggin

From: Robert Olsson <Robert.Olsson@data.slu.se>
Date: Tue, 6 Mar 2007 14:26:04 +0100

> David Miller writes:
>  
>  > Actually, more accurately, the conflict exists in how this GC
>  > logic is implemented.  The core issue is that hash table size
>  > guides the GC processing, and hash table growth therefore
>  > modifies those GC goals.  So with the patch below we'll just
>  > keep growing the hash table instead of giving GC some time to
>  > try to keep the working set in equilibrium before doing the
>  > hash grow.
>  
>  AFAIK the equilibrium logic is a resizing function as well, just using a
>  fixed hash table. So can we do without equilibrium resizing if tables
>  are dynamic?  I think so....
> 
>  With the hash data structure we could monitor the average chain
>  length, or just the size, and resize the hash based on that.

I'm not so sure, it may be a mistake to eliminate the equilibrium
logic.  One error I think it does have is the usage of chain length.

Even a nearly perfect hash has small lumps in distribution, and we
should not penalize entries which fall into these lumps.

Let us call T the threshold at which we would grow the routing hash
table.  As we approach T we start to GC.  Let's assume hash table
has shift = 2, and T would (with the T=N+(N>>1) algorithm) therefore be
6.

TABLE:	[0]	DST1, DST2
	[1]	DST3, DST4, DST5

DST6 arrives, what should we do?

If we just accept it and don't GC some existing entries, we
will grow the hash table.  This is the wrong thing to do if
our true working set is smaller than 6 entries and thus some
of the existing entries are unlikely to be reused and thus
could be purged to keep us from hitting T.

If they are all active, growing is the right thing to do.

This is the crux of the whole routing cache problem.
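
Spelled out as code, the decision point might look roughly like this, where
nr and sz stand for the current entry and bucket counts and
rt_gc_reclaim_inactive() is a hypothetical helper for whatever passive GC we
end up with (a sketch only, not part of the patch):

	/* About to cross T = N + (N >> 1): reclaim inactive entries first,
	 * and only grow if GC could not free anything, i.e. the working
	 * set really is that large. */
	if (nr > sz + (sz >> 1)) {
		if (rt_gc_reclaim_inactive(rt_hash) == 0)
			rt_hash_resize(rt_hash->log + 1);
	}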

I am of the opinion that LRU, for routes not attached to sockets, is
probably the best thing to do here.

Furthermore at high packet rates, the current rt_may_expire() logic
probably is not very effective since its granularity is limited to
jiffies.  We can quite easily create 100,000 or more entries per
jiffy when HZ=100 during rDOS, for example.  So perhaps some global
LRU algorithm using ktime is more appropriate.
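The naive version would just stamp every entry on each hit, for example
(illustrative only; rt_last_used is a made-up field, not something struct
rtable has today):

	/* on every cache hit: remember last use at ktime resolution,
	 * so GC can order entries more finely than jiffies allow */
	rth->rt_last_used = ktime_to_ns(ktime_get());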

Global LRU is not easy without touching a lot of memory.  But I'm
sure some clever trick can be discovered by someone :)

It is amusing, but it seems that for an rDOS workload the optimal
routing hash would be a tiny one like my example above.  All packets
essentially miss the routing cache and create a new entry.  So
keeping the working set as small as possible is what you want
to do, since no matter how large you grow, your hit rate will be
zero :-)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] NET : Optimizes inet_getpeer()
  2007-03-06 10:33         ` [PATCH] NET : Optimizes inet_getpeer() Eric Dumazet
@ 2007-03-07  4:23           ` David Miller
  0 siblings, 0 replies; 21+ messages in thread
From: David Miller @ 2007-03-07  4:23 UTC (permalink / raw)
  To: dada1; +Cc: netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 6 Mar 2007 11:33:20 +0100

> [PATCH] NET : Optimizes inet_getpeer()
> 
> 1) Some sysctl vars are declared __read_mostly
> 
> 2) We can avoid updating stack[] when doing an AVL lookup only.
> 
>     The lookup() macro is extended to receive a second parameter, which may be
> NULL in the case of a pure lookup (no need to save the AVL path). This removes
> unnecessary instructions, because the compiler knows whether the _stack
> parameter is NULL or not.
> 
>     text size of net/ipv4/inetpeer.o is 2063 bytes instead of 2107 on x86_64
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 22:20   ` David Miller
@ 2007-03-08  6:26     ` Nick Piggin
  2007-03-08 13:35     ` Robert Olsson
  1 sibling, 0 replies; 21+ messages in thread
From: Nick Piggin @ 2007-03-08  6:26 UTC (permalink / raw)
  To: David Miller; +Cc: Robert.Olsson, netdev, dada1, robert.olsson

On Tue, Mar 06, 2007 at 02:20:55PM -0800, David Miller wrote:
> From: Robert Olsson <Robert.Olsson@data.slu.se>
> Date: Tue, 6 Mar 2007 14:26:04 +0100
> 
> > David Miller writes:
> >  
> >  > Actually, more accurately, the conflict exists in how this GC
> >  > logic is implemented.  The core issue is that hash table size
> >  > guides the GC processing, and hash table growth therefore
> >  > modifies those GC goals.  So with the patch below we'll just
> >  > keep growing the hash table instead of giving GC some time to
> >  > try to keep the working set in equilibrium before doing the
> >  > hash grow.
> >  
> >  AFAIK the equilibrium logic is a resizing function as well, just using a
> >  fixed hash table. So can we do without equilibrium resizing if tables
> >  are dynamic?  I think so....
> > 
> >  With the hash data structure we could monitor the average chain
> >  length, or just the size, and resize the hash based on that.
> 
> I'm not so sure, it may be a mistake to eliminate the equilibrium
> logic.  One error I think it does have is the usage of chain length.
> 
> Even a nearly perfect hash has small lumps in distribution, and we
> should not penalize entries which fall into these lumps.
> 
> Let us call T the threshold at which we would grow the routing hash
> table.  As we approach T we start to GC.  Let's assume hash table
> has shift = 2, and T would (with the T=N+(N>>1) algorithm) therefore be
> 6.
> 
> TABLE:	[0]	DST1, DST2
> 	[1]	DST3, DST4, DST5
> 
> DST6 arrives, what should we do?
> 
> If we just accept it and don't GC some existing entries, we
> will grow the hash table.  This is the wrong thing to do if
> our true working set is smaller than 6 entries and thus some
> of the existing entries are unlikely to be reused and thus
> could be purged to keep us from hitting T.
> 
> If they are all active, growing is the right thing to do.
> 
> This is the crux of the whole routing cache problem.

I guess this is similar to our problems with bdev and filesystem
caches as well.

What we do in that case (as you would know) is to let the caches
expand to the size of memory, and the problem just becomes balancing
their relative importance. We just try to go with a reasonable default,
and provide a knob or two for fine-tuning.


> I am of the opinion that LRU, for routes not attached to sockets, is
> probably the best thing to do here.
> 
> Furthermore at high packet rates, the current rt_may_expire() logic
> probably is not very effective since its granularity is limited to
> jiffies.  We can quite easily create 100,000 or more entries per
> jiffy when HZ=100 during rDOS, for example.  So perhaps some global
> LRU algorithm using ktime is more appropriate.
> 
> Global LRU is not easy without touching a lot of memory.  But I'm
> sure some clever trick can be discovered by someone :)

Well, we do a pseudo-LRU in most vm/vfs caches, where we just set a
bit if an entry has been touched since it was last checked. Add another
"working set" list to promote popular routes into, and you have something
like our pagecache active/inactive reclaim.
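
The referenced-bit part boils down to something like this (RT_REFERENCED and
rt_lru_flags are made-up names for the sketch, not existing route.c fields):

/* Clock-style pseudo-LRU: the lookup fast path only sets a bit, and the
 * scanner clears it; entries that stayed cold since the last pass become
 * candidates for demotion or eviction. */
static inline void rt_mark_referenced(struct rtable *rth)
{
	set_bit(RT_REFERENCED, &rth->rt_lru_flags);
}

static inline int rt_was_referenced(struct rtable *rth)
{
	return test_and_clear_bit(RT_REFERENCED, &rth->rt_lru_flags);
}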

I don't know if that really applies here, but it might if you decide
to try hooking this cache into the "slab shrinker" thingy...

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH]: Dynamically sized routing cache hash table.
  2007-03-06 22:20   ` David Miller
  2007-03-08  6:26     ` Nick Piggin
@ 2007-03-08 13:35     ` Robert Olsson
  1 sibling, 0 replies; 21+ messages in thread
From: Robert Olsson @ 2007-03-08 13:35 UTC (permalink / raw)
  To: David Miller; +Cc: Robert.Olsson, netdev, dada1, robert.olsson, npiggin


David Miller writes:

 > Even a nearly perfect hash has small lumps in distribution, and we
 > should not penalize entries which fall into these lumps.
 > 
 > Let us call T the threshold at which we would grow the routing hash
 > table.  As we approach T we start to GC.  Let's assume hash table
 > has shift = 2, and T would (with the T=N+(N>>1) algorithm) therefore be
 > 6.
 > 
 > TABLE:	[0]	DST1, DST2
 > 	[1]	DST3, DST4, DST5
 > 
 > DST6 arrives, what should we do?
 > 
 > If we just accept it and don't GC some existing entries, we
 > will grow the hash table.  This is the wrong thing to do if
 > our true working set is smaller than 6 entries and thus some
 > of the existing entries are unlikely to be reused and thus
 > could be purged to keep us from hitting T.
 > 
 > If they are all active, growing is the right thing to do.
 > 
 > This is the crux of the whole routing cache problem.
 
 Yes, it's very complex... It would be better to have the GC process and the
 data structure more independent, at least for us mortals.
 With the unicache I was trying to achieve something like this:

 Data structure pretty independent and optimal wrt inserts and
 deletes (GC). Well, not 100% perfect, as we don't want to resize the
 root node - which is close to a hash resize. Stefan Nilsson did some
 work in his "dynamic tries"; halve_threshold and inflate_threshold
 control the resize alone. AFAIK the two different thresholds
 were used to prevent oscillation and provide dampening. Maybe some
 ideas can be considered.

 About GC, if we forget what can be done with active GC for now..
 IMO the "most important" GC for constant load is the on-demand
 or passive kind, where we are triggered by a fixed threshold. (A fixed
 equilibrium point.)

 > I am of the opinion that LRU, for routes not attached to sockets, is
 > probably the best thing to do here.
 
 Yes among other things rt_score checks for this.

 > Furthermore at high packet rates, the current rt_may_expire() logic
 > probably is not very effective since its granularity is limited to
 > jiffies.  We can quite easily create 100,000 or more entries per
 > jiffy when HZ=100 during rDOS, for example.  So perhaps some global
 > LRU algorithm using ktime is more appropriate.

 Timer-based GC. In my world this is just to get rid of entries when 
 traffic has stopped/dropped. 

 > Global LRU is not easy without touching a lot of memory.  But I'm
 > sure some clever trick can be discovered by someone :)

 Yes, as we have to scan all the entries. To be balanced against the work we
 have to do if we remove something that needs to be "restored".

 > It is amusing, but it seems that for an rDOS workload the optimal
 > routing hash would be a tiny one like my example above.  All packets
 > essentially miss the routing cache and create a new entry.  So
 > keeping the working set as small as possible is what you want
 > to do, since no matter how large you grow, your hit rate will be
 > zero :-)

 Yes, Alexey and I tried a long time ago to limit the lengths of the
 hash chains. We saw improved results, but we didn't find any usable
 measure for rDoS.

 Cheers
					--ro

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2007-03-08 13:37 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-06  4:26 [RFC PATCH]: Dynamically sized routing cache hash table David Miller
2007-03-06  7:14 ` Eric Dumazet
2007-03-06  7:23   ` David Miller
2007-03-06  7:58     ` Eric Dumazet
2007-03-06  9:05       ` David Miller
2007-03-06 10:33         ` [PATCH] NET : Optimizes inet_getpeer() Eric Dumazet
2007-03-07  4:23           ` David Miller
2007-03-06 13:42   ` [RFC PATCH]: Dynamically sized routing cache hash table Robert Olsson
2007-03-06 14:18     ` Eric Dumazet
2007-03-06 17:05       ` Robert Olsson
2007-03-06 17:20         ` Eric Dumazet
2007-03-06 18:55           ` Robert Olsson
2007-03-06  9:11 ` Nick Piggin
2007-03-06  9:17   ` David Miller
2007-03-06  9:22     ` Nick Piggin
2007-03-06  9:23   ` Eric Dumazet
2007-03-06  9:41     ` Nick Piggin
2007-03-06 13:26 ` Robert Olsson
2007-03-06 22:20   ` David Miller
2007-03-08  6:26     ` Nick Piggin
2007-03-08 13:35     ` Robert Olsson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).