* [PATCHSET]: Dynamic route cache hash sizing
@ 2006-08-09 7:49 David Miller
2006-08-09 7:53 ` [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing David Miller
2006-08-09 7:53 ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing David Miller
0 siblings, 2 replies; 12+ messages in thread
From: David Miller @ 2006-08-09 7:49 UTC (permalink / raw)
To: netdev
Ok, I threw something together. Patches coming.
The initial algorithm is very dumb, but the code seems to work well at
least on my systems.
I encourage folks to send me fixes to the growing and shrinking logic
as that's the only part that really needs any work.
The patches are against net-2.6.19.
* [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing.
2006-08-09 7:49 [PATCHSET]: Dynamic route cache hash sizing David Miller
@ 2006-08-09 7:53 ` David Miller
2006-08-09 11:31 ` Herbert Xu
2006-08-09 7:53 ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing David Miller
1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2006-08-09 7:53 UTC (permalink / raw)
To: netdev
[IPV4] route: Locking infrastructure for dynamic routing cache sizing.
The basic idea is to wrap captures of the hash table base and hash
mask inside of a seqlock sequence.
The rest of the locking remains unchanged.
Furthermore, rt_hash_table and rt_hash_mask have two underscores
prepended to their names to show that they must be accessed in a
special way.
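For readers skimming the large diff below, the idiom boils down to the
following sketch (illustrative only; the reader side mirrors this patch's
rt_hash_snapshot(), while the matching writer side does not appear until
patch 2's rtcache_resize()):

/* Reader: re-sample the table pointer and mask until no resize raced us. */
static void snapshot_example(struct rt_hash_bucket **table, unsigned int *hmask)
{
	unsigned long seq;

	do {
		seq = read_seqbegin(&rt_hash_seq);
		*hmask = __rt_hash_mask;
		*table = __rt_hash_table;
	} while (read_seqretry(&rt_hash_seq, seq));
}

/* Writer: publish a new table and mask atomically w.r.t. the readers above. */
static void publish_example(struct rt_hash_bucket *new, unsigned int new_mask)
{
	write_seqlock_bh(&rt_hash_seq);
	__rt_hash_table = new;
	__rt_hash_mask = new_mask;
	write_sequnlock_bh(&rt_hash_seq);
}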
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/route.c | 259 ++++++++++++++++++++++++++++++++++++------------------
1 files changed, 172 insertions(+), 87 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 19bd49d..a7b4ca2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -93,6 +93,7 @@ #include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
+#include <linux/seqlock.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -243,37 +244,86 @@ # define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
-static struct rt_hash_bucket *rt_hash_table;
-static unsigned rt_hash_mask;
-static int rt_hash_log;
-static unsigned int rt_hash_rnd;
+static seqlock_t rt_hash_seq __read_mostly =
+ __SEQLOCK_UNLOCKED(rt_hash_seq);
+static struct rt_hash_bucket *__rt_hash_table __read_mostly;
+static unsigned __rt_hash_mask __read_mostly;
+static unsigned int rt_hash_rnd __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
(__raw_get_cpu_var(rt_cache_stat).field++)
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
- struct rtable **res);
-
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
+static unsigned int __rt_hash_code(u32 daddr, u32 saddr, unsigned int hash_mask)
{
return (jhash_2words(daddr, saddr, rt_hash_rnd)
- & rt_hash_mask);
+ & hash_mask);
+}
+
+/* XXX hash table resizing will need to work in three phases
+ * XXX first do the initial transfer to the new table
+ * XXX then instantiate the new table and synchronize_net
+ * XXX finally purge any remnants that got inserted into the old table
+ */
+static struct rt_hash_bucket *rt_get_bucket(u32 daddr, u32 saddr, unsigned int *hp)
+{
+ struct rt_hash_bucket *r;
+ unsigned long seq;
+
+ do {
+ unsigned int hash;
+
+ seq = read_seqbegin(&rt_hash_seq);
+ *hp = hash = __rt_hash_code(daddr, saddr, __rt_hash_mask);
+ r = &__rt_hash_table[hash];
+ } while (read_seqretry(&rt_hash_seq, seq));
+
+ return r;
+}
+
+static struct rt_hash_bucket *rt_get_bucket_nohash(u32 daddr, u32 saddr)
+{
+ struct rt_hash_bucket *r;
+ unsigned long seq;
+
+ do {
+ unsigned int hash;
+
+ seq = read_seqbegin(&rt_hash_seq);
+ hash = __rt_hash_code(daddr, saddr, __rt_hash_mask);
+ r = &__rt_hash_table[hash];
+ } while (read_seqretry(&rt_hash_seq, seq));
+
+ return r;
+}
+
+static void rt_hash_snapshot(struct rt_hash_bucket **table, unsigned int *hmask)
+{
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&rt_hash_seq);
+ *hmask = __rt_hash_mask;
+ *table = __rt_hash_table;
+ } while (read_seqretry(&rt_hash_seq, seq));
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
+ struct rt_hash_bucket *table;
int bucket;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
- struct rtable *r = NULL;
struct rt_cache_iter_state *st = seq->private;
+ struct rtable *r = NULL;
+ unsigned int hmask;
- for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+ rt_hash_snapshot(&st->table, &hmask);
+ for (st->bucket = hmask; st->bucket >= 0; --st->bucket) {
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = st->table[st->bucket].chain;
if (r)
break;
rcu_read_unlock_bh();
@@ -291,7 +341,7 @@ static struct rtable *rt_cache_get_next(
if (--st->bucket < 0)
break;
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = st->table[st->bucket].chain;
}
return r;
}
@@ -620,18 +670,23 @@ static void rt_check_expire(unsigned lon
unsigned int i = rover, goal;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
+ struct rt_hash_bucket *table;
+ unsigned int hmask;
u64 mult;
- mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+ rt_hash_snapshot(&table, &hmask);
+
+ mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
if (ip_rt_gc_timeout > 1)
do_div(mult, ip_rt_gc_timeout);
goal = (unsigned int)mult;
- if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+ if (goal > hmask + 1)
+ goal = hmask + 1;
for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
+ i = (i + 1) & hmask;
+ rthp = &table[i].chain;
if (*rthp == 0)
continue;
@@ -655,7 +710,7 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/* remove all related balanced entries if necessary */
if (rth->u.dst.flags & DST_BALANCED) {
rthp = rt_remove_balanced_route(
- &rt_hash_table[i].chain,
+ &table[i].chain,
rth, NULL);
if (!rthp)
break;
@@ -683,18 +738,21 @@ #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACH
*/
static void rt_run_flush(unsigned long dummy)
{
- int i;
struct rtable *rth, *next;
+ struct rt_hash_bucket *table;
+ unsigned int hmask;
+ int i;
rt_deadline = 0;
get_random_bytes(&rt_hash_rnd, 4);
- for (i = rt_hash_mask; i >= 0; i--) {
+ rt_hash_snapshot(&table, &hmask);
+ for (i = hmask; i >= 0; i--) {
spin_lock_bh(rt_hash_lock_addr(i));
- rth = rt_hash_table[i].chain;
+ rth = table[i].chain;
if (rth)
- rt_hash_table[i].chain = NULL;
+ table[i].chain = NULL;
spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth; rth = next) {
@@ -777,7 +835,9 @@ static int rt_garbage_collect(void)
static int rover;
static int equilibrium;
struct rtable *rth, **rthp;
+ struct rt_hash_bucket *table;
unsigned long now = jiffies;
+ unsigned int hmask;
int goal;
/*
@@ -793,22 +853,24 @@ static int rt_garbage_collect(void)
goto out;
}
+ rt_hash_snapshot(&table, &hmask);
+
/* Calculate number of entries, which we want to expire now. */
goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << rt_hash_log);
+ (ip_rt_gc_elasticity << long_log2(hmask + 1));
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
if (goal > 0) {
- equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ equilibrium += min_t(unsigned int, goal / 2, hmask + 1);
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
- goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ goal = max_t(unsigned int, goal / 2, hmask + 1);
equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
}
@@ -823,11 +885,11 @@ static int rt_garbage_collect(void)
do {
int i, k;
- for (i = rt_hash_mask, k = rover; i >= 0; i--) {
+ for (i = hmask, k = rover; i >= 0; i--) {
unsigned long tmo = expire;
- k = (k + 1) & rt_hash_mask;
- rthp = &rt_hash_table[k].chain;
+ k = (k + 1) & hmask;
+ rthp = &table[k].chain;
spin_lock_bh(rt_hash_lock_addr(k));
while ((rth = *rthp) != NULL) {
if (!rt_may_expire(rth, tmo, expire)) {
@@ -843,7 +905,7 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
int r;
rthp = rt_remove_balanced_route(
- &rt_hash_table[k].chain,
+ &table[k].chain,
rth,
&r);
goal -= r;
@@ -912,7 +974,7 @@ #endif
out: return 0;
}
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
{
struct rtable *rth, **rthp;
unsigned long now;
@@ -928,7 +990,7 @@ restart:
candp = NULL;
now = jiffies;
- rthp = &rt_hash_table[hash].chain;
+ rthp = &r->chain;
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
@@ -945,13 +1007,12 @@ #endif
* must be visible to another weakly ordered CPU before
* the insertion at the start of the hash chain.
*/
- rcu_assign_pointer(rth->u.rt_next,
- rt_hash_table[hash].chain);
+ rcu_assign_pointer(rth->u.rt_next, r->chain);
/*
* Since lookup is lockfree, the update writes
* must be ordered for consistency on SMP.
*/
- rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+ rcu_assign_pointer(r->chain, rth);
rth->u.dst.__use++;
dst_hold(&rth->u.dst);
@@ -1026,7 +1087,7 @@ #endif
}
}
- rt->u.rt_next = rt_hash_table[hash].chain;
+ rt->u.rt_next = r->chain;
#if RT_CACHE_DEBUG >= 2
if (rt->u.rt_next) {
struct rtable *trt;
@@ -1037,7 +1098,7 @@ #if RT_CACHE_DEBUG >= 2
printk("\n");
}
#endif
- rt_hash_table[hash].chain = rt;
+ r->chain = rt;
spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt;
return 0;
@@ -1102,19 +1163,19 @@ void __ip_select_ident(struct iphdr *iph
ip_select_fb_ident(iph);
}
-static void rt_del(unsigned hash, struct rtable *rt)
+static void rt_del(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt)
{
struct rtable **rthp;
spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
- for (rthp = &rt_hash_table[hash].chain; *rthp;
- rthp = &(*rthp)->u.rt_next)
+ for (rthp = &r->chain; *rthp; rthp = &(*rthp)->u.rt_next) {
if (*rthp == rt) {
*rthp = rt->u.rt_next;
rt_free(rt);
break;
}
+ }
spin_unlock_bh(rt_hash_lock_addr(hash));
}
@@ -1147,10 +1208,14 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
for (i = 0; i < 2; i++) {
for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash_code(daddr,
- skeys[i] ^ (ikeys[k] << 5));
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+
+ r = rt_get_bucket(daddr,
+ skeys[i] ^ (ikeys[k] << 5),
+ &hash);
- rthp=&rt_hash_table[hash].chain;
+ rthp=&r->chain;
rcu_read_lock();
while ((rth = rcu_dereference(*rthp)) != NULL) {
@@ -1224,8 +1289,8 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
call_netevent_notifiers(NETEVENT_REDIRECT,
&netevent);
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt))
+ rt_del(r, hash, rth);
+ if (!rt_intern_hash(r, hash, rt, &rt))
ip_rt_put(rt);
goto do_next;
}
@@ -1260,15 +1325,18 @@ static struct dst_entry *ipv4_negative_a
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
rt->u.dst.expires) {
- unsigned hash = rt_hash_code(rt->fl.fl4_dst,
- rt->fl.fl4_src ^
- (rt->fl.oif << 5));
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+ r = rt_get_bucket(rt->fl.fl4_dst,
+ rt->fl.fl4_src ^
+ (rt->fl.oif << 5),
+ &hash);
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ip_rt_advice: redirect to "
"%u.%u.%u.%u/%02x dropped\n",
NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
- rt_del(hash, rt);
+ rt_del(r, hash, rt);
ret = NULL;
}
}
@@ -1405,10 +1473,12 @@ unsigned short ip_rt_frag_needed(struct
return 0;
for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash_code(daddr, skeys[i]);
+ struct rt_hash_bucket *r;
+
+ r = rt_get_bucket_nohash(daddr, skeys[i]);
rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ for (rth = rcu_dereference(r->chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == skeys[i] &&
@@ -1599,8 +1669,9 @@ #endif
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
u8 tos, struct net_device *dev, int our)
{
- unsigned hash;
struct rtable *rth;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
u32 spec_dst;
struct in_device *in_dev = in_dev_get(dev);
u32 itag = 0;
@@ -1665,8 +1736,8 @@ #endif
RT_CACHE_STAT_INC(in_slow_mc);
in_dev_put(in_dev);
- hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
- return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+ r = rt_get_bucket(daddr, saddr ^ (dev->ifindex << 5), &hash);
+ return rt_intern_hash(r, hash, rth, (struct rtable**) &skb->dst);
e_nobufs:
in_dev_put(in_dev);
@@ -1816,8 +1887,9 @@ static inline int ip_mkroute_input_def(s
u32 daddr, u32 saddr, u32 tos)
{
struct rtable* rth = NULL;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
int err;
- unsigned hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
@@ -1830,8 +1902,8 @@ #endif
return err;
/* put it into the cache */
- hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
- return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ r = rt_get_bucket(daddr, saddr ^ (fl->iif << 5), &hash);
+ return rt_intern_hash(r, hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
@@ -1844,7 +1916,6 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
struct rtable* rth = NULL, *rtres;
unsigned char hop, hopcount;
int err = -EINVAL;
- unsigned int hash;
if (res->fi)
hopcount = res->fi->fib_nhs;
@@ -1858,6 +1929,9 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/* add all alternatives to the routing cache */
for (hop = 0; hop < hopcount; hop++) {
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+
res->nh_sel = hop;
/* put reference to previous result */
@@ -1871,8 +1945,8 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
return err;
/* put it into the cache */
- hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
- err = rt_intern_hash(hash, rth, &rtres);
+ r = rt_get_bucket(daddr, saddr ^ (fl->iif << 5), &hash);
+ err = rt_intern_hash(r, hash, rth, &rtres);
if (err)
return err;
@@ -1919,7 +1993,8 @@ #endif
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
- unsigned hash;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
u32 spec_dst;
int err = -EINVAL;
int free_res = 0;
@@ -2048,8 +2123,8 @@ #endif
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
- err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ r = rt_get_bucket(daddr, saddr ^ (fl.iif << 5), &hash);
+ err = rt_intern_hash(r, hash, rth, (struct rtable**)&skb->dst);
goto done;
no_route:
@@ -2090,15 +2165,15 @@ martian_source:
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
u8 tos, struct net_device *dev)
{
- struct rtable * rth;
- unsigned hash;
+ struct rt_hash_bucket *r;
+ struct rtable *rth;
int iif = dev->ifindex;
tos &= IPTOS_RT_MASK;
- hash = rt_hash_code(daddr, saddr ^ (iif << 5));
+ r = rt_get_bucket_nohash(daddr, saddr ^ (iif << 5));
rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ for (rth = rcu_dereference(r->chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == saddr &&
@@ -2291,11 +2366,15 @@ static inline int ip_mkroute_output_def(
{
struct rtable *rth = NULL;
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
+
if (err == 0) {
- hash = rt_hash_code(oldflp->fl4_dst,
- oldflp->fl4_src ^ (oldflp->oif << 5));
- err = rt_intern_hash(hash, rth, rp);
+ struct rt_hash_bucket *r;
+ unsigned int hash;
+
+ r = rt_get_bucket(oldflp->fl4_dst,
+ oldflp->fl4_src ^ (oldflp->oif << 5),
+ &hash);
+ err = rt_intern_hash(r, hash, rth, rp);
}
return err;
@@ -2310,7 +2389,6 @@ static inline int ip_mkroute_output(stru
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
unsigned char hop;
- unsigned hash;
int err = -EINVAL;
struct rtable *rth = NULL;
@@ -2319,6 +2397,8 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
for (hop = 0; hop < hopcount; hop++) {
struct net_device *dev2nexthop;
+ struct rt_hash_bucket *r;
+ unsigned int hash;
res->nh_sel = hop;
@@ -2336,10 +2416,10 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (err != 0)
goto cleanup;
- hash = rt_hash_code(oldflp->fl4_dst,
- oldflp->fl4_src ^
- (oldflp->oif << 5));
- err = rt_intern_hash(hash, rth, rp);
+ r = rt_get_bucket(oldflp->fl4_dst,
+ oldflp->fl4_src ^
+ (oldflp->oif << 5), &hash);
+ err = rt_intern_hash(r, hash, rth, rp);
/* forward hop information to multipath impl. */
multipath_set_nhinfo(rth,
@@ -2564,13 +2644,13 @@ out: return err;
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
- unsigned hash;
+ struct rt_hash_bucket *r;
struct rtable *rth;
- hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
+ r = rt_get_bucket_nohash(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
rcu_read_lock_bh();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+ for (rth = rcu_dereference(r->chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
@@ -2820,18 +2900,23 @@ out_free:
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct rt_hash_bucket *table;
struct rtable *rt;
+ unsigned int hmask;
int h, s_h;
int idx, s_idx;
+ rt_hash_snapshot(&table, &hmask);
+
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- for (h = 0; h <= rt_hash_mask; h++) {
- if (h < s_h) continue;
+ for (h = 0; h <= hmask; h++) {
+ if (h < s_h)
+ continue;
if (h > s_h)
s_idx = 0;
rcu_read_lock_bh();
- for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
+ for (rt = rcu_dereference(table[h].chain), idx = 0; rt;
rt = rcu_dereference(rt->u.rt_next), idx++) {
if (idx < s_idx)
continue;
@@ -3151,21 +3236,21 @@ #endif
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- rt_hash_table = (struct rt_hash_bucket *)
+ __rt_hash_table = (struct rt_hash_bucket *)
alloc_large_system_hash("IP route cache",
sizeof(struct rt_hash_bucket),
rhash_entries,
(num_physpages >= 128 * 1024) ?
15 : 17,
HASH_HIGHMEM,
- &rt_hash_log,
- &rt_hash_mask,
+ NULL,
+ &__rt_hash_mask,
0);
- memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+ memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
rt_hash_lock_init();
- ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
- ip_rt_max_size = (rt_hash_mask + 1) * 16;
+ ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+ ip_rt_max_size = (__rt_hash_mask + 1) * 16;
devinet_init();
ip_fib_init();
--
1.4.2.rc2.g3e042
* [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 7:49 [PATCHSET]: Dynamic route cache hash sizing David Miller
2006-08-09 7:53 ` [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing David Miller
@ 2006-08-09 7:53 ` David Miller
2006-08-09 8:32 ` Eric Dumazet
2006-08-09 10:09 ` Eric Dumazet
1 sibling, 2 replies; 12+ messages in thread
From: David Miller @ 2006-08-09 7:53 UTC (permalink / raw)
To: netdev
[IPV4] route: Dynamic hash table sizing.
The algorithm is stupid; this changeset is about infrastructure.
Currently it starts at 16 entries (or whatever rhash_entries was
specified as), and allows growing up to 8MB.
The code can handle both growing and shrinking just fine; the only
tweaks necessary are to the rthash_new_size() function and the places
where rtcache_work is scheduled.
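One possible shape for such a tweak, purely as a sketch (the thresholds
below are invented and the change is not part of this patch): teach
rthash_new_size() to shrink as well as grow.

/* Illustrative only: grow once the cache holds more than two entries
 * per bucket, shrink once it drops below one entry per four buckets,
 * otherwise keep the current size (callers should avoid scheduling the
 * resize work in that last case).
 */
static unsigned long rthash_new_size(void)
{
	unsigned int buckets = __rt_hash_mask + 1;
	unsigned int entries = atomic_read(&ipv4_dst_ops.entries);

	if (entries > 2 * buckets && buckets < ip_rt_hashsz_limit)
		buckets <<= 1;
	else if (entries < buckets / 4 && buckets > 16)
		buckets >>= 1;

	return (unsigned long)buckets * sizeof(struct rt_hash_bucket);
}

The places that schedule rtcache_work would then also need a shrink
trigger; as posted, only the growth check in rt_garbage_collect()
schedules the work.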
hashdist is now used at run-time, so we need to drop the __initdata
tag.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
mm/page_alloc.c | 2 -
net/ipv4/route.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 158 insertions(+), 23 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f53..3b5358a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_hand
return 0;
}
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a7b4ca2..897e67c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -94,6 +94,9 @@ #include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/seqlock.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -120,6 +123,7 @@ #define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_min_delay = 2 * HZ;
static int ip_rt_max_delay = 10 * HZ;
static int ip_rt_max_size;
+static int ip_rt_hashsz_limit = (8 * 1024 * 1024) / sizeof(void *);
static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
static int ip_rt_gc_interval = 60 * HZ;
static int ip_rt_gc_min_interval = HZ / 2;
@@ -308,6 +312,135 @@ static void rt_hash_snapshot(struct rt_h
} while (read_seqretry(&rt_hash_seq, seq));
}
+static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
+{
+ struct rt_hash_bucket *n;
+
+ if (sz <= PAGE_SIZE)
+ n = kmalloc(sz, GFP_KERNEL);
+ else if (hashdist)
+ n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+ else
+ n = (struct rt_hash_bucket *)
+ __get_free_pages(GFP_KERNEL, get_order(sz));
+
+ if (n)
+ memset(n, 0, sz);
+
+ return n;
+}
+
+static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
+{
+ if (sz <= PAGE_SIZE)
+ kfree(r);
+ else if (hashdist)
+ vfree(r);
+ else
+ free_pages((unsigned long)r, get_order(sz));
+}
+
+static void rtcache_transfer(struct rtable *list, struct rt_hash_bucket *new_table, unsigned int nhashmask)
+{
+ while (list) {
+ struct rtable *next = list->u.rt_next;
+ struct rt_hash_bucket *ent;
+ int iface = list->fl.iif;
+ unsigned int hash;
+
+ if (!iface)
+ iface = list->fl.oif;
+ hash = __rt_hash_code(list->fl.fl4_dst,
+ list->fl.fl4_src &
+ (iface << 5),
+ nhashmask);
+ ent = &new_table[hash];
+ list->u.rt_next = ent->chain;
+ ent->chain = list;
+
+ list = next;
+ }
+}
+
+static unsigned long rthash_new_size(void)
+{
+ return ((__rt_hash_mask + 1) << 1) *
+ sizeof(struct rt_hash_bucket);
+}
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+ multipath_remove(rt);
+ call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void rtcache_resize(void *__unused)
+{
+ struct rt_hash_bucket *new, *old;
+ unsigned long nsize;
+ unsigned int nhashmask, ohashmask;
+ int i;
+
+ mutex_lock(&hash_resize_mutex);
+
+ nsize = rthash_new_size();
+ new = rthash_alloc(nsize);
+ if (!new)
+ goto out_unlock;
+
+ write_seqlock_bh(&rt_hash_seq);
+
+ nhashmask = (nsize / sizeof(struct rt_hash_bucket)) - 1U;
+ for (i = __rt_hash_mask; i >= 0; i--) {
+ struct rtable *rth;
+
+ spin_lock_bh(rt_hash_lock_addr(i));
+ rth = __rt_hash_table[i].chain;
+ if (rth)
+ __rt_hash_table[i].chain = NULL;
+ spin_unlock_bh(rt_hash_lock_addr(i));
+
+ rtcache_transfer(rth, new, nhashmask);
+ }
+
+ old = __rt_hash_table;
+ ohashmask = __rt_hash_mask;
+
+ __rt_hash_table = new;
+ __rt_hash_mask = nhashmask;
+
+ /* XXX Do something more intelligent with these things. */
+ ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+ ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+
+ write_sequnlock_bh(&rt_hash_seq);
+
+ synchronize_net();
+
+ /* It is possible that some entries got hashed into the old
+ * table, free any such remnants. No locking is necessary on
+ * the chains as this table is no longer viewable by other
+ * processors.
+ */
+ for (i = ohashmask; i >= 0; i--) {
+ struct rtable *rth, *next;
+
+ for (rth = old[i].chain; rth; rth = next) {
+ next = rth->u.rt_next;
+ rt_free(rth);
+ }
+ }
+
+ rthash_free(old, (ohashmask + 1) * sizeof(struct rt_hash_bucket));
+
+out_unlock:
+ mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(rtcache_work, rtcache_resize, NULL);
+
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
struct rt_hash_bucket *table;
@@ -540,12 +673,6 @@ static struct file_operations rt_cpu_seq
#endif /* CONFIG_PROC_FS */
-static __inline__ void rt_free(struct rtable *rt)
-{
- multipath_remove(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
static __inline__ void rt_drop(struct rtable *rt)
{
multipath_remove(rt);
@@ -676,7 +803,7 @@ static void rt_check_expire(unsigned lon
rt_hash_snapshot(&table, &hmask);
- mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
+ mult = ((u64)(hmask + 1)) << (u64)ip_rt_gc_interval;
if (ip_rt_gc_timeout > 1)
do_div(mult, ip_rt_gc_timeout);
goal = (unsigned int)mult;
@@ -857,7 +984,7 @@ static int rt_garbage_collect(void)
/* Calculate number of entries, which we want to expire now. */
goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << long_log2(hmask + 1));
+ ((hmask + 1) << ip_rt_gc_elasticity);
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
@@ -971,7 +1098,11 @@ #if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
-out: return 0;
+out:
+ if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
+ (hmask + 1) < ip_rt_hashsz_limit)
+ schedule_work(&rtcache_work);
+ return 0;
}
static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
@@ -3201,15 +3332,23 @@ #endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
+ unsigned long val;
+
if (!str)
return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
+ val = simple_strtoul(str, &str, 0);
+
+ /* Only use it if it's a power-of-2. */
+ if (!(val & (val - 1)))
+ rhash_entries = val;
+
return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
+ unsigned long sz;
int rc = 0;
rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
@@ -3236,22 +3375,18 @@ #endif
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- __rt_hash_table = (struct rt_hash_bucket *)
- alloc_large_system_hash("IP route cache",
- sizeof(struct rt_hash_bucket),
- rhash_entries,
- (num_physpages >= 128 * 1024) ?
- 15 : 17,
- HASH_HIGHMEM,
- NULL,
- &__rt_hash_mask,
- 0);
- memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
- rt_hash_lock_init();
+ sz = (rhash_entries ? rhash_entries : 16);
+ sz *= sizeof(struct rt_hash_bucket);
+ __rt_hash_table = rthash_alloc(sz);
+ if (!__rt_hash_table)
+ panic("IP: failed to allocate routing cache hash table");
+ __rt_hash_mask = (sz / sizeof(struct rt_hash_bucket)) - 1;
ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+ rt_hash_lock_init();
+
devinet_init();
ip_fib_init();
--
1.4.2.rc2.g3e042
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 7:53 ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing David Miller
@ 2006-08-09 8:32 ` Eric Dumazet
2006-08-09 9:09 ` David Miller
2006-08-09 10:09 ` Eric Dumazet
1 sibling, 1 reply; 12+ messages in thread
From: Eric Dumazet @ 2006-08-09 8:32 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Wednesday 09 August 2006 09:53, David Miller wrote:
> [IPV4] route: Dynamic hash table sizing.
>
Excellent work David, Thank you !!!
> +static int ip_rt_hashsz_limit = (8 * 1024 * 1024) / sizeof(void *);
__read_mostly?
> + hash = __rt_hash_code(list->fl.fl4_dst,
> + list->fl.fl4_src &
> + (iface << 5),
> + nhashmask);
I thought it was an XOR (^), not an AND (&).
> static int __init set_rhash_entries(char *str)
> {
> + unsigned long val;
> +
> if (!str)
> return 0;
> - rhash_entries = simple_strtoul(str, &str, 0);
> + val = simple_strtoul(str, &str, 0);
> +
> + /* Only use it if it's a power-of-2. */
> + if (!(val & (val - 1)))
> + rhash_entries = val;
> +
Well, it breaks x86_64 machines that currently use rhash_entries=2000000.
Why not round to the next/previous power of two as before?
> return 1;
> }
> __setup("rhash_entries=", set_rhash_entries);
>
Thank you
Eric
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 8:32 ` Eric Dumazet
@ 2006-08-09 9:09 ` David Miller
0 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2006-08-09 9:09 UTC (permalink / raw)
To: dada1; +Cc: netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 9 Aug 2006 10:32:54 +0200
> On Wednesday 09 August 2006 09:53, David Miller wrote:
> > [IPV4] route: Dynamic hash table sizing.
> >
>
> Excellent work David, Thank you !!!
No problem. It's much more productive than all the pure talk that
usually occurs on these topics.
Here is a patch implementing fixes for the problems you found, thanks
for reviewing.
commit fc03f220a20f0fef0567e7427d0e06d234b98531
Author: David S. Miller <davem@sunset.davemloft.net>
Date: Wed Aug 9 02:06:33 2006 -0700
[IPV4] route: Fix 3 bugs in dynamic hash table code.
All problems were spotted by Eric Dumazet.
1) During rehashing, the hash function was wrong.
It should "xor" in the iface number not "and" it.
2) Mark ip_rt_hashsz_limit __read_mostly
3) Instead of rejecting non-power-of-2 rhash_entries
values, round them up just like the alloc_large_system_hash
function was doing for us previously
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 897e67c..a9216e2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -123,7 +123,6 @@ #define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_min_delay = 2 * HZ;
static int ip_rt_max_delay = 10 * HZ;
static int ip_rt_max_size;
-static int ip_rt_hashsz_limit = (8 * 1024 * 1024) / sizeof(void *);
static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
static int ip_rt_gc_interval = 60 * HZ;
static int ip_rt_gc_min_interval = HZ / 2;
@@ -253,6 +252,8 @@ static seqlock_t rt_hash_seq __read_most
static struct rt_hash_bucket *__rt_hash_table __read_mostly;
static unsigned __rt_hash_mask __read_mostly;
static unsigned int rt_hash_rnd __read_mostly;
+static int ip_rt_hashsz_limit __read_mostly =
+ (8 * 1024 * 1024) / sizeof(void *);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
@@ -351,7 +352,7 @@ static void rtcache_transfer(struct rtab
if (!iface)
iface = list->fl.oif;
hash = __rt_hash_code(list->fl.fl4_dst,
- list->fl.fl4_src &
+ list->fl.fl4_src ^
(iface << 5),
nhashmask);
ent = &new_table[hash];
@@ -3338,9 +3339,7 @@ static int __init set_rhash_entries(char
return 0;
val = simple_strtoul(str, &str, 0);
- /* Only use it if it's a power-of-2. */
- if (!(val & (val - 1)))
- rhash_entries = val;
+ rhash_entries = roundup_pow_of_two(val);
return 1;
}
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 7:53 ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing David Miller
2006-08-09 8:32 ` Eric Dumazet
@ 2006-08-09 10:09 ` Eric Dumazet
2006-08-09 10:12 ` Michael Tokarev
2006-08-09 11:14 ` David Miller
1 sibling, 2 replies; 12+ messages in thread
From: Eric Dumazet @ 2006-08-09 10:09 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Wednesday 09 August 2006 09:53, David Miller wrote:
> + if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
> + (hmask + 1) < ip_rt_hashsz_limit)
> + schedule_work(&rtcache_work);
> + return 0;
> }
>
I wonder if you should not replicate this test (against (hmask + 1) <
ip_rt_hashsz_limit) in rtcache_resize() itself, because we might end up
calling rthash_new_size() while (hmask + 1) == ip_rt_hashsz_limit.
> - mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
> + mult = ((u64)(hmask + 1)) << (u64)ip_rt_gc_interval;
Not sure I understand what you did here (in rt_check_expire()); could you
please explain the math? (I may be wrong, but (x * 2^y) != (y * 2^x) for
general values of x and y.)
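To make the difference concrete with illustrative numbers (say
hmask + 1 = 4096 buckets, so long_log2(hmask + 1) = 12, and
ip_rt_gc_interval = 60 * HZ = 15000 assuming HZ = 250):

	old:	((u64)ip_rt_gc_interval) << long_log2(hmask + 1)
		= 15000 << 12 = 15000 * 4096	(the interval scaled by table size)
	new:	((u64)(hmask + 1)) << (u64)ip_rt_gc_interval
		= 4096 << 15000			(a shift by 15000 bits, far past the
						 width of u64, not a product at all)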
Eric
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 10:09 ` Eric Dumazet
@ 2006-08-09 10:12 ` Michael Tokarev
2006-08-09 10:18 ` Evgeniy Polyakov
2006-08-09 11:14 ` David Miller
1 sibling, 1 reply; 12+ messages in thread
From: Michael Tokarev @ 2006-08-09 10:12 UTC (permalink / raw)
Cc: David Miller, netdev
On Wednesday 09 August 2006 09:53, David Miller wrote:
[]
> - mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
> + mult = ((u64)(hmask + 1)) << (u64)ip_rt_gc_interval;
Hmm.. shift *by* a 64-bit number?
/mjt
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 10:12 ` Michael Tokarev
@ 2006-08-09 10:18 ` Evgeniy Polyakov
0 siblings, 0 replies; 12+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09 10:18 UTC (permalink / raw)
To: Michael Tokarev; +Cc: David Miller, netdev
On Wed, Aug 09, 2006 at 02:12:43PM +0400, Michael Tokarev (mjt@tls.msk.ru) wrote:
> On Wednesday 09 August 2006 09:53, David Miller wrote:
> []
> > - mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
> > + mult = ((u64)(hmask + 1)) << (u64)ip_rt_gc_interval;
>
> Hmm.. shift *by* a 64-bit number?
I think it is done to shut the compiler up.
Nevertheless, ip_rt_gc_interval is 15000 on my machine, so that would be
a shift by 15000 bits.
> /mjt
--
Evgeniy Polyakov
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 10:09 ` Eric Dumazet
2006-08-09 10:12 ` Michael Tokarev
@ 2006-08-09 11:14 ` David Miller
2006-08-15 8:35 ` David Miller
1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2006-08-09 11:14 UTC (permalink / raw)
To: dada1; +Cc: netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 9 Aug 2006 12:09:28 +0200
> On Wednesday 09 August 2006 09:53, David Miller wrote:
>
> > + if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
> > + (hmask + 1) < ip_rt_hashsz_limit)
> > + schedule_work(&rtcache_work);
> > + return 0;
> > }
> >
>
> I wonder if you should not replicate this test (against (hmask + 1) <
> ip_rt_hashsz_limit) in rtcache_resize() itself, because we might end calling
> rthash_new_size() while (hmask +1 ) = ip_rt_hashsz_limit
That's a good point, let me think about that.
> > - mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
> > + mult = ((u64)(hmask + 1)) << (u64)ip_rt_gc_interval;
>
> Not sure I understand what you did here (in rt_check_expire()), could you
> please explain the math ? (I may be wrong but (x * 2^y) != (y * 2^x) for
> general values of x and y)
Indeed I'm a retard.
I made the same error in another location:
@@ -857,7 +984,7 @@ static int rt_garbage_collect(void)
/* Calculate number of entries, which we want to expire now. */
goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << long_log2(hmask + 1));
+ ((hmask + 1) << ip_rt_gc_elasticity);
I'll revert both of those changes, thanks.
* Re: [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing.
2006-08-09 7:53 ` [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing David Miller
@ 2006-08-09 11:31 ` Herbert Xu
2006-08-09 23:05 ` David Miller
0 siblings, 1 reply; 12+ messages in thread
From: Herbert Xu @ 2006-08-09 11:31 UTC (permalink / raw)
To: David Miller; +Cc: netdev
David Miller <davem@davemloft.net> wrote:
>
> +static struct rt_hash_bucket *rt_get_bucket(u32 daddr, u32 saddr, unsigned int *hp)
> +{
> + struct rt_hash_bucket *r;
> + unsigned long seq;
> +
> + do {
> + unsigned int hash;
> +
> + seq = read_seqbegin(&rt_hash_seq);
> + *hp = hash = __rt_hash_code(daddr, saddr, __rt_hash_mask);
> + r = &__rt_hash_table[hash];
> + } while (read_seqretry(&rt_hash_seq, seq));
If we hit a writer just as they begin resizing, we could be here for
quite a while.
In fact, because we expect each writer to stick around for a relatively
long time, we could use a new seqlock primitive that just spins until
the LSB becomes zero again.
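Such a primitive might look roughly like the sketch below
(read_seqbegin_live() is a made-up name; it assumes the current seqlock
layout, where an odd sequence count means a writer is active):

static inline unsigned read_seqbegin_live(const seqlock_t *sl)
{
	unsigned seq;

	/* Instead of letting the caller retry the whole read section,
	 * wait here until any in-progress writer has finished.
	 */
	while ((seq = read_seqbegin(sl)) & 1)
		;	/* busy-wait until the sequence goes even again */

	return seq;
}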
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing.
2006-08-09 11:31 ` Herbert Xu
@ 2006-08-09 23:05 ` David Miller
0 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2006-08-09 23:05 UTC (permalink / raw)
To: herbert; +Cc: netdev
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 09 Aug 2006 21:31:45 +1000
> If we hit a writer just as they begin resizing, we could be here for
> quite a while.
Yes, we discussed this a bit on IRC.
> In fact because we expect each writer to stick around for a
> relatively long time, we could use a new seqlock primitive that just
> spun until the LSB becomes zero again.
Indeed, and put a cpu_relax() in there.
* Re: [PATCH 2/2] [IPV4] route: Dynamic hash table sizing.
2006-08-09 11:14 ` David Miller
@ 2006-08-15 8:35 ` David Miller
0 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2006-08-15 8:35 UTC (permalink / raw)
To: dada1; +Cc: netdev
From: David Miller <davem@davemloft.net>
Date: Wed, 09 Aug 2006 04:14:52 -0700 (PDT)
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 9 Aug 2006 12:09:28 +0200
>
> > On Wednesday 09 August 2006 09:53, David Miller wrote:
> >
> > > + if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
> > > + (hmask + 1) < ip_rt_hashsz_limit)
> > > + schedule_work(&rtcache_work);
> > > + return 0;
> > > }
> > >
> >
> > I wonder if you should not replicate this test (against (hmask + 1) <
> > ip_rt_hashsz_limit) in rtcache_resize() itself, because we might end calling
> > rthash_new_size() while (hmask +1 ) = ip_rt_hashsz_limit
>
> That's a good point, let me think about that.
Ok, I don't think this is an issue.
schedule_work() will only cause one invocation of rtcache_resize()
even if you call schedule_work() several times before the queued
work actually runs.
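For reference, the recheck Eric describes would be small; a sketch,
where rt_hash_may_grow() is a hypothetical helper that is not in the
posted patches:

static int rt_hash_may_grow(void)
{
	/* Would be called at the top of rtcache_resize(), under
	 * hash_resize_mutex: refuse to grow once the cap is reached,
	 * even if a stale schedule_work() slipped through.
	 */
	return (__rt_hash_mask + 1) < ip_rt_hashsz_limit;
}

though, as argued above, the coalescing behaviour of schedule_work()
makes this largely unnecessary.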
Thread overview: 12+ messages
2006-08-09 7:49 [PATCHSET]: Dynamic route cache hash sizing David Miller
2006-08-09 7:53 ` [PATCH 1/2]: [IPV4] route: Locking infrastructure for dynamic routing cache sizing David Miller
2006-08-09 11:31 ` Herbert Xu
2006-08-09 23:05 ` David Miller
2006-08-09 7:53 ` [PATCH 2/2] [IPV4] route: Dynamic hash table sizing David Miller
2006-08-09 8:32 ` Eric Dumazet
2006-08-09 9:09 ` David Miller
2006-08-09 10:09 ` Eric Dumazet
2006-08-09 10:12 ` Michael Tokarev
2006-08-09 10:18 ` Evgeniy Polyakov
2006-08-09 11:14 ` David Miller
2006-08-15 8:35 ` David Miller