netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [IPV4 0/9] TRIE performance patches
@ 2008-01-22 23:37 Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger
                   ` (10 more replies)
  0 siblings, 11 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


Time to handle a full BGP load (163K of routes).

Before:		Load		Dump		Flush
hash		3.5		 0.5		0.7
2.6.23.14	3.4		19.3		10.3
net-2.6.25	3.4		18.7		9.8

After:
kmem_cache	3.8		13.0		7.2
iter		3.9		12.3		6.9
unordered	3.1		11.9		4.9
find_node	3.1		 0.3		1.2

Load:	ip -batch iproute-table
Dump:	ip route >/dev/null
Flush:	ip route flush table main
-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 1/9] fib_trie: put leaf nodes in a slab cache
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 2/9] fib_trie: style cleanup Stephen Hemminger
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Stephen Hemminger

[-- Attachment #1: leaf-kmem-cache.patch --]
[-- Type: text/plain, Size: 1669 bytes --]

This improves locality for operations that touch all the leaves.
Save space since these entries don't need to be hardware cache aligned.

Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-21 10:16:10.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 10:18:42.000000000 -0800
@@ -162,6 +162,7 @@ static struct tnode *halve(struct trie *
 static void tnode_free(struct tnode *tn);
 
 static struct kmem_cache *fn_alias_kmem __read_mostly;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
 
 static inline struct tnode *node_parent(struct node *node)
 {
@@ -325,7 +326,8 @@ static inline void alias_free_mem_rcu(st
 
 static void __leaf_free_rcu(struct rcu_head *head)
 {
-	kfree(container_of(head, struct leaf, rcu));
+	struct leaf *l = container_of(head, struct leaf, rcu);
+	kmem_cache_free(trie_leaf_kmem, l);
 }
 
 static void __leaf_info_free_rcu(struct rcu_head *head)
@@ -375,7 +377,7 @@ static inline void tnode_free(struct tno
 
 static struct leaf *leaf_new(void)
 {
-	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 	if (l) {
 		l->parent = T_LEAF;
 		INIT_HLIST_HEAD(&l->list);
@@ -1935,7 +1937,12 @@ out:
 void __init fib_hash_init(void)
 {
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
-					  0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+					  0, SLAB_PANIC, NULL);
+
+	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+					   max(sizeof(struct leaf),
+					       sizeof(struct leaf_info)),
+					   0, SLAB_PANIC, NULL);
 }
 
 

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 2/9] fib_trie: style cleanup
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 3/9] fib_trie: compute size when needed Stephen Hemminger
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Stephen Hemminger

[-- Attachment #1: fib-trie-style.patch --]
[-- Type: text/plain, Size: 18775 bytes --]

Style cleanups:
      * make check_leaf return -1 or plen, rather than by reference
      * Get rid of #ifdef that is always set
      * split out embedded function calls in if statements.
      * checkpatch warnings

Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-21 10:18:42.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 10:19:00.000000000 -0800
@@ -155,7 +155,8 @@ struct trie {
 };
 
 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+				  int wasfull);
 static struct node *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
@@ -395,7 +396,7 @@ static struct leaf_info *leaf_info_new(i
 	return li;
 }
 
-static struct tnode* tnode_new(t_key key, int pos, int bits)
+static struct tnode *tnode_new(t_key key, int pos, int bits)
 {
 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
 	struct tnode *tn = tnode_alloc(sz);
@@ -427,7 +428,8 @@ static inline int tnode_full(const struc
 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 }
 
-static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
+static inline void put_child(struct trie *t, struct tnode *tn, int i,
+			     struct node *n)
 {
 	tnode_put_child_reorg(tn, i, n, -1);
 }
@@ -437,7 +439,8 @@ static inline void put_child(struct trie
   * Update the value of full_children and empty_children.
   */
 
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+				  int wasfull)
 {
 	struct node *chi = tn->child[i];
 	int isfull;
@@ -577,11 +580,13 @@ static struct node *resize(struct trie *
 	err = 0;
 	max_resize = 10;
 	while ((tn->full_children > 0 &&  max_resize-- &&
-	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
-				inflate_threshold_use * tnode_child_length(tn))) {
+		50 * (tn->full_children + tnode_child_length(tn)
+		      - tn->empty_children)
+		>= inflate_threshold_use * tnode_child_length(tn))) {
 
 		old_tn = tn;
 		tn = inflate(t, tn);
+
 		if (IS_ERR(tn)) {
 			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -593,11 +598,13 @@ static struct node *resize(struct trie *
 
 	if (max_resize < 0) {
 		if (!tn->parent)
-			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
-			       inflate_threshold_root, tn->bits);
+			pr_warning("Fix inflate_threshold_root."
+				   " Now=%d size=%d bits\n",
+				   inflate_threshold_root, tn->bits);
 		else
-			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
-			       inflate_threshold, tn->bits);
+			pr_warning("Fix inflate_threshold."
+				   " Now=%d size=%d bits\n",
+				   inflate_threshold, tn->bits);
 	}
 
 	check_tnode(tn);
@@ -634,11 +641,13 @@ static struct node *resize(struct trie *
 
 	if (max_resize < 0) {
 		if (!tn->parent)
-			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
-			       halve_threshold_root, tn->bits);
+			pr_warning("Fix halve_threshold_root."
+				   " Now=%d size=%d bits\n",
+				   halve_threshold_root, tn->bits);
 		else
-			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
-			       halve_threshold, tn->bits);
+			pr_warning("Fix halve_threshold."
+				   " Now=%d size=%d bits\n",
+				   halve_threshold, tn->bits);
 	}
 
 	/* Only one child remains */
@@ -681,8 +690,9 @@ static struct tnode *inflate(struct trie
 	 */
 
 	for (i = 0; i < olen; i++) {
-		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+		struct tnode *inode;
 
+		inode = (struct tnode *) tnode_get_child(oldtnode, i);
 		if (inode &&
 		    IS_TNODE(inode) &&
 		    inode->pos == oldtnode->pos + oldtnode->bits &&
@@ -722,8 +732,9 @@ static struct tnode *inflate(struct trie
 
 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 		   tn->pos + tn->bits - 1) {
-			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
-					     1) == 0)
+			if (tkey_extract_bits(node->key,
+					      oldtnode->pos + oldtnode->bits,
+					      1) == 0)
 				put_child(t, tn, 2*i, node);
 			else
 				put_child(t, tn, 2*i+1, node);
@@ -899,7 +910,7 @@ static struct leaf_info *find_leaf_info(
 	return NULL;
 }
 
-static inline struct list_head * get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 {
 	struct leaf_info *li = find_leaf_info(l, plen);
 
@@ -949,7 +960,10 @@ fib_find_node(struct trie *t, u32 key)
 
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 			pos = tn->pos + tn->bits;
-			n = tnode_get_child_rcu(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+			n = tnode_get_child_rcu(tn,
+						tkey_extract_bits(key,
+								  tn->pos,
+								  tn->bits));
 		} else
 			break;
 	}
@@ -970,8 +984,10 @@ static struct node *trie_rebalance(struc
 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-		tn = (struct tnode *) resize (t, (struct tnode *)tn);
-		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+		tn = (struct tnode *) resize(t, (struct tnode *)tn);
+
+		tnode_put_child_reorg((struct tnode *)tp, cindex,
+				      (struct node *)tn, wasfull);
 
 		tp = node_parent((struct node *) tn);
 		if (!tp)
@@ -981,9 +997,9 @@ static struct node *trie_rebalance(struc
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
-		tn = (struct tnode*) resize(t, (struct tnode *)tn);
+		tn = (struct tnode *)resize(t, (struct tnode *)tn);
 
-	return (struct node*) tn;
+	return (struct node *)tn;
 }
 
 /* only used from updater-side */
@@ -1028,7 +1044,10 @@ static struct list_head *fib_insert_node
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 			tp = tn;
 			pos = tn->pos + tn->bits;
-			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+			n = tnode_get_child(tn,
+					    tkey_extract_bits(key,
+							      tn->pos,
+							      tn->bits));
 
 			BUG_ON(n && node_parent(n) != tn);
 		} else
@@ -1113,16 +1132,18 @@ static struct list_head *fib_insert_node
 
 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
+			put_child(t, (struct tnode *)tp, cindex,
+				  (struct node *)tn);
 		} else {
-			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
+			rcu_assign_pointer(t->trie, (struct node *)tn);
 			tp = tn;
 		}
 	}
 
 	if (tp && tp->pos + tp->bits > 32)
-		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
-		       tp, tp->pos, tp->bits, key, plen);
+		pr_warning("fib_trie"
+			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+			   tp, tp->pos, tp->bits, key, plen);
 
 	/* Rebalance the trie */
 
@@ -1232,10 +1253,10 @@ static int fn_trie_insert(struct fib_tab
 				break;
 			if (fa->fa_type == cfg->fc_type &&
 			    fa->fa_scope == cfg->fc_scope &&
-			    fa->fa_info == fi) {
+			    fa->fa_info == fi)
 				goto out;
-			}
 		}
+
 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
 			fa = fa_orig;
 	}
@@ -1286,38 +1307,40 @@ err:
 
 
 /* should be called with rcu_read_lock */
-static inline int check_leaf(struct trie *t, struct leaf *l,
-			     t_key key, int *plen, const struct flowi *flp,
-			     struct fib_result *res)
+static int check_leaf(struct trie *t, struct leaf *l,
+		      t_key key,  const struct flowi *flp,
+		      struct fib_result *res)
 {
-	int err, i;
-	__be32 mask;
 	struct leaf_info *li;
 	struct hlist_head *hhead = &l->list;
 	struct hlist_node *node;
 
 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
-		i = li->plen;
-		mask = inet_make_mask(i);
+		int err;
+		int plen = li->plen;
+		__be32 mask = inet_make_mask(plen);
+
 		if (l->key != (key & ntohl(mask)))
 			continue;
 
-		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
-			*plen = i;
+		err = fib_semantic_match(&li->falh, flp, res,
+					 htonl(l->key), mask, plen);
+
 #ifdef CONFIG_IP_FIB_TRIE_STATS
+		if (err <= 0)
 			t->stats.semantic_match_passed++;
+		else
+			t->stats.semantic_match_miss++;
 #endif
-			return err;
-		}
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-		t->stats.semantic_match_miss++;
-#endif
+		if (err <= 0)
+			return plen;
 	}
-	return 1;
+
+	return -1;
 }
 
-static int
-fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
+			  struct fib_result *res)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	int plen, ret = 0;
@@ -1344,10 +1367,13 @@ fn_trie_lookup(struct fib_table *tb, con
 
 	/* Just a leaf? */
 	if (IS_LEAF(n)) {
-		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
-			goto found;
-		goto failed;
+		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
+		if (plen < 0)
+			goto failed;
+		ret = 0;
+		goto found;
 	}
+
 	pn = (struct tnode *) n;
 	chopped_off = 0;
 
@@ -1369,14 +1395,14 @@ fn_trie_lookup(struct fib_table *tb, con
 		}
 
 		if (IS_LEAF(n)) {
-			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
-				goto found;
-			else
+			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
+			if (plen < 0)
 				goto backtrace;
+
+			ret = 0;
+			goto found;
 		}
 
-#define HL_OPTIMIZE
-#ifdef HL_OPTIMIZE
 		cn = (struct tnode *)n;
 
 		/*
@@ -1405,12 +1431,13 @@ fn_trie_lookup(struct fib_table *tb, con
 		 * *are* zero.
 		 */
 
-		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+		/* NOTA BENE: Checking only skipped bits
+		   for the new node here */
 
 		if (current_prefix_length < pos+bits) {
 			if (tkey_extract_bits(cn->key, current_prefix_length,
-						cn->pos - current_prefix_length) != 0 ||
-			    !(cn->child[0]))
+						cn->pos - current_prefix_length)
+			    || !(cn->child[0]))
 				goto backtrace;
 		}
 
@@ -1433,14 +1460,17 @@ fn_trie_lookup(struct fib_table *tb, con
 		 * new tnode's key.
 		 */
 
-		/* Note: We aren't very concerned about the piece of the key
-		 * that precede pn->pos+pn->bits, since these have already been
-		 * checked. The bits after cn->pos aren't checked since these are
-		 * by definition "unknown" at this point. Thus, what we want to
-		 * see is if we are about to enter the "prefix matching" state,
-		 * and in that case verify that the skipped bits that will prevail
-		 * throughout this subtree are zero, as they have to be if we are
-		 * to find a matching prefix.
+		/*
+		 * Note: We aren't very concerned about the piece of
+		 * the key that precede pn->pos+pn->bits, since these
+		 * have already been checked. The bits after cn->pos
+		 * aren't checked since these are by definition
+		 * "unknown" at this point. Thus, what we want to see
+		 * is if we are about to enter the "prefix matching"
+		 * state, and in that case verify that the skipped
+		 * bits that will prevail throughout this subtree are
+		 * zero, as they have to be if we are to find a
+		 * matching prefix.
 		 */
 
 		node_prefix = mask_pfx(cn->key, cn->pos);
@@ -1448,13 +1478,15 @@ fn_trie_lookup(struct fib_table *tb, con
 		pref_mismatch = key_prefix^node_prefix;
 		mp = 0;
 
-		/* In short: If skipped bits in this node do not match the search
-		 * key, enter the "prefix matching" state.directly.
+		/*
+		 * In short: If skipped bits in this node do not match
+		 * the search key, enter the "prefix matching"
+		 * state.directly.
 		 */
 		if (pref_mismatch) {
 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
 				mp++;
-				pref_mismatch = pref_mismatch <<1;
+				pref_mismatch = pref_mismatch << 1;
 			}
 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
 
@@ -1464,7 +1496,7 @@ fn_trie_lookup(struct fib_table *tb, con
 			if (current_prefix_length >= cn->pos)
 				current_prefix_length = mp;
 		}
-#endif
+
 		pn = (struct tnode *)n; /* Descend */
 		chopped_off = 0;
 		continue;
@@ -1473,12 +1505,14 @@ backtrace:
 		chopped_off++;
 
 		/* As zero don't change the child key (cindex) */
-		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
+		while ((chopped_off <= pn->bits)
+		       && !(cindex & (1<<(chopped_off-1))))
 			chopped_off++;
 
 		/* Decrease current_... with bits chopped off */
 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
-			current_prefix_length = pn->pos + pn->bits - chopped_off;
+			current_prefix_length = pn->pos + pn->bits
+				- chopped_off;
 
 		/*
 		 * Either we do the actual chop off according or if we have
@@ -1528,7 +1562,8 @@ static int trie_leaf_remove(struct trie 
 	while (n != NULL && IS_TNODE(n)) {
 		struct tnode *tn = (struct tnode *) n;
 		check_tnode(tn);
-		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
+		n = tnode_get_child(tn, tkey_extract_bits(key,
+							  tn->pos, tn->bits));
 
 		BUG_ON(n && node_parent(n) != tn);
 	}
@@ -1694,7 +1729,7 @@ static struct leaf *nextleaf(struct trie
 		if (IS_LEAF(trie))          /* trie w. just a leaf */
 			return (struct leaf *) trie;
 
-		p = (struct tnode*) trie;  /* Start */
+		p = (struct tnode *)trie;  /* Start */
 	} else
 		p = node_parent_rcu(c);
 
@@ -1762,8 +1797,9 @@ static int fn_trie_flush(struct fib_tabl
 	return found;
 }
 
-static void
-fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+static void fn_trie_select_default(struct fib_table *tb,
+				   const struct flowi *flp,
+				   struct fib_result *res)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	int order, last_idx;
@@ -1834,7 +1870,8 @@ out:
 	rcu_read_unlock();
 }
 
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
+			   struct fib_table *tb,
 			   struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int i, s_i;
@@ -1873,8 +1910,8 @@ static int fn_trie_dump_fa(t_key key, in
 	return skb->len;
 }
 
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
-			     struct netlink_callback *cb)
+static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
+			     struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int h, s_h;
 	struct list_head *fa_head;
@@ -1897,7 +1934,7 @@ static int fn_trie_dump_plen(struct trie
 		if (list_empty(fa_head))
 			continue;
 
-		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
+		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
 			cb->args[3] = h;
 			return -1;
 		}
@@ -1906,7 +1943,8 @@ static int fn_trie_dump_plen(struct trie
 	return skb->len;
 }
 
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
+			struct netlink_callback *cb)
 {
 	int m, s_m;
 	struct trie *t = (struct trie *) tb->tb_data;
@@ -1921,7 +1959,7 @@ static int fn_trie_dump(struct fib_table
 			memset(&cb->args[3], 0,
 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
 
-		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
+		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
 			cb->args[2] = m;
 			goto out;
 		}
@@ -1936,7 +1974,8 @@ out:
 
 void __init fib_hash_init(void)
 {
-	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
+	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+					  sizeof(struct fib_alias),
 					  0, SLAB_PANIC, NULL);
 
 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
@@ -1970,7 +2009,7 @@ struct fib_table *fib_hash_table(u32 id)
 	memset(t, 0, sizeof(*t));
 
 	if (id == RT_TABLE_LOCAL)
-		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
+		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
 
 	return tb;
 }
@@ -2104,7 +2143,8 @@ static void trie_show_stats(struct seq_f
 	else
 		avdepth = 0;
 
-	seq_printf(seq, "\tAver depth:     %u.%02d\n", avdepth / 100, avdepth % 100 );
+	seq_printf(seq, "\tAver depth:     %u.%02d\n",
+		   avdepth / 100, avdepth % 100);
 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
 
 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
@@ -2136,16 +2176,20 @@ static void trie_show_usage(struct seq_f
 			    const struct trie_use_stats *stats)
 {
 	seq_printf(seq, "\nCounters:\n---------\n");
-	seq_printf(seq,"gets = %u\n", stats->gets);
-	seq_printf(seq,"backtracks = %u\n", stats->backtrack);
-	seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed);
-	seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss);
-	seq_printf(seq,"null node hit= %u\n", stats->null_node_hit);
-	seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped);
+	seq_printf(seq, "gets = %u\n", stats->gets);
+	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+	seq_printf(seq, "semantic match passed = %u\n",
+		   stats->semantic_match_passed);
+	seq_printf(seq, "semantic match miss = %u\n",
+		   stats->semantic_match_miss);
+	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
+	seq_printf(seq, "skipped node resize = %u\n\n",
+		   stats->resize_node_skipped);
 }
 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
 
-static void fib_trie_show(struct seq_file *seq, const char *name, struct trie *trie)
+static void fib_trie_show(struct seq_file *seq, const char *name,
+			  struct trie *trie)
 {
 	struct trie_stat stat;
 
@@ -2163,7 +2207,8 @@ static int fib_triestat_seq_show(struct 
 	struct fib_table *tb;
 
 	seq_printf(seq,
-		   "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+		   "Basic info: size of leaf:"
+		   " %Zd bytes, size of tnode: %Zd bytes.\n",
 		   sizeof(struct leaf), sizeof(struct tnode));
 
 	tb = fib_get_table(net, RT_TABLE_LOCAL);
@@ -2436,10 +2481,11 @@ static int fib_route_seq_show(struct seq
 
 	if (iter->trie == iter->trie_local)
 		return 0;
+
 	if (IS_TNODE(l))
 		return 0;
 
-	for (i=32; i>=0; i--) {
+	for (i = 32; i >= 0; i--) {
 		struct leaf_info *li = find_leaf_info(l, i);
 		struct fib_alias *fa;
 		__be32 mask, prefix;
@@ -2466,7 +2512,8 @@ static int fib_route_seq_show(struct seq
 					 fi->fib_nh->nh_gw, flags, 0, 0,
 					 fi->fib_priority,
 					 mask,
-					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+					 (fi->fib_advmss ?
+					  fi->fib_advmss + 40 : 0),
 					 fi->fib_window,
 					 fi->fib_rtt >> 3);
 			else

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 3/9] fib_trie: compute size when needed
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 2/9] fib_trie: style cleanup Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 4/9] fib_trie: use hash list Stephen Hemminger
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: fib-trie-stats-x.patch --]
[-- Type: text/plain, Size: 2412 bytes --]

Compute the number of prefixes when needed, rather than doing bookkeeping.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-21 17:45:03.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 17:45:03.000000000 -0800
@@ -143,12 +143,12 @@ struct trie_stat {
 	unsigned int tnodes;
 	unsigned int leaves;
 	unsigned int nullpointers;
+	unsigned int prefixes;
 	unsigned int nodesizes[MAX_STAT_DEPTH];
 };
 
 struct trie {
 	struct node *trie;
-	unsigned int size;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats stats;
 #endif
@@ -1289,8 +1289,6 @@ static int fn_trie_insert(struct fib_tab
 	list_add_tail_rcu(&new_fa->fa_list,
 			  (fa ? &fa->fa_list : fa_head));
 
-	t->size++;
-
 	rt_cache_flush(-1);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
@@ -1576,9 +1574,6 @@ static int trie_leaf_remove(struct trie 
 	 * Key found.
 	 * Remove the leaf and rebalance the tree
 	 */
-
-	t->size--;
-
 	tp = node_parent(n);
 	tnode_free((struct tnode *) n);
 
@@ -2111,10 +2106,17 @@ static void trie_collect_stats(struct tr
 	for (n = fib_trie_get_first(&iter, t); n;
 	     n = fib_trie_get_next(&iter)) {
 		if (IS_LEAF(n)) {
+			struct leaf *l = (struct leaf *)n;
+			struct leaf_info *li;
+			struct hlist_node *tmp;
+
 			s->leaves++;
 			s->totdepth += iter.depth;
 			if (iter.depth > s->maxdepth)
 				s->maxdepth = iter.depth;
+
+			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+				++s->prefixes;
 		} else {
 			const struct tnode *tn = (const struct tnode *) n;
 			int i;
@@ -2148,8 +2150,11 @@ static void trie_show_stats(struct seq_f
 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
 
 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
-
 	bytes = sizeof(struct leaf) * stat->leaves;
+
+	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
+	bytes += sizeof(struct leaf_info) * stat->prefixes;
+
 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
 	bytes += sizeof(struct tnode) * stat->tnodes;
 
@@ -2193,8 +2198,8 @@ static void fib_trie_show(struct seq_fil
 {
 	struct trie_stat stat;
 
-	seq_printf(seq, "%s: %d\n", name, trie->size);
 	trie_collect_stats(trie, &stat);
+	seq_printf(seq, "%s:\n", name);
 	trie_show_stats(seq, &stat);
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	trie_show_usage(seq, &trie->stats);

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 4/9] fib_trie: use hash list
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (2 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 3/9] fib_trie: compute size when needed Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 5/9] fib_trie: dump message multiple part flag Stephen Hemminger
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Stephen Hemminger

[-- Attachment #1: fib-use-the-list.patch --]
[-- Type: text/plain, Size: 2587 bytes --]

The code to dump can use the existing hash chain rather
than doing repeated lookup.

Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-21 17:45:03.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 17:45:08.000000000 -0800
@@ -2396,31 +2396,30 @@ static int fib_trie_seq_show(struct seq_
 
 	} else {
 		struct leaf *l = (struct leaf *) n;
-		int i;
+		struct leaf_info *li;
+		struct hlist_node *node;
+
 		__be32 val = htonl(l->key);
 
 		seq_indent(seq, iter->depth);
 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
-		for (i = 32; i >= 0; i--) {
-			struct leaf_info *li = find_leaf_info(l, i);
 
-			if (li) {
-				struct fib_alias *fa;
+		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+			struct fib_alias *fa;
 
-				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
-					char buf1[32], buf2[32];
+			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+				char buf1[32], buf2[32];
 
-					seq_indent(seq, iter->depth+1);
-					seq_printf(seq, "  /%d %s %s", i,
-						   rtn_scope(buf1, sizeof(buf1),
-							     fa->fa_scope),
-						   rtn_type(buf2, sizeof(buf2),
-							     fa->fa_type));
-					if (fa->fa_tos)
-						seq_printf(seq, "tos =%d\n",
-							   fa->fa_tos);
-					seq_putc(seq, '\n');
-				}
+				seq_indent(seq, iter->depth+1);
+				seq_printf(seq, "  /%d %s %s", li->plen,
+					   rtn_scope(buf1, sizeof(buf1),
+						     fa->fa_scope),
+					   rtn_type(buf2, sizeof(buf2),
+						    fa->fa_type));
+				if (fa->fa_tos)
+					seq_printf(seq, "tos =%d\n",
+						   fa->fa_tos);
+				seq_putc(seq, '\n');
 			}
 		}
 	}
@@ -2474,8 +2473,8 @@ static int fib_route_seq_show(struct seq
 {
 	const struct fib_trie_iter *iter = seq->private;
 	struct leaf *l = v;
-	int i;
-	char bf[128];
+	struct leaf_info *li;
+	struct hlist_node *node;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2490,8 +2489,7 @@ static int fib_route_seq_show(struct seq
 	if (IS_TNODE(l))
 		return 0;
 
-	for (i = 32; i >= 0; i--) {
-		struct leaf_info *li = find_leaf_info(l, i);
+	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
 		struct fib_alias *fa;
 		__be32 mask, prefix;
 
@@ -2504,6 +2502,7 @@ static int fib_route_seq_show(struct seq
 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 			const struct fib_info *fi = fa->fa_info;
 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+			char bf[128];
 
 			if (fa->fa_type == RTN_BROADCAST
 			    || fa->fa_type == RTN_MULTICAST)

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 5/9] fib_trie: dump message multiple part flag
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (3 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 4/9] fib_trie: use hash list Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 6/9] fib_trie: iterator recode Stephen Hemminger
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: fib-multi.patch --]
[-- Type: text/plain, Size: 528 bytes --]

Match fib_hash, and set NLM_F_MULTI to handle multiple part messages.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-21 17:52:10.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 17:52:11.000000000 -0800
@@ -1895,7 +1895,7 @@ static int fn_trie_dump_fa(t_key key, in
 				  xkey,
 				  plen,
 				  fa->fa_tos,
-				  fa->fa_info, 0) < 0) {
+				  fa->fa_info, NLM_F_MULTI) < 0) {
 			cb->args[4] = i;
 			return -1;
 		}

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 6/9] fib_trie: iterator recode
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (4 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 5/9] fib_trie: dump message multiple part flag Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 7/9] fib_trie: dump table in sorted order Stephen Hemminger
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Remove the complex loop structure of nextleaf() and replace it with a
simpler tree walker. This improves the performance and is much
cleaner.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-22 09:52:46.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 12:58:59.000000000 -0800
@@ -1708,64 +1708,65 @@ static int trie_flush_leaf(struct trie *
 	return found;
 }
 
-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+/*
+ * Scan for the next right leaf starting at node p->child[idx]
+ * Since we have back pointer, no recursion necessary.
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
 {
-	struct node *c = (struct node *) thisleaf;
-	struct tnode *p;
-	int idx;
-	struct node *trie = rcu_dereference(t->trie);
-
-	if (c == NULL) {
-		if (trie == NULL)
-			return NULL;
-
-		if (IS_LEAF(trie))          /* trie w. just a leaf */
-			return (struct leaf *) trie;
-
-		p = (struct tnode *)trie;  /* Start */
-	} else
-		p = node_parent_rcu(c);
+	do {
+		t_key idx;
 
-	while (p) {
-		int pos, last;
-
-		/*  Find the next child of the parent */
 		if (c)
-			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
 		else
-			pos = 0;
-
-		last = 1 << p->bits;
-		for (idx = pos; idx < last ; idx++) {
-			c = rcu_dereference(p->child[idx]);
+			idx = 0;
 
+		while (idx < 1u << p->bits) {
+			c = tnode_get_child_rcu(p, idx++);
 			if (!c)
 				continue;
 
-			/* Decend if tnode */
-			while (IS_TNODE(c)) {
-				p = (struct tnode *) c;
-				idx = 0;
-
-				/* Rightmost non-NULL branch */
-				if (p && IS_TNODE(p))
-					while (!(c = rcu_dereference(p->child[idx]))
-					       && idx < (1<<p->bits)) idx++;
-
-				/* Done with this tnode? */
-				if (idx >= (1 << p->bits) || !c)
-					goto up;
+			if (IS_LEAF(c)) {
+				prefetch(p->child[idx]);
+				return (struct leaf *) c;
 			}
-			return (struct leaf *) c;
+
+			/* Rescan start scanning in new node */
+			p = (struct tnode *) c;
+			idx = 0;
 		}
-up:
-		/* No more children go up one step  */
+
+		/* Node empty, walk back up to parent */
 		c = (struct node *) p;
-		p = node_parent_rcu(c);
-	}
-	return NULL; /* Ready. Root of trie */
+	} while ( (p = node_parent_rcu(c)) != NULL);
+
+	return NULL; /* Root of trie */
+}
+
+
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+
+	if (!n)
+		return NULL;
+
+	if (IS_LEAF(n))          /* trie is just a leaf */
+		return (struct leaf *) n;
+
+	return leaf_walk_rcu(n, NULL);
+}
+
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+	struct node *c = (struct node *) l;
+	struct tnode *p = node_parent(c);
+
+	if (!p)
+		return NULL;	/* trie with just one leaf */
+
+	return leaf_walk_rcu(p, c);
 }
 
 /*
@@ -1775,9 +1776,9 @@ static int fn_trie_flush(struct fib_tabl
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct leaf *ll = NULL, *l = NULL;
-	int found = 0, h;
+	int found = 0;
 
-	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
 		found += trie_flush_leaf(t, l);
 
 		if (ll && hlist_empty(&ll->list))
@@ -1884,7 +1885,6 @@ static int fn_trie_dump_fa(t_key key, in
 			i++;
 			continue;
 		}
-		BUG_ON(!fa->fa_info);
 
 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
 				  cb->nlh->nlmsg_seq,
@@ -1913,8 +1913,9 @@ static int fn_trie_dump_plen(struct trie
 	struct leaf *l = NULL;
 
 	s_h = cb->args[3];
+	h = 0;
 
-	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+	for (l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 7/9] fib_trie: dump table in sorted order
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (5 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 6/9] fib_trie: iterator recode Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 8/9] fib_trie: avoid extra search on delete Stephen Hemminger
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: fib-trie-dump-unordered.patch --]
[-- Type: text/plain, Size: 2689 bytes --]

It is easier with TRIE to dump the data by traversal rather than
iterating over every possible prefix. This saves some time and makes
the dump come out in sorted order.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-22 12:58:59.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 12:59:03.000000000 -0800
@@ -1905,67 +1905,71 @@ static int fn_trie_dump_fa(t_key key, in
 	return skb->len;
 }
 
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+			struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int h, s_h;
-	struct list_head *fa_head;
-	struct leaf *l = NULL;
+	struct leaf_info *li;
+	struct hlist_node *node;
+	int i, s_i;
 
-	s_h = cb->args[3];
-	h = 0;
+	s_i = cb->args[3];
+	i = 0;
 
-	for (l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
-		if (h < s_h)
+	/* rcu_read_lock is hold by caller */
+	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+		if (i < s_i) {
+			i++;
 			continue;
-		if (h > s_h)
-			memset(&cb->args[4], 0,
-			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
-
-		fa_head = get_fa_head(l, plen);
+		}
 
-		if (!fa_head)
-			continue;
+		if (i > s_i)
+			cb->args[4] = 0;
 
-		if (list_empty(fa_head))
+		if (list_empty(&li->falh))
 			continue;
 
-		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
-			cb->args[3] = h;
+		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+			cb->args[3] = i;
 			return -1;
 		}
+		i++;
 	}
-	cb->args[3] = h;
+
+	cb->args[3] = i;
 	return skb->len;
 }
 
+
+
 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
 			struct netlink_callback *cb)
 {
-	int m, s_m;
+	struct leaf *l;
 	struct trie *t = (struct trie *) tb->tb_data;
-
-	s_m = cb->args[2];
+	int h = 0;
+	int s_h = cb->args[2];
 
 	rcu_read_lock();
-	for (m = 0; m <= 32; m++) {
-		if (m < s_m)
+	for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
+		if (h < s_h)
 			continue;
-		if (m > s_m)
-			memset(&cb->args[3], 0,
-				sizeof(cb->args) - 3*sizeof(cb->args[0]));
-
-		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
-			cb->args[2] = m;
-			goto out;
+
+		if (h > s_h) {
+			cb->args[3] = 0;
+			cb->args[4] = 0;
+		}
+
+		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+			rcu_read_unlock();
+			cb->args[2] = h;
+			return -1;
 		}
 	}
 	rcu_read_unlock();
-	cb->args[2] = m;
+
+	cb->args[2] = h;
 	return skb->len;
-out:
-	rcu_read_unlock();
-	return -1;
 }
 
 void __init fib_hash_init(void)

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 8/9] fib_trie: avoid extra search on delete
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (6 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 7/9] fib_trie: dump table in sorted order Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 9/9] fib_trie: avoid rescan on dump Stephen Hemminger
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: trie-remove-desquare.patch --]
[-- Type: text/plain, Size: 2593 bytes --]

Get rid of extra search that made route deletion O(n).

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-22 15:24:41.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 15:25:32.000000000 -0800
@@ -1542,49 +1542,23 @@ found:
 	return ret;
 }
 
-/* only called from updater side */
-static int trie_leaf_remove(struct trie *t, t_key key)
+/*
+ * Remove the leaf and return parent.
+ */
+static void trie_leaf_remove(struct trie *t, struct leaf *l)
 {
-	t_key cindex;
-	struct tnode *tp = NULL;
-	struct node *n = t->trie;
-	struct leaf *l;
-
-	pr_debug("entering trie_leaf_remove(%p)\n", n);
-
-	/* Note that in the case skipped bits, those bits are *not* checked!
-	 * When we finish this, we will have NULL or a T_LEAF, and the
-	 * T_LEAF may or may not match our key.
-	 */
-
-	while (n != NULL && IS_TNODE(n)) {
-		struct tnode *tn = (struct tnode *) n;
-		check_tnode(tn);
-		n = tnode_get_child(tn, tkey_extract_bits(key,
-							  tn->pos, tn->bits));
-
-		BUG_ON(n && node_parent(n) != tn);
-	}
-	l = (struct leaf *) n;
-
-	if (!n || !tkey_equals(l->key, key))
-		return 0;
+	struct tnode *tp = node_parent((struct node *) l);
 
-	/*
-	 * Key found.
-	 * Remove the leaf and rebalance the tree
-	 */
-	tp = node_parent(n);
-	tnode_free((struct tnode *) n);
+	pr_debug("entering trie_leaf_remove(%p)\n", l);
 
 	if (tp) {
-		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
 		put_child(t, (struct tnode *)tp, cindex, NULL);
 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
 	} else
 		rcu_assign_pointer(t->trie, NULL);
 
-	return 1;
+	tnode_free((struct tnode *) l);
 }
 
 /*
@@ -1662,7 +1636,7 @@ static int fn_trie_delete(struct fib_tab
 	}
 
 	if (hlist_empty(&l->list))
-		trie_leaf_remove(t, key);
+		trie_leaf_remove(t, l);
 
 	if (fa->fa_state & FA_S_ACCESSED)
 		rt_cache_flush(-1);
@@ -1775,19 +1749,19 @@ static struct leaf *trie_nextleaf(struct
 static int fn_trie_flush(struct fib_table *tb)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
-	struct leaf *ll = NULL, *l = NULL;
+	struct leaf *l, *ll = NULL;
 	int found = 0;
 
 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
 		found += trie_flush_leaf(t, l);
 
 		if (ll && hlist_empty(&ll->list))
-			trie_leaf_remove(t, ll->key);
+			trie_leaf_remove(t, ll);
 		ll = l;
 	}
 
 	if (ll && hlist_empty(&ll->list))
-		trie_leaf_remove(t, ll->key);
+		trie_leaf_remove(t, ll);
 
 	pr_debug("trie_flush found=%d\n", found);
 	return found;

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 9/9] fib_trie: avoid rescan on dump
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (7 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 8/9] fib_trie: avoid extra search on delete Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-23  5:58 ` [IPV4 0/9] TRIE performance patches David Miller
  2008-01-23 14:06 ` Robert Olsson
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: trie-dump-desquare.patch --]
[-- Type: text/plain, Size: 1877 bytes --]

This converts dumping (and flushing) of large route tables
from O(N^2) to O(N). If the route dump took multiple pages then
the dump routine gets called again. The old code kept track of
location by counter, the new code instead uses the last key.

This is a really big win ( 0.3 sec vs 12 sec) for big route tables.

One side effect is that if the table changes during the dump, 
then the last key will not be found, and we will return -EBUSY.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv4/fib_trie.c	2008-01-22 15:25:32.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 15:26:41.000000000 -0800
@@ -1914,35 +1914,43 @@ static int fn_trie_dump_leaf(struct leaf
 	return skb->len;
 }
 
-
-
 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
 			struct netlink_callback *cb)
 {
 	struct leaf *l;
 	struct trie *t = (struct trie *) tb->tb_data;
-	int h = 0;
-	int s_h = cb->args[2];
+	t_key key = cb->args[2];
 
 	rcu_read_lock();
-	for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
-		if (h < s_h)
-			continue;
-
-		if (h > s_h) {
-			cb->args[3] = 0;
-			cb->args[4] = 0;
+	/* Dump starting at last key.
+	 * Note: 0.0.0.0/0 (ie default) is first key.
+	 */
+	if (!key)
+		l = trie_firstleaf(t);
+	else {
+		l = fib_find_node(t, key);
+		if (!l) {
+			/* The table changed during the dump, rather than
+			 * giving partial data, just make application retry.
+			 */
+			rcu_read_unlock();
+			return -EBUSY;
 		}
+	}
 
+	while (l) {
+		cb->args[2] = l->key;
 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
 			rcu_read_unlock();
-			cb->args[2] = h;
 			return -1;
 		}
+
+		l = trie_nextleaf(l);
+		memset(&cb->args[3], 0,
+		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
 	rcu_read_unlock();
 
-	cb->args[2] = h;
 	return skb->len;
 }
 

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [IPV4 0/9] TRIE performance patches
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (8 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 9/9] fib_trie: avoid rescan on dump Stephen Hemminger
@ 2008-01-23  5:58 ` David Miller
  2008-01-23 14:06 ` Robert Olsson
  10 siblings, 0 replies; 17+ messages in thread
From: David Miller @ 2008-01-23  5:58 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 22 Jan 2008 15:37:33 -0800

> 
> Time to handle a full BGP load (163K of routes).
> 
> Before:		Load		Dump		Flush
> hash		3.5		 0.5		0.7
> 2.6.23.14	3.4		19.3		10.3
> net-2.6.25	3.4		18.7		9.8
> 
> After:
> kmem_cache	3.8		13.0		7.2
> iter		3.9		12.3		6.9
> unordered	3.1		11.9		4.9
> find_node	3.1		 0.3		1.2
> 
> Load:	ip -batch iproute-table
> Dump:	ip route >/dev/null
> Flush:	ip route flush table main

Nice work Stephen, all 9 patches applied.

We'll need to think about that new -EBUSY behavior, it
might confuse routing daemons and similar, and therefore
we might need to find a way to fudge continuing the dump
instead of failing.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 0/9] TRIE performance patches
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
                   ` (9 preceding siblings ...)
  2008-01-23  5:58 ` [IPV4 0/9] TRIE performance patches David Miller
@ 2008-01-23 14:06 ` Robert Olsson
  2008-01-23 16:31   ` Stephen Hemminger
  2008-01-23 23:49   ` Stephen Hemminger
  10 siblings, 2 replies; 17+ messages in thread
From: Robert Olsson @ 2008-01-23 14:06 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev


Stephen Hemminger writes:

 > Time to handle a full BGP load (163K of routes).
 > 
 > Before:		Load		Dump		Flush
 >
 > kmem_cache	3.8		13.0		7.2
 > iter		3.9		12.3		6.9
 > unordered	3.1		11.9		4.9
 > find_node	3.1		 0.3		1.2

 I certainly like the speed but what will we break when
 we don't return in longest prefix order?

labb:/# ip r
default via 10.10.10.1 dev eth0 
5.0.0.0/8 via 192.168.2.2 dev eth3 
10.10.10.0/24 dev eth0  proto kernel  scope link  src 10.10.10.2 
10.10.11.0/24 dev eth1  proto kernel  scope link  src 10.10.11.1 
11.0.0.0/8 via 10.10.11.2 dev eth1 
192.168.1.0/24 dev eth2  proto kernel  scope link  src 192.168.1.2 
192.168.2.0/24 dev eth3  proto kernel  scope link  src 192.168.2.1 

labb:/# ip route list match 10.10.10.1
default via 10.10.10.1 dev eth0 
10.10.10.0/24 dev eth0  proto kernel  scope link  src 10.10.10.2 
labb:/# 

Maybe the unordered dump can be ordered cheaply...

Cheers.
				--ro


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 14:06 ` Robert Olsson
@ 2008-01-23 16:31   ` Stephen Hemminger
  2008-01-23 23:49   ` Stephen Hemminger
  1 sibling, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-23 16:31 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David Miller, netdev

Robert Olsson wrote:
> Stephen Hemminger writes:
>
>  > Time to handle a full BGP load (163K of routes).
>  > 
>  > Before:		Load		Dump		Flush
>  >
>  > kmem_cache	3.8		13.0		7.2
>  > iter		3.9		12.3		6.9
>  > unordered	3.1		11.9		4.9
>  > find_node	3.1		 0.3		1.2
>
>  I certainly like the speed but what will we brake when
>  we don't return in longest prefix order?
>
> labb:/# ip r
> default via 10.10.10.1 dev eth0 
> 5.0.0.0/8 via 192.168.2.2 dev eth3 
> 10.10.10.0/24 dev eth0  proto kernel  scope link  src 10.10.10.2 
> 10.10.11.0/24 dev eth1  proto kernel  scope link  src 10.10.11.1 
> 11.0.0.0/8 via 10.10.11.2 dev eth1 
> 192.168.1.0/24 dev eth2  proto kernel  scope link  src 192.168.1.2 
> 192.168.2.0/24 dev eth3  proto kernel  scope link  src 192.168.2.1 
>
> labb:/# ip route list match 10.10.10.1
> default via 10.10.10.1 dev eth0 
> 10.10.10.0/24 dev eth0  proto kernel  scope link  src 10.10.10.2 
> labb:/# 
>
> Maybe the unordered dump can be ordered cheaply...
>
> Cheers.
> 				--ro
>
>   
Hash returned the routes in prefix order (then random).  Returning the 
routes in numerical order
seems just as logical. I'm going to test on quagga.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 14:06 ` Robert Olsson
  2008-01-23 16:31   ` Stephen Hemminger
@ 2008-01-23 23:49   ` Stephen Hemminger
  2008-01-24  9:36     ` Robert Olsson
  2008-02-01 18:00     ` Robert Olsson
  1 sibling, 2 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-23 23:49 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David Miller, netdev

On Wed, 23 Jan 2008 15:06:47 +0100
Robert Olsson <Robert.Olsson@data.slu.se> wrote:

> 
> Stephen Hemminger writes:
> 
>  > Time to handle a full BGP load (163K of routes).
>  > 
>  > Before:		Load		Dump		Flush
>  >
>  > kmem_cache	3.8		13.0		7.2
>  > iter		3.9		12.3		6.9
>  > unordered	3.1		11.9		4.9
>  > find_node	3.1		 0.3		1.2
> 
>  I certainly like the speed but what will we brake when
>  we don't return in longest prefix order?
> 
> labb:/# ip r
> default via 10.10.10.1 dev eth0 
> 5.0.0.0/8 via 192.168.2.2 dev eth3 
> 10.10.10.0/24 dev eth0  proto kernel  scope link  src 10.10.10.2 
> 10.10.11.0/24 dev eth1  proto kernel  scope link  src 10.10.11.1 
> 11.0.0.0/8 via 10.10.11.2 dev eth1 
> 192.168.1.0/24 dev eth2  proto kernel  scope link  src 192.168.1.2 
> 192.168.2.0/24 dev eth3  proto kernel  scope link  src 192.168.2.1 
> 
> labb:/# ip route list match 10.10.10.1
> default via 10.10.10.1 dev eth0 
> 10.10.10.0/24 dev eth0  proto kernel  scope link  src 10.10.10.2 
> labb:/# 
> 
> Maybe the unordered dump can be ordered cheaply...

Dumping by prefix is possible, but unless 32x slower. Dumping in
address order is just as logical. Like I said, I'm investigating what
quagga handles.
-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 23:49   ` Stephen Hemminger
@ 2008-01-24  9:36     ` Robert Olsson
  2008-01-24 16:18       ` Stephen Hemminger
  2008-02-01 18:00     ` Robert Olsson
  1 sibling, 1 reply; 17+ messages in thread
From: Robert Olsson @ 2008-01-24  9:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, netdev


Stephen Hemminger writes:

 > Dumping by prefix is possible, but unless 32x slower. Dumping in
 > address order is just as logical. Like I said, I'm investigating what
 > quagga handles.

 How about taking a snapshot to in address order (as you did) to some 
 allocated memory, returning from that memory in prefix order? This would 
 solve the  -EBUSY too and give a consistent view of the routing table at
 the time for the dump/snapshot.

 Cheers
					--ro

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [IPV4 0/9] TRIE performance patches
  2008-01-24  9:36     ` Robert Olsson
@ 2008-01-24 16:18       ` Stephen Hemminger
  0 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-24 16:18 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Robert Olsson, David Miller, netdev

On Thu, 24 Jan 2008 10:36:45 +0100
Robert Olsson <Robert.Olsson@data.slu.se> wrote:

> 
> Stephen Hemminger writes:
> 
>  > Dumping by prefix is possible, but unless 32x slower. Dumping in
>  > address order is just as logical. Like I said, I'm investigating what
>  > quagga handles.
> 
>  How about taking a snapshot to in address order (as you did) to some 
>  allocated memory, returning from that memory in prefix order? This would 
>  solve the  -EBUSY too and give a consistent view of the routing table at
>  the time for the dump/snapshot.
> 
>  Cheers
> 					--ro

Snapshotting is not going to work, because of scale and because the kernel can't
tell when the application is going to come back.

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 23:49   ` Stephen Hemminger
  2008-01-24  9:36     ` Robert Olsson
@ 2008-02-01 18:00     ` Robert Olsson
  1 sibling, 0 replies; 17+ messages in thread
From: Robert Olsson @ 2008-02-01 18:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, netdev


Hello, finally got some time to test...

Table w. 214k routes with full rDoS on two interfaces on 2 x AMD64 processors, 
speed 2814.43 MHz. Profiled with CPU_CLK_UNHALTED and rtstat

w/o latest fib_trie patches. Tput ~233 kpps  

samples  %        symbol name

109925   14.4513  fn_trie_lookup
109821   14.4376  ip_route_input
87245    11.4696  rt_intern_hash
31270     4.1109  kmem_cache_alloc
24159     3.1761  dev_queue_xmit
23200     3.0500  neigh_lookup
22464     2.9532  free_block
18412     2.4205  kmem_cache_free
17830     2.3440  dst_destroy
15740     2.0693  fib_get_table

With the latest fib_trie patches (Stephen's others): same throughput 
~233 kpps but we see a different profile. Why we don't get any 
better throughput is yet to be understood (the drops in 
qdisc could be the cause); more analysis needed.

79242    14.3520  ip_route_input
65188    11.8066  fn_trie_lookup
64559    11.6927  rt_intern_hash
22901     4.1477  kmem_cache_alloc
21038     3.8103  check_leaf
16197     2.9335  neigh_lookup
14802     2.6809  free_block
14596     2.6436  ip_rcv_finish
12365     2.2395  fib_validate_source
12048     2.1821  dst_destroy

fib_hash thoughput ~177 kpps  
Hard work for fib_hash here as we have many zones.
It can be fast with fewer zones.

200568   37.8013  fn_hash_lookup
58352    10.9977  ip_route_input
44495     8.3860  rt_intern_hash
12873     2.4262  kmem_cache_alloc
12115     2.2833  rt_may_expire
11691     2.2034  rt_garbage_collect
10821     2.0394  dev_queue_xmit
9999      1.8845  fib_validate_source
8762      1.6514  fib_get_table
8558      1.6129  fib_semantic_match


Cheers
					--ro

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2008-02-01 18:01 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger
2008-01-22 23:37 ` [IPV4 2/9] fib_trie: style cleanup Stephen Hemminger
2008-01-22 23:37 ` [IPV4 3/9] fib_trie: compute size when needed Stephen Hemminger
2008-01-22 23:37 ` [IPV4 4/9] fib_trie: use hash list Stephen Hemminger
2008-01-22 23:37 ` [IPV4 5/9] fib_trie: dump message multiple part flag Stephen Hemminger
2008-01-22 23:37 ` [IPV4 6/9] fib_trie: iterator recode Stephen Hemminger
2008-01-22 23:37 ` [IPV4 7/9] fib_trie: dump table in sorted order Stephen Hemminger
2008-01-22 23:37 ` [IPV4 8/9] fib_trie: avoid extra search on delete Stephen Hemminger
2008-01-22 23:37 ` [IPV4 9/9] fib_trie: avoid rescan on dump Stephen Hemminger
2008-01-23  5:58 ` [IPV4 0/9] TRIE performance patches David Miller
2008-01-23 14:06 ` Robert Olsson
2008-01-23 16:31   ` Stephen Hemminger
2008-01-23 23:49   ` Stephen Hemminger
2008-01-24  9:36     ` Robert Olsson
2008-01-24 16:18       ` Stephen Hemminger
2008-02-01 18:00     ` Robert Olsson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).