* [IPV4 0/9] TRIE performance patches
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
Time to handle a full BGP load (163K routes); times in seconds.

Before:         Load    Dump    Flush
hash             3.5     0.5     0.7
2.6.23.14        3.4    19.3    10.3
net-2.6.25       3.4    18.7     9.8

After:
kmem_cache       3.8    13.0     7.2
iter             3.9    12.3     6.9
unordered        3.1    11.9     4.9
find_node        3.1     0.3     1.2
Load: ip -batch iproute-table
Dump: ip route >/dev/null
Flush: ip route flush table main
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 1/9] fib_trie: put leaf nodes in a slab cache
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev, Stephen Hemminger
[-- Attachment #1: leaf-kmem-cache.patch --]
[-- Type: text/plain, Size: 1669 bytes --]
This improves locality for operations that touch all the leaves.
It also saves space, since these entries don't need to be
hardware-cache aligned.
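As background, a minimal sketch of the slab-cache pattern the patch
applies (the example_* names are illustrative, not from the patch;
note the real code defers freeing through call_rcu(), as
__leaf_free_rcu() in the diff shows). The cache is sized for the
larger of the two leaf structures so both can share it, and
SLAB_HWCACHE_ALIGN is deliberately omitted so objects pack tightly:

/* Illustrative sketch only -- not part of the patch */
static struct kmem_cache *example_leaf_kmem __read_mostly;

static void __init example_cache_init(void)
{
	/* SLAB_PANIC: no NULL check needed, boot fails instead */
	example_leaf_kmem = kmem_cache_create("example_leaf",
					      max(sizeof(struct leaf),
						  sizeof(struct leaf_info)),
					      0, SLAB_PANIC, NULL);
}

static struct leaf *example_leaf_alloc(void)
{
	return kmem_cache_alloc(example_leaf_kmem, GFP_KERNEL);
}

static void example_leaf_free(struct leaf *l)
{
	/* the patch defers this via call_rcu() instead */
	kmem_cache_free(example_leaf_kmem, l);
}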
Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-21 10:16:10.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-21 10:18:42.000000000 -0800
@@ -162,6 +162,7 @@ static struct tnode *halve(struct trie *
static void tnode_free(struct tnode *tn);
static struct kmem_cache *fn_alias_kmem __read_mostly;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
static inline struct tnode *node_parent(struct node *node)
{
@@ -325,7 +326,8 @@ static inline void alias_free_mem_rcu(st
static void __leaf_free_rcu(struct rcu_head *head)
{
- kfree(container_of(head, struct leaf, rcu));
+ struct leaf *l = container_of(head, struct leaf, rcu);
+ kmem_cache_free(trie_leaf_kmem, l);
}
static void __leaf_info_free_rcu(struct rcu_head *head)
@@ -375,7 +377,7 @@ static inline void tnode_free(struct tno
static struct leaf *leaf_new(void)
{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
l->parent = T_LEAF;
INIT_HLIST_HEAD(&l->list);
@@ -1935,7 +1937,12 @@ out:
void __init fib_hash_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ 0, SLAB_PANIC, NULL);
+
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ max(sizeof(struct leaf),
+ sizeof(struct leaf_info)),
+ 0, SLAB_PANIC, NULL);
}
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 2/9] fib_trie: style cleanup
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev, Stephen Hemminger
[-- Attachment #1: fib-trie-style.patch --]
[-- Type: text/plain, Size: 18775 bytes --]
Style cleanups:
* make check_leaf() return -1 or plen directly, rather than returning plen by reference
* get rid of an #ifdef (HL_OPTIMIZE) that is always set
* split out embedded function calls in if statements
* fix checkpatch warnings
Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-21 10:18:42.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-21 10:19:00.000000000 -0800
@@ -155,7 +155,8 @@ struct trie {
};
static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+ int wasfull);
static struct node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
@@ -395,7 +396,7 @@ static struct leaf_info *leaf_info_new(i
return li;
}
-static struct tnode* tnode_new(t_key key, int pos, int bits)
+static struct tnode *tnode_new(t_key key, int pos, int bits)
{
size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
struct tnode *tn = tnode_alloc(sz);
@@ -427,7 +428,8 @@ static inline int tnode_full(const struc
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
-static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
+static inline void put_child(struct trie *t, struct tnode *tn, int i,
+ struct node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -437,7 +439,8 @@ static inline void put_child(struct trie
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+ int wasfull)
{
struct node *chi = tn->child[i];
int isfull;
@@ -577,11 +580,13 @@ static struct node *resize(struct trie *
err = 0;
max_resize = 10;
while ((tn->full_children > 0 && max_resize-- &&
- 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
- inflate_threshold_use * tnode_child_length(tn))) {
+ 50 * (tn->full_children + tnode_child_length(tn)
+ - tn->empty_children)
+ >= inflate_threshold_use * tnode_child_length(tn))) {
old_tn = tn;
tn = inflate(t, tn);
+
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -593,11 +598,13 @@ static struct node *resize(struct trie *
if (max_resize < 0) {
if (!tn->parent)
- printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
- inflate_threshold_root, tn->bits);
+ pr_warning("Fix inflate_threshold_root."
+ " Now=%d size=%d bits\n",
+ inflate_threshold_root, tn->bits);
else
- printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
- inflate_threshold, tn->bits);
+ pr_warning("Fix inflate_threshold."
+ " Now=%d size=%d bits\n",
+ inflate_threshold, tn->bits);
}
check_tnode(tn);
@@ -634,11 +641,13 @@ static struct node *resize(struct trie *
if (max_resize < 0) {
if (!tn->parent)
- printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
- halve_threshold_root, tn->bits);
+ pr_warning("Fix halve_threshold_root."
+ " Now=%d size=%d bits\n",
+ halve_threshold_root, tn->bits);
else
- printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
- halve_threshold, tn->bits);
+ pr_warning("Fix halve_threshold."
+ " Now=%d size=%d bits\n",
+ halve_threshold, tn->bits);
}
/* Only one child remains */
@@ -681,8 +690,9 @@ static struct tnode *inflate(struct trie
*/
for (i = 0; i < olen; i++) {
- struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+ struct tnode *inode;
+ inode = (struct tnode *) tnode_get_child(oldtnode, i);
if (inode &&
IS_TNODE(inode) &&
inode->pos == oldtnode->pos + oldtnode->bits &&
@@ -722,8 +732,9 @@ static struct tnode *inflate(struct trie
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
- 1) == 0)
+ if (tkey_extract_bits(node->key,
+ oldtnode->pos + oldtnode->bits,
+ 1) == 0)
put_child(t, tn, 2*i, node);
else
put_child(t, tn, 2*i+1, node);
@@ -899,7 +910,7 @@ static struct leaf_info *find_leaf_info(
return NULL;
}
-static inline struct list_head * get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct leaf *l, int plen)
{
struct leaf_info *li = find_leaf_info(l, plen);
@@ -949,7 +960,10 @@ fib_find_node(struct trie *t, u32 key)
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
pos = tn->pos + tn->bits;
- n = tnode_get_child_rcu(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child_rcu(tn,
+ tkey_extract_bits(key,
+ tn->pos,
+ tn->bits));
} else
break;
}
@@ -970,8 +984,10 @@ static struct node *trie_rebalance(struc
while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *) resize (t, (struct tnode *)tn);
- tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+ tn = (struct tnode *) resize(t, (struct tnode *)tn);
+
+ tnode_put_child_reorg((struct tnode *)tp, cindex,
+ (struct node *)tn, wasfull);
tp = node_parent((struct node *) tn);
if (!tp)
@@ -981,9 +997,9 @@ static struct node *trie_rebalance(struc
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode*) resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, (struct tnode *)tn);
- return (struct node*) tn;
+ return (struct node *)tn;
}
/* only used from updater-side */
@@ -1028,7 +1044,10 @@ static struct list_head *fib_insert_node
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child(tn,
+ tkey_extract_bits(key,
+ tn->pos,
+ tn->bits));
BUG_ON(n && node_parent(n) != tn);
} else
@@ -1113,16 +1132,18 @@ static struct list_head *fib_insert_node
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
+ put_child(t, (struct tnode *)tp, cindex,
+ (struct node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
+ rcu_assign_pointer(t->trie, (struct node *)tn);
tp = tn;
}
}
if (tp && tp->pos + tp->bits > 32)
- printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
+ pr_warning("fib_trie"
+ " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+ tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
@@ -1232,10 +1253,10 @@ static int fn_trie_insert(struct fib_tab
break;
if (fa->fa_type == cfg->fc_type &&
fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
+ fa->fa_info == fi)
goto out;
- }
}
+
if (!(cfg->fc_nlflags & NLM_F_APPEND))
fa = fa_orig;
}
@@ -1286,38 +1307,40 @@ err:
/* should be called with rcu_read_lock */
-static inline int check_leaf(struct trie *t, struct leaf *l,
- t_key key, int *plen, const struct flowi *flp,
- struct fib_result *res)
+static int check_leaf(struct trie *t, struct leaf *l,
+ t_key key, const struct flowi *flp,
+ struct fib_result *res)
{
- int err, i;
- __be32 mask;
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
struct hlist_node *node;
hlist_for_each_entry_rcu(li, node, hhead, hlist) {
- i = li->plen;
- mask = inet_make_mask(i);
+ int err;
+ int plen = li->plen;
+ __be32 mask = inet_make_mask(plen);
+
if (l->key != (key & ntohl(mask)))
continue;
- if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
- *plen = i;
+ err = fib_semantic_match(&li->falh, flp, res,
+ htonl(l->key), mask, plen);
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
+ if (err <= 0)
t->stats.semantic_match_passed++;
+ else
+ t->stats.semantic_match_miss++;
#endif
- return err;
- }
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_miss++;
-#endif
+ if (err <= 0)
+ return plen;
}
- return 1;
+
+ return -1;
}
-static int
-fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
+ struct fib_result *res)
{
struct trie *t = (struct trie *) tb->tb_data;
int plen, ret = 0;
@@ -1344,10 +1367,13 @@ fn_trie_lookup(struct fib_table *tb, con
/* Just a leaf? */
if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- goto failed;
+ plen = check_leaf(t, (struct leaf *)n, key, flp, res);
+ if (plen < 0)
+ goto failed;
+ ret = 0;
+ goto found;
}
+
pn = (struct tnode *) n;
chopped_off = 0;
@@ -1369,14 +1395,14 @@ fn_trie_lookup(struct fib_table *tb, con
}
if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- else
+ plen = check_leaf(t, (struct leaf *)n, key, flp, res);
+ if (plen < 0)
goto backtrace;
+
+ ret = 0;
+ goto found;
}
-#define HL_OPTIMIZE
-#ifdef HL_OPTIMIZE
cn = (struct tnode *)n;
/*
@@ -1405,12 +1431,13 @@ fn_trie_lookup(struct fib_table *tb, con
* *are* zero.
*/
- /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+ /* NOTA BENE: Checking only skipped bits
+ for the new node here */
if (current_prefix_length < pos+bits) {
if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length) != 0 ||
- !(cn->child[0]))
+ cn->pos - current_prefix_length)
+ || !(cn->child[0]))
goto backtrace;
}
@@ -1433,14 +1460,17 @@ fn_trie_lookup(struct fib_table *tb, con
* new tnode's key.
*/
- /* Note: We aren't very concerned about the piece of the key
- * that precede pn->pos+pn->bits, since these have already been
- * checked. The bits after cn->pos aren't checked since these are
- * by definition "unknown" at this point. Thus, what we want to
- * see is if we are about to enter the "prefix matching" state,
- * and in that case verify that the skipped bits that will prevail
- * throughout this subtree are zero, as they have to be if we are
- * to find a matching prefix.
+ /*
+ * Note: We aren't very concerned about the piece of
+ * the key that precede pn->pos+pn->bits, since these
+ * have already been checked. The bits after cn->pos
+ * aren't checked since these are by definition
+ * "unknown" at this point. Thus, what we want to see
+ * is if we are about to enter the "prefix matching"
+ * state, and in that case verify that the skipped
+ * bits that will prevail throughout this subtree are
+ * zero, as they have to be if we are to find a
+ * matching prefix.
*/
node_prefix = mask_pfx(cn->key, cn->pos);
@@ -1448,13 +1478,15 @@ fn_trie_lookup(struct fib_table *tb, con
pref_mismatch = key_prefix^node_prefix;
mp = 0;
- /* In short: If skipped bits in this node do not match the search
- * key, enter the "prefix matching" state.directly.
+ /*
+ * In short: If skipped bits in this node do not match
+ * the search key, enter the "prefix matching"
+ * state.directly.
*/
if (pref_mismatch) {
while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
mp++;
- pref_mismatch = pref_mismatch <<1;
+ pref_mismatch = pref_mismatch << 1;
}
key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
@@ -1464,7 +1496,7 @@ fn_trie_lookup(struct fib_table *tb, con
if (current_prefix_length >= cn->pos)
current_prefix_length = mp;
}
-#endif
+
pn = (struct tnode *)n; /* Descend */
chopped_off = 0;
continue;
@@ -1473,12 +1505,14 @@ backtrace:
chopped_off++;
/* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
+ while ((chopped_off <= pn->bits)
+ && !(cindex & (1<<(chopped_off-1))))
chopped_off++;
/* Decrease current_... with bits chopped off */
if (current_prefix_length > pn->pos + pn->bits - chopped_off)
- current_prefix_length = pn->pos + pn->bits - chopped_off;
+ current_prefix_length = pn->pos + pn->bits
+ - chopped_off;
/*
* Either we do the actual chop off according or if we have
@@ -1528,7 +1562,8 @@ static int trie_leaf_remove(struct trie
while (n != NULL && IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
check_tnode(tn);
- n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child(tn, tkey_extract_bits(key,
+ tn->pos, tn->bits));
BUG_ON(n && node_parent(n) != tn);
}
@@ -1694,7 +1729,7 @@ static struct leaf *nextleaf(struct trie
if (IS_LEAF(trie)) /* trie w. just a leaf */
return (struct leaf *) trie;
- p = (struct tnode*) trie; /* Start */
+ p = (struct tnode *)trie; /* Start */
} else
p = node_parent_rcu(c);
@@ -1762,8 +1797,9 @@ static int fn_trie_flush(struct fib_tabl
return found;
}
-static void
-fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+static void fn_trie_select_default(struct fib_table *tb,
+ const struct flowi *flp,
+ struct fib_result *res)
{
struct trie *t = (struct trie *) tb->tb_data;
int order, last_idx;
@@ -1834,7 +1870,8 @@ out:
rcu_read_unlock();
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
+ struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
int i, s_i;
@@ -1873,8 +1910,8 @@ static int fn_trie_dump_fa(t_key key, in
return skb->len;
}
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
{
int h, s_h;
struct list_head *fa_head;
@@ -1897,7 +1934,7 @@ static int fn_trie_dump_plen(struct trie
if (list_empty(fa_head))
continue;
- if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
+ if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
cb->args[3] = h;
return -1;
}
@@ -1906,7 +1943,8 @@ static int fn_trie_dump_plen(struct trie
return skb->len;
}
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
int m, s_m;
struct trie *t = (struct trie *) tb->tb_data;
@@ -1921,7 +1959,7 @@ static int fn_trie_dump(struct fib_table
memset(&cb->args[3], 0,
sizeof(cb->args) - 3*sizeof(cb->args[0]));
- if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
+ if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
cb->args[2] = m;
goto out;
}
@@ -1936,7 +1974,8 @@ out:
void __init fib_hash_init(void)
{
- fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
@@ -1970,7 +2009,7 @@ struct fib_table *fib_hash_table(u32 id)
memset(t, 0, sizeof(*t));
if (id == RT_TABLE_LOCAL)
- printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
+ pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
return tb;
}
@@ -2104,7 +2143,8 @@ static void trie_show_stats(struct seq_f
else
avdepth = 0;
- seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 );
+ seq_printf(seq, "\tAver depth: %u.%02d\n",
+ avdepth / 100, avdepth % 100);
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
@@ -2136,16 +2176,20 @@ static void trie_show_usage(struct seq_f
const struct trie_use_stats *stats)
{
seq_printf(seq, "\nCounters:\n---------\n");
- seq_printf(seq,"gets = %u\n", stats->gets);
- seq_printf(seq,"backtracks = %u\n", stats->backtrack);
- seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed);
- seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss);
- seq_printf(seq,"null node hit= %u\n", stats->null_node_hit);
- seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped);
+ seq_printf(seq, "gets = %u\n", stats->gets);
+ seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+ seq_printf(seq, "semantic match passed = %u\n",
+ stats->semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n",
+ stats->semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n",
+ stats->resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
-static void fib_trie_show(struct seq_file *seq, const char *name, struct trie *trie)
+static void fib_trie_show(struct seq_file *seq, const char *name,
+ struct trie *trie)
{
struct trie_stat stat;
@@ -2163,7 +2207,8 @@ static int fib_triestat_seq_show(struct
struct fib_table *tb;
seq_printf(seq,
- "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+ "Basic info: size of leaf:"
+ " %Zd bytes, size of tnode: %Zd bytes.\n",
sizeof(struct leaf), sizeof(struct tnode));
tb = fib_get_table(net, RT_TABLE_LOCAL);
@@ -2436,10 +2481,11 @@ static int fib_route_seq_show(struct seq
if (iter->trie == iter->trie_local)
return 0;
+
if (IS_TNODE(l))
return 0;
- for (i=32; i>=0; i--) {
+ for (i = 32; i >= 0; i--) {
struct leaf_info *li = find_leaf_info(l, i);
struct fib_alias *fa;
__be32 mask, prefix;
@@ -2466,7 +2512,8 @@ static int fib_route_seq_show(struct seq
fi->fib_nh->nh_gw, flags, 0, 0,
fi->fib_priority,
mask,
- (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
else
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 3/9] fib_trie: compute size when needed
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
[-- Attachment #1: fib-trie-stats-x.patch --]
[-- Type: text/plain, Size: 2412 bytes --]
Compute the number of prefixes when needed, rather than doing bookkeeping.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-21 17:45:03.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-21 17:45:03.000000000 -0800
@@ -143,12 +143,12 @@ struct trie_stat {
unsigned int tnodes;
unsigned int leaves;
unsigned int nullpointers;
+ unsigned int prefixes;
unsigned int nodesizes[MAX_STAT_DEPTH];
};
struct trie {
struct node *trie;
- unsigned int size;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
@@ -1289,8 +1289,6 @@ static int fn_trie_insert(struct fib_tab
list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));
- t->size++;
-
rt_cache_flush(-1);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1576,9 +1574,6 @@ static int trie_leaf_remove(struct trie
* Key found.
* Remove the leaf and rebalance the tree
*/
-
- t->size--;
-
tp = node_parent(n);
tnode_free((struct tnode *) n);
@@ -2111,10 +2106,17 @@ static void trie_collect_stats(struct tr
for (n = fib_trie_get_first(&iter, t); n;
n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
+ struct leaf *l = (struct leaf *)n;
+ struct leaf_info *li;
+ struct hlist_node *tmp;
+
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
+
+ hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+ ++s->prefixes;
} else {
const struct tnode *tn = (const struct tnode *) n;
int i;
@@ -2148,8 +2150,11 @@ static void trie_show_stats(struct seq_f
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
-
bytes = sizeof(struct leaf) * stat->leaves;
+
+ seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+ bytes += sizeof(struct leaf_info) * stat->prefixes;
+
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
bytes += sizeof(struct tnode) * stat->tnodes;
@@ -2193,8 +2198,8 @@ static void fib_trie_show(struct seq_fil
{
struct trie_stat stat;
- seq_printf(seq, "%s: %d\n", name, trie->size);
trie_collect_stats(trie, &stat);
+ seq_printf(seq, "%s:\n", name);
trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
trie_show_usage(seq, &trie->stats);
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 4/9] fib_trie: use hash list
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev, Stephen Hemminger
[-- Attachment #1: fib-use-the-list.patch --]
[-- Type: text/plain, Size: 2587 bytes --]
The dump code can walk the leaf's existing hash chain of prefixes
rather than doing a repeated lookup for every prefix length.
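A condensed sketch of the change in iteration strategy
(dump_one_prefix() is a placeholder, not a real kernel function):

struct leaf_info *li;
struct hlist_node *node;
int i;

/* before: up to 33 hash-chain searches per leaf */
for (i = 32; i >= 0; i--) {
	li = find_leaf_info(l, i);
	if (li)
		dump_one_prefix(l->key, i, &li->falh);
}

/* after: one RCU-safe walk of the chain the leaf already keeps */
hlist_for_each_entry_rcu(li, node, &l->list, hlist)
	dump_one_prefix(l->key, li->plen, &li->falh);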
Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-21 17:45:03.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-21 17:45:08.000000000 -0800
@@ -2396,31 +2396,30 @@ static int fib_trie_seq_show(struct seq_
} else {
struct leaf *l = (struct leaf *) n;
- int i;
+ struct leaf_info *li;
+ struct hlist_node *node;
+
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
- for (i = 32; i >= 0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
- if (li) {
- struct fib_alias *fa;
+ hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ struct fib_alias *fa;
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- char buf1[32], buf2[32];
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ char buf1[32], buf2[32];
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", i,
- rtn_scope(buf1, sizeof(buf1),
- fa->fa_scope),
- rtn_type(buf2, sizeof(buf2),
- fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, "tos =%d\n",
- fa->fa_tos);
- seq_putc(seq, '\n');
- }
+ seq_indent(seq, iter->depth+1);
+ seq_printf(seq, " /%d %s %s", li->plen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, "tos =%d\n",
+ fa->fa_tos);
+ seq_putc(seq, '\n');
}
}
}
@@ -2474,8 +2473,8 @@ static int fib_route_seq_show(struct seq
{
const struct fib_trie_iter *iter = seq->private;
struct leaf *l = v;
- int i;
- char bf[128];
+ struct leaf_info *li;
+ struct hlist_node *node;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2490,8 +2489,7 @@ static int fib_route_seq_show(struct seq
if (IS_TNODE(l))
return 0;
- for (i = 32; i >= 0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
+ hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
struct fib_alias *fa;
__be32 mask, prefix;
@@ -2504,6 +2502,7 @@ static int fib_route_seq_show(struct seq
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+ char bf[128];
if (fa->fa_type == RTN_BROADCAST
|| fa->fa_type == RTN_MULTICAST)
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 5/9] fib_trie: dump message multiple part flag
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
[-- Attachment #1: fib-multi.patch --]
[-- Type: text/plain, Size: 528 bytes --]
Match fib_hash behavior, and set NLM_F_MULTI to flag multi-part dump messages.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-21 17:52:10.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-21 17:52:11.000000000 -0800
@@ -1895,7 +1895,7 @@ static int fn_trie_dump_fa(t_key key, in
xkey,
plen,
fa->fa_tos,
- fa->fa_info, 0) < 0) {
+ fa->fa_info, NLM_F_MULTI) < 0) {
cb->args[4] = i;
return -1;
}
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 6/9] fib_trie: iterator recode
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
Remove the complex loop structure of nextleaf() and replace it with a
simpler tree walker. This improves performance and is much cleaner.
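The walker splits into trie_firstleaf()/trie_nextleaf(), giving
callers a plain iteration idiom (sketch; process_leaf() is a
placeholder):

struct leaf *l;

for (l = trie_firstleaf(t); l; l = trie_nextleaf(l))
	process_leaf(l);	/* e.g. flush or dump this leaf */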
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 09:52:46.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 12:58:59.000000000 -0800
@@ -1708,64 +1708,65 @@ static int trie_flush_leaf(struct trie *
return found;
}
-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+/*
+ * Scan for the next right leaf starting at node p->child[idx]
+ * Since we have back pointer, no recursion necessary.
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
{
- struct node *c = (struct node *) thisleaf;
- struct tnode *p;
- int idx;
- struct node *trie = rcu_dereference(t->trie);
-
- if (c == NULL) {
- if (trie == NULL)
- return NULL;
-
- if (IS_LEAF(trie)) /* trie w. just a leaf */
- return (struct leaf *) trie;
-
- p = (struct tnode *)trie; /* Start */
- } else
- p = node_parent_rcu(c);
+ do {
+ t_key idx;
- while (p) {
- int pos, last;
-
- /* Find the next child of the parent */
if (c)
- pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+ idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
else
- pos = 0;
-
- last = 1 << p->bits;
- for (idx = pos; idx < last ; idx++) {
- c = rcu_dereference(p->child[idx]);
+ idx = 0;
+ while (idx < 1u << p->bits) {
+ c = tnode_get_child_rcu(p, idx++);
if (!c)
continue;
- /* Decend if tnode */
- while (IS_TNODE(c)) {
- p = (struct tnode *) c;
- idx = 0;
-
- /* Rightmost non-NULL branch */
- if (p && IS_TNODE(p))
- while (!(c = rcu_dereference(p->child[idx]))
- && idx < (1<<p->bits)) idx++;
-
- /* Done with this tnode? */
- if (idx >= (1 << p->bits) || !c)
- goto up;
+ if (IS_LEAF(c)) {
+ prefetch(p->child[idx]);
+ return (struct leaf *) c;
}
- return (struct leaf *) c;
+
+ /* Rescan start scanning in new node */
+ p = (struct tnode *) c;
+ idx = 0;
}
-up:
- /* No more children go up one step */
+
+ /* Node empty, walk back up to parent */
c = (struct node *) p;
- p = node_parent_rcu(c);
- }
- return NULL; /* Ready. Root of trie */
+ } while ( (p = node_parent_rcu(c)) != NULL);
+
+ return NULL; /* Root of trie */
+}
+
+
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+ struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+
+ if (!n)
+ return NULL;
+
+ if (IS_LEAF(n)) /* trie is just a leaf */
+ return (struct leaf *) n;
+
+ return leaf_walk_rcu(n, NULL);
+}
+
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+ struct node *c = (struct node *) l;
+ struct tnode *p = node_parent(c);
+
+ if (!p)
+ return NULL; /* trie with just one leaf */
+
+ return leaf_walk_rcu(p, c);
}
/*
@@ -1775,9 +1776,9 @@ static int fn_trie_flush(struct fib_tabl
{
struct trie *t = (struct trie *) tb->tb_data;
struct leaf *ll = NULL, *l = NULL;
- int found = 0, h;
+ int found = 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+ for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
found += trie_flush_leaf(t, l);
if (ll && hlist_empty(&ll->list))
@@ -1884,7 +1885,6 @@ static int fn_trie_dump_fa(t_key key, in
i++;
continue;
}
- BUG_ON(!fa->fa_info);
if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
@@ -1913,8 +1913,9 @@ static int fn_trie_dump_plen(struct trie
struct leaf *l = NULL;
s_h = cb->args[3];
+ h = 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+ for (l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
if (h < s_h)
continue;
if (h > s_h)
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 7/9] fib_trie: dump table in sorted order
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
[-- Attachment #1: fib-trie-dump-unordered.patch --]
[-- Type: text/plain, Size: 2689 bytes --]
With the trie it is easier to dump the table by traversal rather than
iterating over every possible prefix length. This saves some time and
makes the dump come out in sorted order, since a left-to-right walk
visits the leaves in ascending key order.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 12:58:59.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 12:59:03.000000000 -0800
@@ -1905,67 +1905,71 @@ static int fn_trie_dump_fa(t_key key, in
return skb->len;
}
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
{
- int h, s_h;
- struct list_head *fa_head;
- struct leaf *l = NULL;
+ struct leaf_info *li;
+ struct hlist_node *node;
+ int i, s_i;
- s_h = cb->args[3];
- h = 0;
+ s_i = cb->args[3];
+ i = 0;
- for (l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
- if (h < s_h)
+ /* rcu_read_lock is hold by caller */
+ hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ if (i < s_i) {
+ i++;
continue;
- if (h > s_h)
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
-
- fa_head = get_fa_head(l, plen);
+ }
- if (!fa_head)
- continue;
+ if (i > s_i)
+ cb->args[4] = 0;
- if (list_empty(fa_head))
+ if (list_empty(&li->falh))
continue;
- if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
- cb->args[3] = h;
+ if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+ cb->args[3] = i;
return -1;
}
+ i++;
}
- cb->args[3] = h;
+
+ cb->args[3] = i;
return skb->len;
}
+
+
static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- int m, s_m;
+ struct leaf *l;
struct trie *t = (struct trie *) tb->tb_data;
-
- s_m = cb->args[2];
+ int h = 0;
+ int s_h = cb->args[2];
rcu_read_lock();
- for (m = 0; m <= 32; m++) {
- if (m < s_m)
+ for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
+ if (h < s_h)
continue;
- if (m > s_m)
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
-
- if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
- cb->args[2] = m;
- goto out;
+
+ if (h > s_h) {
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+
+ if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ rcu_read_unlock();
+ cb->args[2] = h;
+ return -1;
}
}
rcu_read_unlock();
- cb->args[2] = m;
+
+ cb->args[2] = h;
return skb->len;
-out:
- rcu_read_unlock();
- return -1;
}
void __init fib_hash_init(void)
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 8/9] fib_trie: avoid extra search on delete
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
[-- Attachment #1: trie-remove-desquare.patch --]
[-- Type: text/plain, Size: 2593 bytes --]
Get rid of the extra search that made route deletion O(n):
fn_trie_delete() already has the leaf in hand, so pass it to
trie_leaf_remove() directly instead of re-searching the trie by key.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 15:24:41.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 15:25:32.000000000 -0800
@@ -1542,49 +1542,23 @@ found:
return ret;
}
-/* only called from updater side */
-static int trie_leaf_remove(struct trie *t, t_key key)
+/*
+ * Remove the leaf and return parent.
+ */
+static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- t_key cindex;
- struct tnode *tp = NULL;
- struct node *n = t->trie;
- struct leaf *l;
-
- pr_debug("entering trie_leaf_remove(%p)\n", n);
-
- /* Note that in the case skipped bits, those bits are *not* checked!
- * When we finish this, we will have NULL or a T_LEAF, and the
- * T_LEAF may or may not match our key.
- */
-
- while (n != NULL && IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- check_tnode(tn);
- n = tnode_get_child(tn, tkey_extract_bits(key,
- tn->pos, tn->bits));
-
- BUG_ON(n && node_parent(n) != tn);
- }
- l = (struct leaf *) n;
-
- if (!n || !tkey_equals(l->key, key))
- return 0;
+ struct tnode *tp = node_parent((struct node *) l);
- /*
- * Key found.
- * Remove the leaf and rebalance the tree
- */
- tp = node_parent(n);
- tnode_free((struct tnode *) n);
+ pr_debug("entering trie_leaf_remove(%p)\n", l);
if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, NULL);
rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
} else
rcu_assign_pointer(t->trie, NULL);
- return 1;
+ tnode_free((struct tnode *) l);
}
/*
@@ -1662,7 +1636,7 @@ static int fn_trie_delete(struct fib_tab
}
if (hlist_empty(&l->list))
- trie_leaf_remove(t, key);
+ trie_leaf_remove(t, l);
if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(-1);
@@ -1775,19 +1749,19 @@ static struct leaf *trie_nextleaf(struct
static int fn_trie_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
- struct leaf *ll = NULL, *l = NULL;
+ struct leaf *l, *ll = NULL;
int found = 0;
for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
found += trie_flush_leaf(t, l);
if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ trie_leaf_remove(t, ll);
ll = l;
}
if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ trie_leaf_remove(t, ll);
pr_debug("trie_flush found=%d\n", found);
return found;
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* [IPV4 9/9] fib_trie: avoid rescan on dump
From: Stephen Hemminger @ 2008-01-22 23:37 UTC
To: David Miller; +Cc: netdev
[-- Attachment #1: trie-dump-desquare.patch --]
[-- Type: text/plain, Size: 1877 bytes --]
This converts dumping (and flushing) of large route tables
from O(N^2) to O(N). If the route dump takes multiple pages, the
dump routine gets called again; the old code kept track of its
location with a counter, while the new code resumes from the last
dumped key. This is a really big win (0.3 sec vs 12 sec) for big
route tables.

One side effect is that if the table changes during the dump,
the last key will not be found and we will return -EBUSY.
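The mechanism relies on the netlink callback's cb->args[] persisting
between invocations of the dump routine; a condensed sketch of the
resume logic, mirroring the patch below:

t_key key = cb->args[2];	/* 0 on the first call */
struct leaf *l;

l = key ? fib_find_node(t, key) : trie_firstleaf(t);
if (key && !l)
	return -EBUSY;		/* table changed between calls */

while (l) {
	cb->args[2] = l->key;	/* resume point for the next call */
	if (fn_trie_dump_leaf(l, tb, skb, cb) < 0)
		return -1;	/* skb full; dump continues later */
	l = trie_nextleaf(l);
}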
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 15:25:32.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 15:26:41.000000000 -0800
@@ -1914,35 +1914,43 @@ static int fn_trie_dump_leaf(struct leaf
return skb->len;
}
-
-
static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
struct leaf *l;
struct trie *t = (struct trie *) tb->tb_data;
- int h = 0;
- int s_h = cb->args[2];
+ t_key key = cb->args[2];
rcu_read_lock();
- for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
- if (h < s_h)
- continue;
-
- if (h > s_h) {
- cb->args[3] = 0;
- cb->args[4] = 0;
+ /* Dump starting at last key.
+ * Note: 0.0.0.0/0 (ie default) is first key.
+ */
+ if (!key)
+ l = trie_firstleaf(t);
+ else {
+ l = fib_find_node(t, key);
+ if (!l) {
+ /* The table changed during the dump, rather than
+ * giving partial data, just make application retry.
+ */
+ rcu_read_unlock();
+ return -EBUSY;
}
+ }
+ while (l) {
+ cb->args[2] = l->key;
if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
rcu_read_unlock();
- cb->args[2] = h;
return -1;
}
+
+ l = trie_nextleaf(l);
+ memset(&cb->args[3], 0,
+ sizeof(cb->args) - 3*sizeof(cb->args[0]));
}
rcu_read_unlock();
- cb->args[2] = h;
return skb->len;
}
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* Re: [IPV4 0/9] TRIE performance patches
From: David Miller @ 2008-01-23 5:58 UTC
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 22 Jan 2008 15:37:33 -0800
>
> Time to handle a full BGP load (163K of routes).
>
> Before: Load Dump Flush
> hash 3.5 0.5 0.7
> 2.6.23.14 3.4 19.3 10.3
> net-2.6.25 3.4 18.7 9.8
>
> After:
> kmem_cache 3.8 13.0 7.2
> iter 3.9 12.3 6.9
> unordered 3.1 11.9 4.9
> find_node 3.1 0.3 1.2
>
> Load: ip -batch iproute-table
> Dump: ip route >/dev/null
> Flush: ip route flush table main
Nice work Stephen, all 9 patches applied.
We'll need to think about that new -EBUSY behavior; it might
confuse routing daemons and similar, and therefore we might need
to find a way to fudge continuing the dump instead of failing.
* [IPV4 0/9] TRIE performance patches
From: Robert Olsson @ 2008-01-23 14:06 UTC
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger writes:
> Time to handle a full BGP load (163K of routes).
>
> Before: Load Dump Flush
>
> kmem_cache 3.8 13.0 7.2
> iter 3.9 12.3 6.9
> unordered 3.1 11.9 4.9
> find_node 3.1 0.3 1.2
I certainly like the speed, but what will we break when
we don't return in longest-prefix order?
labb:/# ip r
default via 10.10.10.1 dev eth0
5.0.0.0/8 via 192.168.2.2 dev eth3
10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
10.10.11.0/24 dev eth1 proto kernel scope link src 10.10.11.1
11.0.0.0/8 via 10.10.11.2 dev eth1
192.168.1.0/24 dev eth2 proto kernel scope link src 192.168.1.2
192.168.2.0/24 dev eth3 proto kernel scope link src 192.168.2.1
labb:/# ip route list match 10.10.10.1
default via 10.10.10.1 dev eth0
10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
labb:/#
Maybe the unordered dump can be ordered cheaply...
Cheers.
--ro
* Re: [IPV4 0/9] TRIE performance patches
From: Stephen Hemminger @ 2008-01-23 16:31 UTC
To: Robert Olsson; +Cc: David Miller, netdev
Robert Olsson wrote:
> Stephen Hemminger writes:
>
> > Time to handle a full BGP load (163K of routes).
> >
> > Before: Load Dump Flush
> >
> > kmem_cache 3.8 13.0 7.2
> > iter 3.9 12.3 6.9
> > unordered 3.1 11.9 4.9
> > find_node 3.1 0.3 1.2
>
> I certainly like the speed but what will we brake when
> we don't return in longest prefix order?
>
> labb:/# ip r
> default via 10.10.10.1 dev eth0
> 5.0.0.0/8 via 192.168.2.2 dev eth3
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> 10.10.11.0/24 dev eth1 proto kernel scope link src 10.10.11.1
> 11.0.0.0/8 via 10.10.11.2 dev eth1
> 192.168.1.0/24 dev eth2 proto kernel scope link src 192.168.1.2
> 192.168.2.0/24 dev eth3 proto kernel scope link src 192.168.2.1
>
> labb:/# ip route list match 10.10.10.1
> default via 10.10.10.1 dev eth0
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> labb:/#
>
> Maybe the unordered dump can be ordered cheaply...
>
> Cheers.
> --ro
>
>
Hash returned the routes in prefix order (then random). Returning the
routes in numerical order seems just as logical. I'm going to test on
quagga.
* Re: [IPV4 0/9] TRIE performance patches
From: Stephen Hemminger @ 2008-01-23 23:49 UTC
To: Robert Olsson; +Cc: David Miller, netdev
On Wed, 23 Jan 2008 15:06:47 +0100
Robert Olsson <Robert.Olsson@data.slu.se> wrote:
>
> Stephen Hemminger writes:
>
> > Time to handle a full BGP load (163K of routes).
> >
> > Before: Load Dump Flush
> >
> > kmem_cache 3.8 13.0 7.2
> > iter 3.9 12.3 6.9
> > unordered 3.1 11.9 4.9
> > find_node 3.1 0.3 1.2
>
> I certainly like the speed but what will we brake when
> we don't return in longest prefix order?
>
> labb:/# ip r
> default via 10.10.10.1 dev eth0
> 5.0.0.0/8 via 192.168.2.2 dev eth3
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> 10.10.11.0/24 dev eth1 proto kernel scope link src 10.10.11.1
> 11.0.0.0/8 via 10.10.11.2 dev eth1
> 192.168.1.0/24 dev eth2 proto kernel scope link src 192.168.1.2
> 192.168.2.0/24 dev eth3 proto kernel scope link src 192.168.2.1
>
> labb:/# ip route list match 10.10.10.1
> default via 10.10.10.1 dev eth0
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> labb:/#
>
> Maybe the unordered dump can be ordered cheaply...
Dumping by prefix is possible, but something like 32x slower. Dumping
in address order is just as logical. Like I said, I'm investigating
what quagga handles.
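(A sketch of why roughly 32x: a prefix-ordered dump means one full
traversal per prefix length instead of a single walk;
dump_prefixes_of_length() is a placeholder.)

int plen;
struct leaf *l;

/* hypothetical prefix-ordered dump: 33 full walks instead of one */
for (plen = 32; plen >= 0; plen--)
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l))
		dump_prefixes_of_length(l, plen);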
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* Re: [IPV4 0/9] TRIE performance patches
From: Robert Olsson @ 2008-01-24 9:36 UTC
To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, netdev
Stephen Hemminger writes:
> Dumping by prefix is possible, but unless 32x slower. Dumping in
> address order is just as logical. Like I said, I'm investigating what
> quagga handles.
How about taking a snapshot in address order (as you do) into some
allocated memory, then returning from that memory in prefix order? This
would solve the -EBUSY case too and give a consistent view of the
routing table at the time of the dump/snapshot.
Cheers
--ro
* Re: [IPV4 0/9] TRIE performance patches
From: Stephen Hemminger @ 2008-01-24 16:18 UTC
To: Robert Olsson; +Cc: Robert Olsson, David Miller, netdev
On Thu, 24 Jan 2008 10:36:45 +0100
Robert Olsson <Robert.Olsson@data.slu.se> wrote:
>
> Stephen Hemminger writes:
>
> > Dumping by prefix is possible, but unless 32x slower. Dumping in
> > address order is just as logical. Like I said, I'm investigating what
> > quagga handles.
>
> How about taking a snapshot to in address order (as you did) to some
> allocated memory, returning from that memory in prefix order? This would
> solve the -EBUSY too and give a consistent view of the routing table at
> the time for the dump/snapshot.
>
> Cheers
> --ro
Snapshotting is not going to work, because of scale and because the
kernel can't tell when the application is going to come back.
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
* Re: [IPV4 0/9] TRIE performance patches
From: Robert Olsson @ 2008-02-01 18:00 UTC
To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, netdev
Hello, finally got some time to test...
Table w. 214k routes with full rDoS on two interfaces on 2 x AMD64
processors, speed 2814.43 MHz. Profiled with CPU_CLK_UNHALTED and rtstat.

Without the latest fib_trie patches. Tput ~233 kpps:
samples  %        symbol name
109925   14.4513  fn_trie_lookup
109821   14.4376  ip_route_input
 87245   11.4696  rt_intern_hash
 31270    4.1109  kmem_cache_alloc
 24159    3.1761  dev_queue_xmit
 23200    3.0500  neigh_lookup
 22464    2.9532  free_block
 18412    2.4205  kmem_cache_free
 17830    2.3440  dst_destroy
 15740    2.0693  fib_get_table
With the latest fib_trie patches (Stephen's and others): same
throughput, ~233 kpps, but we see a different profile. Why we don't
get any better throughput is yet to be understood (the drops in the
qdisc could be the cause); more analysis is needed.
samples  %        symbol name
 79242   14.3520  ip_route_input
 65188   11.8066  fn_trie_lookup
 64559   11.6927  rt_intern_hash
 22901    4.1477  kmem_cache_alloc
 21038    3.8103  check_leaf
 16197    2.9335  neigh_lookup
 14802    2.6809  free_block
 14596    2.6436  ip_rcv_finish
 12365    2.2395  fib_validate_source
 12048    2.1821  dst_destroy
fib_hash throughput ~177 kpps. Hard work for fib_hash here, as we have
many zones; it can be faster with fewer zones.
samples  %        symbol name
200568   37.8013  fn_hash_lookup
 58352   10.9977  ip_route_input
 44495    8.3860  rt_intern_hash
 12873    2.4262  kmem_cache_alloc
 12115    2.2833  rt_may_expire
 11691    2.2034  rt_garbage_collect
 10821    2.0394  dev_queue_xmit
  9999    1.8845  fib_validate_source
  8762    1.6514  fib_get_table
  8558    1.6129  fib_semantic_match
Cheers
--ro