* [IPV4 0/9] TRIE performance patches
@ 2008-01-22 23:37 Stephen Hemminger
2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger
` (10 more replies)
0 siblings, 11 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Time to handle a full BGP load (163K routes):
             Load   Dump   Flush
Before:
  hash        3.5    0.5    0.7
  2.6.23.14   3.4   19.3   10.3
  net-2.6.25  3.4   18.7    9.8

After:
  kmem_cache  3.8   13.0    7.2
  iter        3.9   12.3    6.9
  unordered   3.1   11.9    4.9
  find_node   3.1    0.3    1.2

Load:  ip -batch iproute-table
Dump:  ip route >/dev/null
Flush: ip route flush table main
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply	[flat|nested] 17+ messages in thread

* [IPV4 1/9] fib_trie: put leaf nodes in a slab cache
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 2/9] fib_trie: style cleanup Stephen Hemminger
  ` (9 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Stephen Hemminger

[-- Attachment #1: leaf-kmem-cache.patch --]
[-- Type: text/plain, Size: 1669 bytes --]

This improves locality for operations that touch all the leaves. It also
saves space, since these entries don't need to be hardware cache aligned.

Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-21 10:16:10.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 10:18:42.000000000 -0800
@@ -162,6 +162,7 @@ static struct tnode *halve(struct trie *
 static void tnode_free(struct tnode *tn);

 static struct kmem_cache *fn_alias_kmem __read_mostly;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;

 static inline struct tnode *node_parent(struct node *node)
 {
@@ -325,7 +326,8 @@ static inline void alias_free_mem_rcu(st

 static void __leaf_free_rcu(struct rcu_head *head)
 {
-	kfree(container_of(head, struct leaf, rcu));
+	struct leaf *l = container_of(head, struct leaf, rcu);
+	kmem_cache_free(trie_leaf_kmem, l);
 }

 static void __leaf_info_free_rcu(struct rcu_head *head)
@@ -375,7 +377,7 @@ static inline void tnode_free(struct tno

 static struct leaf *leaf_new(void)
 {
-	struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 	if (l) {
 		l->parent = T_LEAF;
 		INIT_HLIST_HEAD(&l->list);
@@ -1935,7 +1937,12 @@ out:
 void __init fib_hash_init(void)
 {
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
-					  0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+					  0, SLAB_PANIC, NULL);
+
+	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+					   max(sizeof(struct leaf),
+					       sizeof(struct leaf_info)),
+					   0, SLAB_PANIC, NULL);
 }

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
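The allocation pattern behind this patch, reduced to a minimal kernel-style
sketch. The struct leaf layout here is a simplified stand-in for the one in
the patch, and the cache name, helper names, and init hook are illustrative
only; error handling is elided:

#include <linux/init.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct leaf {
	unsigned long parent;
	struct hlist_head list;		/* per-prefix-length entries */
	struct rcu_head rcu;
};

static struct kmem_cache *leaf_kmem __read_mostly;

/* RCU callback: all readers are done, return the object to its cache */
static void leaf_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(leaf_kmem, container_of(head, struct leaf, rcu));
}

static struct leaf *leaf_alloc(void)
{
	/* One dedicated cache for all leaves: objects are packed together
	 * (better locality for walks that touch every leaf) and are not
	 * padded out to a cacheline boundary (less memory per entry). */
	return kmem_cache_alloc(leaf_kmem, GFP_KERNEL);
}

static void leaf_free(struct leaf *l)
{
	/* Defer the real free until current RCU readers have left */
	call_rcu(&l->rcu, leaf_free_rcu);
}

static void __init leaf_cache_init(void)
{
	leaf_kmem = kmem_cache_create("example_leaf", sizeof(struct leaf),
				      0, SLAB_PANIC, NULL);
}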
* [IPV4 2/9] fib_trie: style cleanup 2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger 2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger @ 2008-01-22 23:37 ` Stephen Hemminger 2008-01-22 23:37 ` [IPV4 3/9] fib_trie: compute size when needed Stephen Hemminger ` (8 subsequent siblings) 10 siblings, 0 replies; 17+ messages in thread From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw) To: David Miller; +Cc: netdev, Stephen Hemminger [-- Attachment #1: fib-trie-style.patch --] [-- Type: text/plain, Size: 18775 bytes --] Style cleanups: * make check_leaf return -1 or plen, rather than by reference * Get rid of #ifdef that is always set * split out embedded function calls in if statements. * checkpatch warnings Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com> --- a/net/ipv4/fib_trie.c 2008-01-21 10:18:42.000000000 -0800 +++ b/net/ipv4/fib_trie.c 2008-01-21 10:19:00.000000000 -0800 @@ -155,7 +155,8 @@ struct trie { }; static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, + int wasfull); static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); @@ -395,7 +396,7 @@ static struct leaf_info *leaf_info_new(i return li; } -static struct tnode* tnode_new(t_key key, int pos, int bits) +static struct tnode *tnode_new(t_key key, int pos, int bits) { size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); struct tnode *tn = tnode_alloc(sz); @@ -427,7 +428,8 @@ static inline int tnode_full(const struc return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) +static inline void put_child(struct trie *t, struct tnode *tn, int i, + struct node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -437,7 +439,8 @@ static inline void put_child(struct trie * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, + int wasfull) { struct node *chi = tn->child[i]; int isfull; @@ -577,11 +580,13 @@ static struct node *resize(struct trie * err = 0; max_resize = 10; while ((tn->full_children > 0 && max_resize-- && - 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= - inflate_threshold_use * tnode_child_length(tn))) { + 50 * (tn->full_children + tnode_child_length(tn) + - tn->empty_children) + >= inflate_threshold_use * tnode_child_length(tn))) { old_tn = tn; tn = inflate(t, tn); + if (IS_ERR(tn)) { tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -593,11 +598,13 @@ static struct node *resize(struct trie * if (max_resize < 0) { if (!tn->parent) - printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", - inflate_threshold_root, tn->bits); + pr_warning("Fix inflate_threshold_root." + " Now=%d size=%d bits\n", + inflate_threshold_root, tn->bits); else - printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", - inflate_threshold, tn->bits); + pr_warning("Fix inflate_threshold." 
+ " Now=%d size=%d bits\n", + inflate_threshold, tn->bits); } check_tnode(tn); @@ -634,11 +641,13 @@ static struct node *resize(struct trie * if (max_resize < 0) { if (!tn->parent) - printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", - halve_threshold_root, tn->bits); + pr_warning("Fix halve_threshold_root." + " Now=%d size=%d bits\n", + halve_threshold_root, tn->bits); else - printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n", - halve_threshold, tn->bits); + pr_warning("Fix halve_threshold." + " Now=%d size=%d bits\n", + halve_threshold, tn->bits); } /* Only one child remains */ @@ -681,8 +690,9 @@ static struct tnode *inflate(struct trie */ for (i = 0; i < olen; i++) { - struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + struct tnode *inode; + inode = (struct tnode *) tnode_get_child(oldtnode, i); if (inode && IS_TNODE(inode) && inode->pos == oldtnode->pos + oldtnode->bits && @@ -722,8 +732,9 @@ static struct tnode *inflate(struct trie if (IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, - 1) == 0) + if (tkey_extract_bits(node->key, + oldtnode->pos + oldtnode->bits, + 1) == 0) put_child(t, tn, 2*i, node); else put_child(t, tn, 2*i+1, node); @@ -899,7 +910,7 @@ static struct leaf_info *find_leaf_info( return NULL; } -static inline struct list_head * get_fa_head(struct leaf *l, int plen) +static inline struct list_head *get_fa_head(struct leaf *l, int plen) { struct leaf_info *li = find_leaf_info(l, plen); @@ -949,7 +960,10 @@ fib_find_node(struct trie *t, u32 key) if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { pos = tn->pos + tn->bits; - n = tnode_get_child_rcu(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child_rcu(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); } else break; } @@ -970,8 +984,10 @@ static struct node *trie_rebalance(struc while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = (struct tnode *) resize (t, (struct tnode *)tn); - tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); + tn = (struct tnode *) resize(t, (struct tnode *)tn); + + tnode_put_child_reorg((struct tnode *)tp, cindex, + (struct node *)tn, wasfull); tp = node_parent((struct node *) tn); if (!tp) @@ -981,9 +997,9 @@ static struct node *trie_rebalance(struc /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = (struct tnode*) resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, (struct tnode *)tn); - return (struct node*) tn; + return (struct node *)tn; } /* only used from updater-side */ @@ -1028,7 +1044,10 @@ static struct list_head *fib_insert_node if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; pos = tn->pos + tn->bits; - n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); BUG_ON(n && node_parent(n) != tn); } else @@ -1113,16 +1132,18 @@ static struct list_head *fib_insert_node if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); + put_child(t, (struct tnode *)tp, cindex, + (struct node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ + rcu_assign_pointer(t->trie, (struct node *)tn); tp = tn; } } if (tp && tp->pos + tp->bits > 32) - printk(KERN_WARNING 
"fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", - tp, tp->pos, tp->bits, key, plen); + pr_warning("fib_trie" + " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", + tp, tp->pos, tp->bits, key, plen); /* Rebalance the trie */ @@ -1232,10 +1253,10 @@ static int fn_trie_insert(struct fib_tab break; if (fa->fa_type == cfg->fc_type && fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { + fa->fa_info == fi) goto out; - } } + if (!(cfg->fc_nlflags & NLM_F_APPEND)) fa = fa_orig; } @@ -1286,38 +1307,40 @@ err: /* should be called with rcu_read_lock */ -static inline int check_leaf(struct trie *t, struct leaf *l, - t_key key, int *plen, const struct flowi *flp, - struct fib_result *res) +static int check_leaf(struct trie *t, struct leaf *l, + t_key key, const struct flowi *flp, + struct fib_result *res) { - int err, i; - __be32 mask; struct leaf_info *li; struct hlist_head *hhead = &l->list; struct hlist_node *node; hlist_for_each_entry_rcu(li, node, hhead, hlist) { - i = li->plen; - mask = inet_make_mask(i); + int err; + int plen = li->plen; + __be32 mask = inet_make_mask(plen); + if (l->key != (key & ntohl(mask))) continue; - if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) { - *plen = i; + err = fib_semantic_match(&li->falh, flp, res, + htonl(l->key), mask, plen); + #ifdef CONFIG_IP_FIB_TRIE_STATS + if (err <= 0) t->stats.semantic_match_passed++; + else + t->stats.semantic_match_miss++; #endif - return err; - } -#ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_miss++; -#endif + if (err <= 0) + return plen; } - return 1; + + return -1; } -static int -fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int plen, ret = 0; @@ -1344,10 +1367,13 @@ fn_trie_lookup(struct fib_table *tb, con /* Just a leaf? */ if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - goto failed; + plen = check_leaf(t, (struct leaf *)n, key, flp, res); + if (plen < 0) + goto failed; + ret = 0; + goto found; } + pn = (struct tnode *) n; chopped_off = 0; @@ -1369,14 +1395,14 @@ fn_trie_lookup(struct fib_table *tb, con } if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - else + plen = check_leaf(t, (struct leaf *)n, key, flp, res); + if (plen < 0) goto backtrace; + + ret = 0; + goto found; } -#define HL_OPTIMIZE -#ifdef HL_OPTIMIZE cn = (struct tnode *)n; /* @@ -1405,12 +1431,13 @@ fn_trie_lookup(struct fib_table *tb, con * *are* zero. */ - /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ + /* NOTA BENE: Checking only skipped bits + for the new node here */ if (current_prefix_length < pos+bits) { if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) != 0 || - !(cn->child[0])) + cn->pos - current_prefix_length) + || !(cn->child[0])) goto backtrace; } @@ -1433,14 +1460,17 @@ fn_trie_lookup(struct fib_table *tb, con * new tnode's key. */ - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. 
Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are - * to find a matching prefix. + /* + * Note: We aren't very concerned about the piece of + * the key that precede pn->pos+pn->bits, since these + * have already been checked. The bits after cn->pos + * aren't checked since these are by definition + * "unknown" at this point. Thus, what we want to see + * is if we are about to enter the "prefix matching" + * state, and in that case verify that the skipped + * bits that will prevail throughout this subtree are + * zero, as they have to be if we are to find a + * matching prefix. */ node_prefix = mask_pfx(cn->key, cn->pos); @@ -1448,13 +1478,15 @@ fn_trie_lookup(struct fib_table *tb, con pref_mismatch = key_prefix^node_prefix; mp = 0; - /* In short: If skipped bits in this node do not match the search - * key, enter the "prefix matching" state.directly. + /* + * In short: If skipped bits in this node do not match + * the search key, enter the "prefix matching" + * state.directly. */ if (pref_mismatch) { while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { mp++; - pref_mismatch = pref_mismatch <<1; + pref_mismatch = pref_mismatch << 1; } key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); @@ -1464,7 +1496,7 @@ fn_trie_lookup(struct fib_table *tb, con if (current_prefix_length >= cn->pos) current_prefix_length = mp; } -#endif + pn = (struct tnode *)n; /* Descend */ chopped_off = 0; continue; @@ -1473,12 +1505,14 @@ backtrace: chopped_off++; /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) + while ((chopped_off <= pn->bits) + && !(cindex & (1<<(chopped_off-1)))) chopped_off++; /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) - current_prefix_length = pn->pos + pn->bits - chopped_off; + current_prefix_length = pn->pos + pn->bits + - chopped_off; /* * Either we do the actual chop off according or if we have @@ -1528,7 +1562,8 @@ static int trie_leaf_remove(struct trie while (n != NULL && IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; check_tnode(tn); - n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child(tn, tkey_extract_bits(key, + tn->pos, tn->bits)); BUG_ON(n && node_parent(n) != tn); } @@ -1694,7 +1729,7 @@ static struct leaf *nextleaf(struct trie if (IS_LEAF(trie)) /* trie w. 
just a leaf */ return (struct leaf *) trie; - p = (struct tnode*) trie; /* Start */ + p = (struct tnode *)trie; /* Start */ } else p = node_parent_rcu(c); @@ -1762,8 +1797,9 @@ static int fn_trie_flush(struct fib_tabl return found; } -static void -fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +static void fn_trie_select_default(struct fib_table *tb, + const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int order, last_idx; @@ -1834,7 +1870,8 @@ out: rcu_read_unlock(); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, +static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, + struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { int i, s_i; @@ -1873,8 +1910,8 @@ static int fn_trie_dump_fa(t_key key, in return skb->len; } -static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) +static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) { int h, s_h; struct list_head *fa_head; @@ -1897,7 +1934,7 @@ static int fn_trie_dump_plen(struct trie if (list_empty(fa_head)) continue; - if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { + if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) { cb->args[3] = h; return -1; } @@ -1906,7 +1943,8 @@ static int fn_trie_dump_plen(struct trie return skb->len; } -static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { int m, s_m; struct trie *t = (struct trie *) tb->tb_data; @@ -1921,7 +1959,7 @@ static int fn_trie_dump(struct fib_table memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); - if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { + if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) { cb->args[2] = m; goto out; } @@ -1936,7 +1974,8 @@ out: void __init fib_hash_init(void) { - fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), 0, SLAB_PANIC, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", @@ -1970,7 +2009,7 @@ struct fib_table *fib_hash_table(u32 id) memset(t, 0, sizeof(*t)); if (id == RT_TABLE_LOCAL) - printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION); + pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); return tb; } @@ -2104,7 +2143,8 @@ static void trie_show_stats(struct seq_f else avdepth = 0; - seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 ); + seq_printf(seq, "\tAver depth: %u.%02d\n", + avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); @@ -2136,16 +2176,20 @@ static void trie_show_usage(struct seq_f const struct trie_use_stats *stats) { seq_printf(seq, "\nCounters:\n---------\n"); - seq_printf(seq,"gets = %u\n", stats->gets); - seq_printf(seq,"backtracks = %u\n", stats->backtrack); - seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed); - seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss); - seq_printf(seq,"null node hit= %u\n", stats->null_node_hit); - seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped); + seq_printf(seq, "gets = %u\n", stats->gets); + 
seq_printf(seq, "backtracks = %u\n", stats->backtrack); + seq_printf(seq, "semantic match passed = %u\n", + stats->semantic_match_passed); + seq_printf(seq, "semantic match miss = %u\n", + stats->semantic_match_miss); + seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); + seq_printf(seq, "skipped node resize = %u\n\n", + stats->resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ -static void fib_trie_show(struct seq_file *seq, const char *name, struct trie *trie) +static void fib_trie_show(struct seq_file *seq, const char *name, + struct trie *trie) { struct trie_stat stat; @@ -2163,7 +2207,8 @@ static int fib_triestat_seq_show(struct struct fib_table *tb; seq_printf(seq, - "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", + "Basic info: size of leaf:" + " %Zd bytes, size of tnode: %Zd bytes.\n", sizeof(struct leaf), sizeof(struct tnode)); tb = fib_get_table(net, RT_TABLE_LOCAL); @@ -2436,10 +2481,11 @@ static int fib_route_seq_show(struct seq if (iter->trie == iter->trie_local) return 0; + if (IS_TNODE(l)) return 0; - for (i=32; i>=0; i--) { + for (i = 32; i >= 0; i--) { struct leaf_info *li = find_leaf_info(l, i); struct fib_alias *fa; __be32 mask, prefix; @@ -2466,7 +2512,8 @@ static int fib_route_seq_show(struct seq fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, mask, - (fi->fib_advmss ? fi->fib_advmss + 40 : 0), + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); else -- Stephen Hemminger <stephen.hemminger@vyatta.com> ^ permalink raw reply [flat|nested] 17+ messages in thread
* [IPV4 3/9] fib_trie: compute size when needed
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 1/9] fib_trie: put leaf nodes in a slab cache Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 2/9] fib_trie: style cleanup Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 4/9] fib_trie: use hash list Stephen Hemminger
  ` (7 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: fib-trie-stats-x.patch --]
[-- Type: text/plain, Size: 2412 bytes --]

Compute the number of prefixes when needed, rather than doing bookkeeping.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-21 17:45:03.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 17:45:03.000000000 -0800
@@ -143,12 +143,12 @@ struct trie_stat {
 	unsigned int tnodes;
 	unsigned int leaves;
 	unsigned int nullpointers;
+	unsigned int prefixes;
 	unsigned int nodesizes[MAX_STAT_DEPTH];
 };

 struct trie {
 	struct node *trie;
-	unsigned int size;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats stats;
 #endif
@@ -1289,8 +1289,6 @@ static int fn_trie_insert(struct fib_tab
 	list_add_tail_rcu(&new_fa->fa_list,
 			  (fa ? &fa->fa_list : fa_head));

-	t->size++;
-
 	rt_cache_flush(-1);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
@@ -1576,9 +1574,6 @@ static int trie_leaf_remove(struct trie
 	 * Key found.
 	 * Remove the leaf and rebalance the tree
 	 */
-
-	t->size--;
-
 	tp = node_parent(n);
 	tnode_free((struct tnode *) n);

@@ -2111,10 +2106,17 @@ static void trie_collect_stats(struct tr
 	for (n = fib_trie_get_first(&iter, t); n;
 	     n = fib_trie_get_next(&iter)) {
 		if (IS_LEAF(n)) {
+			struct leaf *l = (struct leaf *)n;
+			struct leaf_info *li;
+			struct hlist_node *tmp;
+
 			s->leaves++;
 			s->totdepth += iter.depth;
 			if (iter.depth > s->maxdepth)
 				s->maxdepth = iter.depth;
+
+			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+				++s->prefixes;
 		} else {
 			const struct tnode *tn = (const struct tnode *) n;
 			int i;
@@ -2148,8 +2150,11 @@ static void trie_show_stats(struct seq_f
 	seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);

 	seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
-
 	bytes = sizeof(struct leaf) * stat->leaves;
+
+	seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+	bytes += sizeof(struct leaf_info) * stat->prefixes;
+
 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
 	bytes += sizeof(struct tnode) * stat->tnodes;

@@ -2193,8 +2198,8 @@ static void fib_trie_show(struct seq_fil
 {
 	struct trie_stat stat;

-	seq_printf(seq, "%s: %d\n", name, trie->size);
 	trie_collect_stats(trie, &stat);
+	seq_printf(seq, "%s:\n", name);
 	trie_show_stats(seq, &stat);
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	trie_show_usage(seq, &trie->stats);

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
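The trade made here: instead of a size field that must be updated on every
insert and delete, the counts are recomputed by walking the structure when
the statistics are read. A standalone sketch of the counting side, using the
leaf layout from the patch; the function name is illustrative, and the
caller is assumed to hold rcu_read_lock():

/* Count the per-prefix-length entries hanging off one leaf. The stats
 * path is rare, so paying O(n) here beats bookkeeping on the hot
 * insert/delete paths. */
static unsigned int leaf_prefix_count(struct leaf *l)
{
	struct leaf_info *li;
	struct hlist_node *tmp;
	unsigned int count = 0;

	hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
		count++;

	return count;
}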
* [IPV4 4/9] fib_trie: use hash list 2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger ` (2 preceding siblings ...) 2008-01-22 23:37 ` [IPV4 3/9] fib_trie: compute size when needed Stephen Hemminger @ 2008-01-22 23:37 ` Stephen Hemminger 2008-01-22 23:37 ` [IPV4 5/9] fib_trie: dump message multiple part flag Stephen Hemminger ` (6 subsequent siblings) 10 siblings, 0 replies; 17+ messages in thread From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw) To: David Miller; +Cc: netdev, Stephen Hemminger [-- Attachment #1: fib-use-the-list.patch --] [-- Type: text/plain, Size: 2587 bytes --] The code to dump can use the existing hash chain rather than doing repeated lookup. Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com> --- a/net/ipv4/fib_trie.c 2008-01-21 17:45:03.000000000 -0800 +++ b/net/ipv4/fib_trie.c 2008-01-21 17:45:08.000000000 -0800 @@ -2396,31 +2396,30 @@ static int fib_trie_seq_show(struct seq_ } else { struct leaf *l = (struct leaf *) n; - int i; + struct leaf_info *li; + struct hlist_node *node; + __be32 val = htonl(l->key); seq_indent(seq, iter->depth); seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val)); - for (i = 32; i >= 0; i--) { - struct leaf_info *li = find_leaf_info(l, i); - if (li) { - struct fib_alias *fa; + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + struct fib_alias *fa; - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - char buf1[32], buf2[32]; + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + char buf1[32], buf2[32]; - seq_indent(seq, iter->depth+1); - seq_printf(seq, " /%d %s %s", i, - rtn_scope(buf1, sizeof(buf1), - fa->fa_scope), - rtn_type(buf2, sizeof(buf2), - fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, "tos =%d\n", - fa->fa_tos); - seq_putc(seq, '\n'); - } + seq_indent(seq, iter->depth+1); + seq_printf(seq, " /%d %s %s", li->plen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, "tos =%d\n", + fa->fa_tos); + seq_putc(seq, '\n'); } } } @@ -2474,8 +2473,8 @@ static int fib_route_seq_show(struct seq { const struct fib_trie_iter *iter = seq->private; struct leaf *l = v; - int i; - char bf[128]; + struct leaf_info *li; + struct hlist_node *node; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2490,8 +2489,7 @@ static int fib_route_seq_show(struct seq if (IS_TNODE(l)) return 0; - for (i = 32; i >= 0; i--) { - struct leaf_info *li = find_leaf_info(l, i); + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { struct fib_alias *fa; __be32 mask, prefix; @@ -2504,6 +2502,7 @@ static int fib_route_seq_show(struct seq list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); + char bf[128]; if (fa->fa_type == RTN_BROADCAST || fa->fa_type == RTN_MULTICAST) -- Stephen Hemminger <stephen.hemminger@vyatta.com> ^ permalink raw reply [flat|nested] 17+ messages in thread
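The change is easiest to see side by side. Before, each leaf was probed once
for every possible prefix length; after, the hash chain already hanging off
the leaf is walked directly. In this sketch, emit() stands in for the
seq_printf() calls of the real code:

/* Before: 33 probes per leaf, most of them misses */
for (i = 32; i >= 0; i--) {
	struct leaf_info *li = find_leaf_info(l, i);

	if (li)
		emit(li);
}

/* After: one pass over only the entries that actually exist */
hlist_for_each_entry_rcu(li, node, &l->list, hlist)
	emit(li);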
* [IPV4 5/9] fib_trie: dump message multiple part flag
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  ` (3 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 4/9] fib_trie: use hash list Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 6/9] fib_trie: iterator recode Stephen Hemminger
  ` (5 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: fib-multi.patch --]
[-- Type: text/plain, Size: 528 bytes --]

Match fib_hash, and set NLM_F_MULTI to handle multiple-part messages.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-21 17:52:10.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-21 17:52:11.000000000 -0800
@@ -1895,7 +1895,7 @@ static int fn_trie_dump_fa(t_key key, in
 				  xkey,
 				  plen,
 				  fa->fa_tos,
-				  fa->fa_info, 0) < 0) {
+				  fa->fa_info, NLM_F_MULTI) < 0) {
 			cb->args[4] = i;
 			return -1;
 		}

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
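For reference, NLM_F_MULTI marks every message belonging to a multi-part
netlink dump; the kernel then terminates the dump with a separate NLMSG_DONE
message, and userspace keeps reading until it sees it. Schematically (this
is not the exact call site in this file):

/* in a dump callback: each route message carries NLM_F_MULTI */
nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(struct rtmsg),
		NLM_F_MULTI);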
* [IPV4 6/9] fib_trie: iterator recode
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  ` (4 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 5/9] fib_trie: dump message multiple part flag Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 7/9] fib_trie: dump table in sorted order Stephen Hemminger
  ` (4 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Remove the complex loop structure of nextleaf() and replace it with a
simpler tree walker. This improves performance and is much cleaner.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-22 09:52:46.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 12:58:59.000000000 -0800
@@ -1708,64 +1708,65 @@ static int trie_flush_leaf(struct trie *
 	return found;
 }

-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+/*
+ * Scan for the next right leaf starting at node p->child[idx]
+ * Since we have back pointer, no recursion necessary.
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
 {
-	struct node *c = (struct node *) thisleaf;
-	struct tnode *p;
-	int idx;
-	struct node *trie = rcu_dereference(t->trie);
-
-	if (c == NULL) {
-		if (trie == NULL)
-			return NULL;
-
-		if (IS_LEAF(trie)) /* trie w. just a leaf */
-			return (struct leaf *) trie;
-
-		p = (struct tnode *)trie; /* Start */
-	} else
-		p = node_parent_rcu(c);
+	do {
+		t_key idx;

-	while (p) {
-		int pos, last;
-
-		/* Find the next child of the parent */
 		if (c)
-			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
 		else
-			pos = 0;
-
-		last = 1 << p->bits;
-		for (idx = pos; idx < last ; idx++) {
-			c = rcu_dereference(p->child[idx]);
+			idx = 0;

+		while (idx < 1u << p->bits) {
+			c = tnode_get_child_rcu(p, idx++);
 			if (!c)
 				continue;

-			/* Decend if tnode */
-			while (IS_TNODE(c)) {
-				p = (struct tnode *) c;
-				idx = 0;
-
-				/* Rightmost non-NULL branch */
-				if (p && IS_TNODE(p))
-					while (!(c = rcu_dereference(p->child[idx]))
-					       && idx < (1<<p->bits)) idx++;
-
-				/* Done with this tnode? */
-				if (idx >= (1 << p->bits) || !c)
-					goto up;
+			if (IS_LEAF(c)) {
+				prefetch(p->child[idx]);
+				return (struct leaf *) c;
 			}
-			return (struct leaf *) c;
+
+			/* Rescan start scanning in new node */
+			p = (struct tnode *) c;
+			idx = 0;
 		}
-up:
-		/* No more children go up one step */
+
+		/* Node empty, walk back up to parent */
 		c = (struct node *) p;
-		p = node_parent_rcu(c);
-	}
-	return NULL; /* Ready. Root of trie */
+	} while ( (p = node_parent_rcu(c)) != NULL);
+
+	return NULL; /* Root of trie */
+}
+
+
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+
+	if (!n)
+		return NULL;
+
+	if (IS_LEAF(n))		/* trie is just a leaf */
+		return (struct leaf *) n;
+
+	return leaf_walk_rcu(n, NULL);
+}
+
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+	struct node *c = (struct node *) l;
+	struct tnode *p = node_parent(c);
+
+	if (!p)
+		return NULL;	/* trie with just one leaf */
+
+	return leaf_walk_rcu(p, c);
 }

 /*
@@ -1775,9 +1776,9 @@ static int fn_trie_flush(struct fib_tabl
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct leaf *ll = NULL, *l = NULL;
-	int found = 0, h;
+	int found = 0;

-	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
 		found += trie_flush_leaf(t, l);

 		if (ll && hlist_empty(&ll->list))
@@ -1884,7 +1885,6 @@ static int fn_trie_dump_fa(t_key key, in
 			i++;
 			continue;
 		}
-		BUG_ON(!fa->fa_info);

 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
 				  cb->nlh->nlmsg_seq,
@@ -1913,8 +1913,9 @@ static int fn_trie_dump_plen(struct trie
 	struct leaf *l = NULL;

 	s_h = cb->args[3];
+	h = 0;

-	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+	for (l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
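Stripped of RCU and the kernel types, the new walker is the classic
parent-pointer traversal: no recursion and no explicit stack, so it keeps
O(1) state between calls. A generic sketch; this node layout and the
child_index() helper are illustrative, not the fib_trie types:

struct node {
	struct node *parent;
	struct node **child;	/* nchild slots, some may be NULL */
	unsigned int nchild;
	int is_leaf;
};

/* Next leaf to the "right" of child c under parent p; c == NULL means
 * start at p's first slot. Returns NULL when the whole tree is done. */
static struct node *next_leaf(struct node *p, struct node *c)
{
	do {
		unsigned int idx = c ? child_index(p, c) + 1 : 0;

		while (idx < p->nchild) {
			c = p->child[idx++];
			if (!c)
				continue;
			if (c->is_leaf)
				return c;
			p = c;		/* descend and rescan */
			idx = 0;
		}
		c = p;			/* slots exhausted, go up */
	} while ((p = c->parent) != NULL);

	return NULL;
}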
* [IPV4 7/9] fib_trie: dump table in sorted order
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  ` (5 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 6/9] fib_trie: iterator recode Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-22 23:37 ` [IPV4 8/9] fib_trie: avoid extra search on delete Stephen Hemminger
  ` (3 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: fib-trie-dump-unordered.patch --]
[-- Type: text/plain, Size: 2689 bytes --]

With the trie it is easier to dump by traversing the data structure than
by iterating over every possible prefix. This saves some time and makes
the dump come out in sorted order.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-22 12:58:59.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 12:59:03.000000000 -0800
@@ -1905,67 +1905,71 @@ static int fn_trie_dump_fa(t_key key, in
 	return skb->len;
 }

-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+			     struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int h, s_h;
-	struct list_head *fa_head;
-	struct leaf *l = NULL;
+	struct leaf_info *li;
+	struct hlist_node *node;
+	int i, s_i;

-	s_h = cb->args[3];
-	h = 0;
+	s_i = cb->args[3];
+	i = 0;

-	for (l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
-		if (h < s_h)
+	/* rcu_read_lock is hold by caller */
+	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+		if (i < s_i) {
+			i++;
 			continue;
-		if (h > s_h)
-			memset(&cb->args[4], 0,
-			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
-
-		fa_head = get_fa_head(l, plen);
+		}

-		if (!fa_head)
-			continue;
+		if (i > s_i)
+			cb->args[4] = 0;

-		if (list_empty(fa_head))
+		if (list_empty(&li->falh))
 			continue;

-		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
-			cb->args[3] = h;
+		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+			cb->args[3] = i;
 			return -1;
 		}
+		i++;
 	}
-	cb->args[3] = h;
+
+	cb->args[3] = i;
 	return skb->len;
 }
+
+
 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
 			struct netlink_callback *cb)
 {
-	int m, s_m;
+	struct leaf *l;
 	struct trie *t = (struct trie *) tb->tb_data;
-
-	s_m = cb->args[2];
+	int h = 0;
+	int s_h = cb->args[2];

 	rcu_read_lock();
-	for (m = 0; m <= 32; m++) {
-		if (m < s_m)
+	for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
+		if (h < s_h)
 			continue;
-		if (m > s_m)
-			memset(&cb->args[3], 0,
-			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
-
-		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
-			cb->args[2] = m;
-			goto out;
+
+		if (h > s_h) {
+			cb->args[3] = 0;
+			cb->args[4] = 0;
+		}
+
+		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+			rcu_read_unlock();
+			cb->args[2] = h;
+			return -1;
 		}
 	}
 	rcu_read_unlock();
-	cb->args[2] = m;
+
+	cb->args[2] = h;
 	return skb->len;
-out:
-	rcu_read_unlock();
-	return -1;
 }

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
* [IPV4 8/9] fib_trie: avoid extra search on delete 2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger ` (6 preceding siblings ...) 2008-01-22 23:37 ` [IPV4 7/9] fib_trie: dump table in sorted order Stephen Hemminger @ 2008-01-22 23:37 ` Stephen Hemminger 2008-01-22 23:37 ` [IPV4 9/9] fib_trie: avoid rescan on dump Stephen Hemminger ` (2 subsequent siblings) 10 siblings, 0 replies; 17+ messages in thread From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw) To: David Miller; +Cc: netdev [-- Attachment #1: trie-remove-desquare.patch --] [-- Type: text/plain, Size: 2593 bytes --] Get rid of extra search that made route deletion O(n). Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> --- a/net/ipv4/fib_trie.c 2008-01-22 15:24:41.000000000 -0800 +++ b/net/ipv4/fib_trie.c 2008-01-22 15:25:32.000000000 -0800 @@ -1542,49 +1542,23 @@ found: return ret; } -/* only called from updater side */ -static int trie_leaf_remove(struct trie *t, t_key key) +/* + * Remove the leaf and return parent. + */ +static void trie_leaf_remove(struct trie *t, struct leaf *l) { - t_key cindex; - struct tnode *tp = NULL; - struct node *n = t->trie; - struct leaf *l; - - pr_debug("entering trie_leaf_remove(%p)\n", n); - - /* Note that in the case skipped bits, those bits are *not* checked! - * When we finish this, we will have NULL or a T_LEAF, and the - * T_LEAF may or may not match our key. - */ - - while (n != NULL && IS_TNODE(n)) { - struct tnode *tn = (struct tnode *) n; - check_tnode(tn); - n = tnode_get_child(tn, tkey_extract_bits(key, - tn->pos, tn->bits)); - - BUG_ON(n && node_parent(n) != tn); - } - l = (struct leaf *) n; - - if (!n || !tkey_equals(l->key, key)) - return 0; + struct tnode *tp = node_parent((struct node *) l); - /* - * Key found. - * Remove the leaf and rebalance the tree - */ - tp = node_parent(n); - tnode_free((struct tnode *) n); + pr_debug("entering trie_leaf_remove(%p)\n", l); if (tp) { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); + t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); } else rcu_assign_pointer(t->trie, NULL); - return 1; + tnode_free((struct tnode *) l); } /* @@ -1662,7 +1636,7 @@ static int fn_trie_delete(struct fib_tab } if (hlist_empty(&l->list)) - trie_leaf_remove(t, key); + trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); @@ -1775,19 +1749,19 @@ static struct leaf *trie_nextleaf(struct static int fn_trie_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; - struct leaf *ll = NULL, *l = NULL; + struct leaf *l, *ll = NULL; int found = 0; for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { found += trie_flush_leaf(t, l); if (ll && hlist_empty(&ll->list)) - trie_leaf_remove(t, ll->key); + trie_leaf_remove(t, ll); ll = l; } if (ll && hlist_empty(&ll->list)) - trie_leaf_remove(t, ll->key); + trie_leaf_remove(t, ll); pr_debug("trie_flush found=%d\n", found); return found; -- Stephen Hemminger <stephen.hemminger@vyatta.com> ^ permalink raw reply [flat|nested] 17+ messages in thread
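The O(n) cost came from walking the trie again for a leaf the caller was
already holding, so flushing N routes did N such walks. After the patch,
deletion reuses the pointer from the lookup just performed; condensed from
the fn_trie_delete path above:

l = fib_find_node(t, key);	/* the one and only search */

/* ... alias and leaf_info entries are unlinked from l ... */

if (hlist_empty(&l->list))
	trie_leaf_remove(t, l);	/* unlink via node_parent(l);
				   no second walk from the root */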
* [IPV4 9/9] fib_trie: avoid rescan on dump
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  ` (7 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 8/9] fib_trie: avoid extra search on delete Stephen Hemminger
@ 2008-01-22 23:37 ` Stephen Hemminger
  2008-01-23  5:58 ` [IPV4 0/9] TRIE performance patches David Miller
  2008-01-23 14:06 ` Robert Olsson
  10 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-22 23:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: trie-dump-desquare.patch --]
[-- Type: text/plain, Size: 1877 bytes --]

This converts dumping (and flushing) of large route tables from O(N^2)
to O(N). If the route dump takes multiple pages, the dump routine gets
called again. The old code kept track of the location by counter; the
new code instead uses the last key. This is a really big win (0.3 sec
vs 12 sec) for big route tables. One side effect is that if the table
changes during the dump, the last key will not be found, and we will
return -EBUSY.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/fib_trie.c	2008-01-22 15:25:32.000000000 -0800
+++ b/net/ipv4/fib_trie.c	2008-01-22 15:26:41.000000000 -0800
@@ -1914,35 +1914,43 @@ static int fn_trie_dump_leaf(struct leaf
 	return skb->len;
 }

-
-
 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
 			struct netlink_callback *cb)
 {
 	struct leaf *l;
 	struct trie *t = (struct trie *) tb->tb_data;
-	int h = 0;
-	int s_h = cb->args[2];
+	t_key key = cb->args[2];

 	rcu_read_lock();
-	for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
-		if (h < s_h)
-			continue;
-
-		if (h > s_h) {
-			cb->args[3] = 0;
-			cb->args[4] = 0;
+	/* Dump starting at last key.
+	 * Note: 0.0.0.0/0 (ie default) is first key.
+	 */
+	if (!key)
+		l = trie_firstleaf(t);
+	else {
+		l = fib_find_node(t, key);
+		if (!l) {
+			/* The table changed during the dump, rather than
+			 * giving partial data, just make application retry.
+			 */
+			rcu_read_unlock();
+			return -EBUSY;
 		}
+	}
+	while (l) {
+		cb->args[2] = l->key;
 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
 			rcu_read_unlock();
-			cb->args[2] = h;
 			return -1;
 		}
+
+		l = trie_nextleaf(l);
+		memset(&cb->args[3], 0,
+		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
 	rcu_read_unlock();

-	cb->args[2] = h;
 	return skb->len;
 }

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
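The dump now checkpoints by key instead of by ordinal position, so a
continuation restarts with one lookup rather than re-walking every leaf
already sent. The control flow, condensed from the patch:

t_key key = cb->args[2];	/* 0 on the first call */

l = key ? fib_find_node(t, key) : trie_firstleaf(t);
if (key && !l)
	return -EBUSY;		/* table changed; make the caller retry */

while (l) {
	cb->args[2] = l->key;	/* checkpoint before emitting */
	if (fn_trie_dump_leaf(l, tb, skb, cb) < 0)
		return -1;	/* skb full; resume here next time */
	l = trie_nextleaf(l);
}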
* Re: [IPV4 0/9] TRIE performance patches
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  ` (8 preceding siblings ...)
  2008-01-22 23:37 ` [IPV4 9/9] fib_trie: avoid rescan on dump Stephen Hemminger
@ 2008-01-23  5:58 ` David Miller
  2008-01-23 14:06 ` Robert Olsson
  10 siblings, 0 replies; 17+ messages in thread
From: David Miller @ 2008-01-23  5:58 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 22 Jan 2008 15:37:33 -0800

> Time to handle a full BGP load (163K routes):
>
>              Load   Dump   Flush
> Before:
>   hash        3.5    0.5    0.7
>   2.6.23.14   3.4   19.3   10.3
>   net-2.6.25  3.4   18.7    9.8
>
> After:
>   kmem_cache  3.8   13.0    7.2
>   iter        3.9   12.3    6.9
>   unordered   3.1   11.9    4.9
>   find_node   3.1    0.3    1.2
>
> Load:  ip -batch iproute-table
> Dump:  ip route >/dev/null
> Flush: ip route flush table main

Nice work Stephen, all 9 patches applied.

We'll need to think about that new -EBUSY behavior; it might confuse
routing daemons and similar, and therefore we might need to find a way
to fudge continuing the dump instead of failing.

^ permalink raw reply	[flat|nested] 17+ messages in thread
* [IPV4 0/9] TRIE performance patches
  2008-01-22 23:37 [IPV4 0/9] TRIE performance patches Stephen Hemminger
  ` (9 preceding siblings ...)
  2008-01-23  5:58 ` [IPV4 0/9] TRIE performance patches David Miller
@ 2008-01-23 14:06 ` Robert Olsson
  2008-01-23 16:31 ` Stephen Hemminger
  2008-01-23 23:49 ` Stephen Hemminger
  10 siblings, 2 replies; 17+ messages in thread
From: Robert Olsson @ 2008-01-23 14:06 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger writes:

> Time to handle a full BGP load (163K routes):
>
>              Load   Dump   Flush
>   kmem_cache  3.8   13.0    7.2
>   iter        3.9   12.3    6.9
>   unordered   3.1   11.9    4.9
>   find_node   3.1    0.3    1.2

I certainly like the speed, but what will we break when we don't return
routes in longest-prefix order?

labb:/# ip r
default via 10.10.10.1 dev eth0
5.0.0.0/8 via 192.168.2.2 dev eth3
10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
10.10.11.0/24 dev eth1 proto kernel scope link src 10.10.11.1
11.0.0.0/8 via 10.10.11.2 dev eth1
192.168.1.0/24 dev eth2 proto kernel scope link src 192.168.1.2
192.168.2.0/24 dev eth3 proto kernel scope link src 192.168.2.1

labb:/# ip route list match 10.10.10.1
default via 10.10.10.1 dev eth0
10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
labb:/#

Maybe the unordered dump can be ordered cheaply...

Cheers.
					--ro

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 14:06 ` Robert Olsson
@ 2008-01-23 16:31 ` Stephen Hemminger
  0 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-23 16:31 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David Miller, netdev

Robert Olsson wrote:
> Stephen Hemminger writes:
>
>> Time to handle a full BGP load (163K routes):
>>
>>              Load   Dump   Flush
>>   kmem_cache  3.8   13.0    7.2
>>   iter        3.9   12.3    6.9
>>   unordered   3.1   11.9    4.9
>>   find_node   3.1    0.3    1.2
>
> I certainly like the speed, but what will we break when we don't return
> routes in longest-prefix order?
>
> labb:/# ip r
> default via 10.10.10.1 dev eth0
> 5.0.0.0/8 via 192.168.2.2 dev eth3
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> 10.10.11.0/24 dev eth1 proto kernel scope link src 10.10.11.1
> 11.0.0.0/8 via 10.10.11.2 dev eth1
> 192.168.1.0/24 dev eth2 proto kernel scope link src 192.168.1.2
> 192.168.2.0/24 dev eth3 proto kernel scope link src 192.168.2.1
>
> labb:/# ip route list match 10.10.10.1
> default via 10.10.10.1 dev eth0
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> labb:/#
>
> Maybe the unordered dump can be ordered cheaply...

Hash returned the routes in prefix order (then random). Returning the
routes in numerical order seems just as logical. I'm going to test on
quagga.

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 14:06 ` Robert Olsson
  2008-01-23 16:31 ` Stephen Hemminger
@ 2008-01-23 23:49 ` Stephen Hemminger
  2008-01-24  9:36 ` Robert Olsson
  2008-02-01 18:00 ` Robert Olsson
  1 sibling, 2 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-23 23:49 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David Miller, netdev

On Wed, 23 Jan 2008 15:06:47 +0100
Robert Olsson <Robert.Olsson@data.slu.se> wrote:

> Stephen Hemminger writes:
>
>> Time to handle a full BGP load (163K routes):
>>
>>              Load   Dump   Flush
>>   kmem_cache  3.8   13.0    7.2
>>   iter        3.9   12.3    6.9
>>   unordered   3.1   11.9    4.9
>>   find_node   3.1    0.3    1.2
>
> I certainly like the speed, but what will we break when we don't return
> routes in longest-prefix order?
>
> labb:/# ip r
> default via 10.10.10.1 dev eth0
> 5.0.0.0/8 via 192.168.2.2 dev eth3
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> 10.10.11.0/24 dev eth1 proto kernel scope link src 10.10.11.1
> 11.0.0.0/8 via 10.10.11.2 dev eth1
> 192.168.1.0/24 dev eth2 proto kernel scope link src 192.168.1.2
> 192.168.2.0/24 dev eth3 proto kernel scope link src 192.168.2.1
>
> labb:/# ip route list match 10.10.10.1
> default via 10.10.10.1 dev eth0
> 10.10.10.0/24 dev eth0 proto kernel scope link src 10.10.10.2
> labb:/#
>
> Maybe the unordered dump can be ordered cheaply...

Dumping by prefix is possible, but about 32x slower. Dumping in address
order is just as logical. Like I said, I'm investigating what quagga
handles.

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 23:49 ` Stephen Hemminger
@ 2008-01-24  9:36 ` Robert Olsson
  2008-01-24 16:18 ` Stephen Hemminger
  0 siblings, 1 reply; 17+ messages in thread
From: Robert Olsson @ 2008-01-24  9:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, netdev

Stephen Hemminger writes:

> Dumping by prefix is possible, but about 32x slower. Dumping in address
> order is just as logical. Like I said, I'm investigating what quagga
> handles.

How about taking a snapshot in address order (as you did) into some
allocated memory, and then returning from that memory in prefix order?
This would solve the -EBUSY too, and give a consistent view of the
routing table at the time of the dump/snapshot.

Cheers
					--ro

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [IPV4 0/9] TRIE performance patches
  2008-01-24  9:36 ` Robert Olsson
@ 2008-01-24 16:18 ` Stephen Hemminger
  0 siblings, 0 replies; 17+ messages in thread
From: Stephen Hemminger @ 2008-01-24 16:18 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Robert Olsson, David Miller, netdev

On Thu, 24 Jan 2008 10:36:45 +0100
Robert Olsson <Robert.Olsson@data.slu.se> wrote:

> Stephen Hemminger writes:
>
>> Dumping by prefix is possible, but about 32x slower. Dumping in address
>> order is just as logical. Like I said, I'm investigating what quagga
>> handles.
>
> How about taking a snapshot in address order (as you did) into some
> allocated memory, and then returning from that memory in prefix order?
> This would solve the -EBUSY too, and give a consistent view of the
> routing table at the time of the dump/snapshot.

Snapshotting is not going to work, because of scale and because the
kernel can't tell when the application is going to come back.

--
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [IPV4 0/9] TRIE performance patches
  2008-01-23 23:49 ` Stephen Hemminger
  2008-01-24  9:36 ` Robert Olsson
@ 2008-02-01 18:00 ` Robert Olsson
  1 sibling, 0 replies; 17+ messages in thread
From: Robert Olsson @ 2008-02-01 18:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, netdev

Hello, finally got some time to test...

Table w. 214k routes with full rDoS on two interfaces, on 2 x AMD64
processors, speed 2814.43 MHz. Profiled with CPU_CLK_UNHALTED and
rtstat.

Without the latest fib_trie patches; tput ~233 kpps:

samples  %        symbol name
109925   14.4513  fn_trie_lookup
109821   14.4376  ip_route_input
87245    11.4696  rt_intern_hash
31270     4.1109  kmem_cache_alloc
24159     3.1761  dev_queue_xmit
23200     3.0500  neigh_lookup
22464     2.9532  free_block
18412     2.4205  kmem_cache_free
17830     2.3440  dst_destroy
15740     2.0693  fib_get_table

With the latest fib_trie patches (Stephen's others): same throughput,
~233 kpps, but we see a different profile. Why we don't get any better
throughput is yet to be understood (the drops in qdisc could be the
cause); more analysis needed.

79242    14.3520  ip_route_input
65188    11.8066  fn_trie_lookup
64559    11.6927  rt_intern_hash
22901     4.1477  kmem_cache_alloc
21038     3.8103  check_leaf
16197     2.9335  neigh_lookup
14802     2.6809  free_block
14596     2.6436  ip_rcv_finish
12365     2.2395  fib_validate_source
12048     2.1821  dst_destroy

fib_hash throughput ~177 kpps. Hard work for fib_hash here, as we have
many zones; it can be fast with fewer zones.

200568   37.8013  fn_hash_lookup
58352    10.9977  ip_route_input
44495     8.3860  rt_intern_hash
12873     2.4262  kmem_cache_alloc
12115     2.2833  rt_may_expire
11691     2.2034  rt_garbage_collect
10821     2.0394  dev_queue_xmit
9999      1.8845  fib_validate_source
8762      1.6514  fib_get_table
8558      1.6129  fib_semantic_match

Cheers
					--ro

^ permalink raw reply	[flat|nested] 17+ messages in thread