All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <dada1@cosmosbay.com>
To: Stephen Hemminger <stephen.hemminger@vyatta.com>
Cc: David Miller <davem@davemloft.net>,
	robert.olsson@its.uu.se, netdev@vger.kernel.org
Subject: Re: [RFC 6/6] fib_trie: combine leaf and info
Date: Tue, 15 Jan 2008 07:12:07 +0100	[thread overview]
Message-ID: <478C4EB7.6060807@cosmosbay.com> (raw)
In-Reply-To: <20080114210716.4b09c84d@deepthought>

[-- Attachment #1: Type: text/plain, Size: 9629 bytes --]

Stephen Hemminger a écrit :
> Combine the prefix information and the leaf together into one
> allocation. This is furthur simplified by converting the hlist
> into a simple bitfield.
> 
> Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>
> 
> This patch is experimental (ie it boots and loads routes), but
> is slower for the 163,000 route dump test.
> 

Its also very memory expensive. That was not Robert suggestion I believe.

Its suggestion is to embedd one info into each leaves.

Please find attached to this mail a preliminary and incomplete patch I wrote 
this morning before coffee :), to get the idea.

Before patching this file, maybe we should first do a cleanup (checkpatch.pl 
and obvious modern style formating :) )
> ---
>  net/ipv4/fib_trie.c |  165 ++++++++++++++++++++--------------------------------
>  1 file changed, 65 insertions(+), 100 deletions(-)
> 
> --- a/net/ipv4/fib_trie.c	2008-01-14 18:37:51.000000000 -0800
> +++ b/net/ipv4/fib_trie.c	2008-01-14 20:57:12.000000000 -0800
> @@ -50,10 +50,9 @@
>   *		Patrick McHardy <kaber@trash.net>
>   */
>  
> -#define VERSION "0.408"
> +#define VERSION "0.409"
> +
>  
> -#include <asm/uaccess.h>
> -#include <asm/system.h>
>  #include <linux/bitops.h>
>  #include <linux/types.h>
>  #include <linux/kernel.h>
> @@ -80,6 +79,8 @@
>  #include <net/tcp.h>
>  #include <net/sock.h>
>  #include <net/ip_fib.h>
> +#include <asm/bitops.h>
> +
>  #include "fib_lookup.h"
>  
>  #define MAX_STAT_DEPTH 32
> @@ -101,20 +102,24 @@ struct node {
>  	t_key key;
>  };
>  
> +struct leaf_info {
> +	struct list_head falh;
> +};
> +
> +#define INFO_SIZE	33
> +/* NB: bitmap is in reverse order, so that find_first returns largest prefix */
> +#define PLEN2BIT(x)	(INFO_SIZE - (x))
> +
>  struct leaf {
>  	unsigned long parent;
>  	t_key key;
> -	struct hlist_head list;
>  	struct rcu_head rcu;
> -};
>  
> -struct leaf_info {
> -	struct hlist_node hlist;
> -	struct rcu_head rcu;
> -	int plen;
> -	struct list_head falh;
> +	unsigned long bitmap[INFO_SIZE / BITS_PER_LONG + 1];
> +	struct leaf_info info[INFO_SIZE];
>  };
>  
> +
>  struct tnode {
>  	unsigned long parent;
>  	t_key key;
> @@ -321,16 +326,6 @@ static void __leaf_free_rcu(struct rcu_h
>  	kmem_cache_free(trie_leaf_kmem, leaf);
>  }
>  
> -static void __leaf_info_free_rcu(struct rcu_head *head)
> -{
> -	kfree(container_of(head, struct leaf_info, rcu));
> -}
> -
> -static inline void free_leaf_info(struct leaf_info *leaf)
> -{
> -	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
> -}
> -
>  static struct tnode *tnode_alloc(size_t size)
>  {
>  	struct page *pages;
> @@ -371,21 +366,37 @@ static struct leaf *leaf_new(void)
>  	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
>  	if (l) {
>  		l->parent = T_LEAF;
> -		INIT_HLIST_HEAD(&l->list);
> +		bitmap_zero(l->bitmap, INFO_SIZE);
>  	}
>  	return l;
>  }
>  
> -static struct leaf_info *leaf_info_new(int plen)
> +static inline struct leaf_info *find_leaf_info(struct leaf *l, int plen)
>  {
> -	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
> -	if (li) {
> -		li->plen = plen;
> -		INIT_LIST_HEAD(&li->falh);
> -	}
> +	return test_bit(PLEN2BIT(plen), l->bitmap) ? &l->info[plen] : NULL;
> +}
> +
> +static struct leaf_info *set_leaf_info(struct leaf *l, int plen)
> +{
> +	struct leaf_info *li = &l->info[plen];
> +
> +	INIT_LIST_HEAD(&li->falh);
> +	set_bit(PLEN2BIT(plen), l->bitmap);
> +
>  	return li;
>  }
>  
> +static inline void clear_leaf_info(struct leaf *l, int plen)
> +{
> +	smp_mb__before_clear_bit();
> +	clear_bit(PLEN2BIT(plen), &l->bitmap);
> +}
> +
> +static inline int leaf_info_empty(const struct leaf *l)
> +{
> +	return find_first_bit(l->bitmap, INFO_SIZE) >= INFO_SIZE;
> +}
> +
>  static struct tnode* tnode_new(t_key key, int pos, int bits)
>  {
>  	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
> @@ -876,20 +887,6 @@ nomem:
>  
>  /* readside must use rcu_read_lock currently dump routines
>   via get_fa_head and dump */
> -
> -static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
> -{
> -	struct hlist_head *head = &l->list;
> -	struct hlist_node *node;
> -	struct leaf_info *li;
> -
> -	hlist_for_each_entry_rcu(li, node, head, hlist)
> -		if (li->plen == plen)
> -			return li;
> -
> -	return NULL;
> -}
> -
>  static inline struct list_head * get_fa_head(struct leaf *l, int plen)
>  {
>  	struct leaf_info *li = find_leaf_info(l, plen);
> @@ -900,26 +897,6 @@ static inline struct list_head * get_fa_
>  	return &li->falh;
>  }
>  
> -static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
> -{
> -	struct leaf_info *li = NULL, *last = NULL;
> -	struct hlist_node *node;
> -
> -	if (hlist_empty(head)) {
> -		hlist_add_head_rcu(&new->hlist, head);
> -	} else {
> -		hlist_for_each_entry(li, node, head, hlist) {
> -			if (new->plen > li->plen)
> -				break;
> -
> -			last = li;
> -		}
> -		if (last)
> -			hlist_add_after_rcu(&last->hlist, &new->hlist);
> -		else
> -			hlist_add_before_rcu(&new->hlist, &li->hlist);
> -	}
> -}
>  
>  /* rcu_read_lock needs to be hold by caller from readside */
>  
> @@ -993,6 +970,8 @@ static struct list_head *fib_insert_node
>  	pos = 0;
>  	n = t->trie;
>  
> +	pr_debug("fib_insert_node: %x/%d\n", key, plen);
> +
>  	/* If we point to NULL, stop. Either the tree is empty and we should
>  	 * just put a new leaf in if, or we have reached an empty child slot,
>  	 * and we should just put our new leaf in that.
> @@ -1038,13 +1017,9 @@ static struct list_head *fib_insert_node
>  
>  	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
>  		l = (struct leaf *) n;
> -		li = leaf_info_new(plen);
> -
> -		if (!li)
> -			return NULL;
> -
> +		li = set_leaf_info(l, plen);
>  		fa_head = &li->falh;
> -		insert_leaf_info(&l->list, li);
> +
>  		goto done;
>  	}
>  	l = leaf_new();
> @@ -1053,15 +1028,8 @@ static struct list_head *fib_insert_node
>  		return NULL;
>  
>  	l->key = key;
> -	li = leaf_info_new(plen);
> -
> -	if (!li) {
> -		tnode_free((struct tnode *) l);
> -		return NULL;
> -	}
> -
> +	li = set_leaf_info(l, plen);
>  	fa_head = &li->falh;
> -	insert_leaf_info(&l->list, li);
>  
>  	if (t->trie && n == NULL) {
>  		/* Case 2: n is NULL, and will just insert a new leaf */
> @@ -1091,7 +1059,6 @@ static struct list_head *fib_insert_node
>  		}
>  
>  		if (!tn) {
> -			free_leaf_info(li);
>  			tnode_free((struct tnode *) l);
>  			return NULL;
>  		}
> @@ -1119,7 +1086,7 @@ static struct list_head *fib_insert_node
>  
>  	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
>  done:
> -	return fa_head;
> +	return &li->falh;
>  }
>  
>  /*
> @@ -1281,14 +1248,14 @@ static int check_leaf(struct trie *t, st
>  		      t_key key,  const struct flowi *flp,
>  		      struct fib_result *res)
>  {
> -	struct leaf_info *li;
> -	struct hlist_head *hhead = &l->list;
> -	struct hlist_node *node;
> +	int bit;
>  
> -	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
> -		int err;
> -		int plen = li->plen;
> +	/* need find highest prefix */
> +	for_each_bit(bit, l->bitmap, INFO_SIZE) {
> +		int plen = PLEN2BIT(bit);
> +		struct leaf_info *li = &l->info[plen];
>  		__be32 mask = inet_make_mask(plen);
> +		int err;
>  
>  		if (l->key != (key & ntohl(mask)))
>  			continue;
> @@ -1622,12 +1589,10 @@ static int fn_trie_delete(struct fib_tab
>  
>  	list_del_rcu(&fa->fa_list);
>  
> -	if (list_empty(fa_head)) {
> -		hlist_del_rcu(&li->hlist);
> -		free_leaf_info(li);
> -	}
> +	if (list_empty(fa_head))
> +		clear_leaf_info(l, plen);
>  
> -	if (hlist_empty(&l->list))
> +	if (leaf_info_empty(l))
>  		trie_leaf_remove(t, key);
>  
>  	if (fa->fa_state & FA_S_ACCESSED)
> @@ -1659,18 +1624,17 @@ static int trie_flush_list(struct trie *
>  static int trie_flush_leaf(struct trie *t, struct leaf *l)
>  {
>  	int found = 0;
> -	struct hlist_head *lih = &l->list;
> -	struct hlist_node *node, *tmp;
> -	struct leaf_info *li = NULL;
> +	int bit;
>  
> -	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
> +	for_each_bit(bit, l->bitmap, INFO_SIZE) {
> +		int plen = PLEN2BIT(bit);
> +		struct leaf_info *li = &l->info[plen];
>  		found += trie_flush_list(t, &li->falh);
>  
> -		if (list_empty(&li->falh)) {
> -			hlist_del_rcu(&li->hlist);
> -			free_leaf_info(li);
> -		}
> +		if (list_empty(&li->falh))
> +			clear_leaf_info(l, plen);
>  	}
> +
>  	return found;
>  }
>  
> @@ -1746,12 +1710,12 @@ static int fn_trie_flush(struct fib_tabl
>  	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
>  		found += trie_flush_leaf(t, l);
>  
> -		if (ll && hlist_empty(&ll->list))
> +		if (ll && leaf_info_empty(ll))
>  			trie_leaf_remove(t, ll->key);
>  		ll = l;
>  	}
>  
> -	if (ll && hlist_empty(&ll->list))
> +	if (ll && leaf_info_empty(ll))
>  		trie_leaf_remove(t, ll->key);
>  
>  	pr_debug("trie_flush found=%d\n", found);
> @@ -2428,10 +2392,11 @@ static int fib_route_seq_show(struct seq
>  
>  	if (iter->trie == iter->trie_local)
>  		return 0;
> +
>  	if (IS_TNODE(l))
>  		return 0;
>  
> -	for (i=32; i>=0; i--) {
> +	for (i = 32; i >= 0; i--) {
>  		struct leaf_info *li = find_leaf_info(l, i);
>  		struct fib_alias *fa;
>  		__be32 mask, prefix;
> @@ -2439,7 +2404,7 @@ static int fib_route_seq_show(struct seq
>  		if (!li)
>  			continue;
>  
> -		mask = inet_make_mask(li->plen);
> +		mask = inet_make_mask(i);
>  		prefix = htonl(l->key);
>  
>  		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


[-- Attachment #2: info_embedded.patch --]
[-- Type: text/plain, Size: 3670 bytes --]

--- net/ipv4/fib_trie.c
+++ net/ipv4/fib_trie.c.rcu
@@ -97,30 +97,22 @@
 #define IS_LEAF(n) (n->parent & T_LEAF)
 
 struct node {
-	unsigned long		parent;
-	t_key			key;
+	unsigned long parent;
+	t_key key;
 };
 
 struct leaf {
-	unsigned long		parent;
-	t_key			key;
-	/*
-	 * Because we have at least one info per leaf, we embedd one here
-	 * to save some space and speedup lookups (sharing cache line)
-	 * Note : not inside a structure so that we dont have holes on 64bit 
-	 */
-	int 			plen_iinfo;
-	struct list_head	falh_iinfo;
-
-	struct hlist_head	list;
-	struct rcu_head		rcu;
+	unsigned long parent;
+	t_key key;
+	struct hlist_head list;
+	struct rcu_head rcu;
 };
 
 struct leaf_info {
-	struct hlist_node	hlist;
-	int			plen;
-	struct list_head	falh;
-	struct rcu_head		rcu;
+	struct hlist_node hlist;
+	struct rcu_head rcu;
+	int plen;
+	struct list_head falh;
 };
 
 struct tnode {
@@ -381,13 +373,11 @@
 		call_rcu(&tn->rcu, __tnode_free_rcu);
 }
 
-static struct leaf *leaf_new(int plen)
+static struct leaf *leaf_new(void)
 {
 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
 	if (l) {
 		l->parent = T_LEAF;
-		l->plen_iinfo = plen;
-		INIT_LIST_HEAD(&l->falh_iinfo);
 		INIT_HLIST_HEAD(&l->list);
 	}
 	return l;
@@ -909,12 +899,7 @@
 
 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
 {
-	struct leaf_info *li;
-
-	if (plen == l->plen_iinfo)
-		return &l->falh_iinfo;
-
-	li = find_leaf_info(l, plen);
+	struct leaf_info *li = find_leaf_info(l, plen);
 
 	if (!li)
 		return NULL;
@@ -1071,22 +1056,21 @@
 		goto done;
 	}
 	t->size++;
-	l = leaf_new(plen);
+	l = leaf_new();
 
 	if (!l)
 		return NULL;
 
 	l->key = key;
-//	li = leaf_info_new(plen);
+	li = leaf_info_new(plen);
+
+	if (!li) {
+		tnode_free((struct tnode *) l);
+		return NULL;
+	}
 
-//	if (!li) {
-//		tnode_free((struct tnode *) l);
-//		return NULL;
-//	}
-
-//	fa_head = &li->falh;
-//	insert_leaf_info(&l->list, li);
-	fa_head = &l->falh_iinfo;
+	fa_head = &li->falh;
+	insert_leaf_info(&l->list, li);
 
 	if (t->trie && n == NULL) {
 		/* Case 2: n is NULL, and will just insert a new leaf */
@@ -1116,7 +1100,7 @@
 		}
 
 		if (!tn) {
-//			free_leaf_info(li);
+			free_leaf_info(li);
 			tnode_free((struct tnode *) l);
 			return NULL;
 		}
@@ -1640,7 +1624,7 @@
 	li = find_leaf_info(l, plen);
 
 	list_del_rcu(&fa->fa_list);
-// FIXME
+
 	if (list_empty(fa_head)) {
 		hlist_del_rcu(&li->hlist);
 		free_leaf_info(li);
@@ -2382,11 +2366,10 @@
 		seq_indent(seq, iter->depth);
 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
 		for (i = 32; i >= 0; i--) {
-//			struct leaf_info *li = get_fa_head(l, i); //find_leaf_info(l, i);
-			struct list_head *fa_head = get_fa_head(l, i);
-			if (fa_head) {
+			struct leaf_info *li = find_leaf_info(l, i);
+			if (li) {
 				struct fib_alias *fa;
-				list_for_each_entry_rcu(fa, fa_head, fa_list) {
+				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 					seq_indent(seq, iter->depth+1);
 					seq_printf(seq, "  /%d %s %s", i,
 						   rtn_scope(fa->fa_scope),
@@ -2465,18 +2448,17 @@
 		return 0;
 
 	for (i=32; i>=0; i--) {
-		//struct leaf_info *li = find_leaf_info(l, i);
-		struct list_head *fa_head = get_fa_head(l, i);
+		struct leaf_info *li = find_leaf_info(l, i);
 		struct fib_alias *fa;
 		__be32 mask, prefix;
 
-		if (!fa_head)
+		if (!li)
 			continue;
 
-		mask = inet_make_mask(i);
+		mask = inet_make_mask(li->plen);
 		prefix = htonl(l->key);
 
-		list_for_each_entry_rcu(fa, fa_head, fa_list) {
+		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 			const struct fib_info *fi = fa->fa_info;
 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
 

  reply	other threads:[~2008-01-15  6:12 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20080112064513.803976049@linux-foundation.org>
     [not found] ` <20080112064646.659443238@linux-foundation.org>
2008-01-12 11:16   ` [PATCH 9/9] fix sparse warnings Eric Dumazet
2008-01-12 11:28     ` David Miller
2008-01-12 21:08     ` Stephen Hemminger
2008-01-14 11:07       ` Robert Olsson
2008-01-14 17:34         ` Eric Dumazet
2008-01-14 17:59           ` Robert Olsson
2008-01-14 19:27             ` [FIB]: Avoid using static variables without proper locking Eric Dumazet
2008-01-15  7:10               ` David Miller
2008-01-12 21:09     ` [PATCH 9/9] fix sparse warnings Stephen Hemminger
2008-01-13  5:28       ` David Miller
2008-01-13 18:30     ` [FIB]: full_children & empty_children should be uint, not ushort Eric Dumazet
2008-01-13 22:02       ` Robert Olsson
2008-01-14  6:32         ` David Miller
2008-01-13  5:25   ` [PATCH 9/9] fix sparse warnings David Miller
     [not found] ` <20080112064646.056241123@linux-foundation.org>
2008-01-13  4:49   ` [PATCH 1/9] get rid of trie_init David Miller
     [not found] ` <20080112064646.132747871@linux-foundation.org>
2008-01-13  4:50   ` [PATCH 2/9] get rid of unused revision element David Miller
2008-01-14 11:44     ` Robert Olsson
2008-01-14 12:06       ` David Miller
2008-01-14 16:35         ` Stephen Hemminger
2008-01-15  7:07           ` David Miller
     [not found] ` <20080112064646.207183428@linux-foundation.org>
2008-01-13  4:53   ` [PATCH 3/9] move size information to pr_debug() David Miller
     [not found] ` <20080112064646.282104074@linux-foundation.org>
2008-01-13  4:55   ` [PATCH 4/9] statistics improvements David Miller
2008-01-13  5:33     ` Stephen Hemminger
2008-01-13  5:44       ` David Miller
2008-01-14 20:57         ` [PATCH] [IPV4] fib_trie: size and statistics Stephen Hemminger
     [not found]           ` <20080114164450.55f8c9b2@deepthought>
2008-01-15  0:46             ` [PATCH 3/6] [IPV4] trie: put leaf nodes in a slab cache Stephen Hemminger
2008-01-15  0:47               ` [PATCH 4/6] [IPV4] fib_trie style cleanup Stephen Hemminger
2008-01-15  2:58                 ` [PATCH 5/6] [IPV4] fib_trie: checkleaf calling convention Stephen Hemminger
2008-01-15  5:07                   ` [RFC 6/6] fib_trie: combine leaf and info Stephen Hemminger
2008-01-15  6:12                     ` Eric Dumazet [this message]
2008-01-15  6:16                       ` Eric Dumazet
2008-01-15 16:19                         ` Stephen Hemminger
2008-01-15 16:44                           ` Robert Olsson
2008-01-15 17:25                             ` Eric Dumazet
2008-01-15 17:47                               ` Stephen Hemminger
2008-01-15 18:10                                 ` Eric Dumazet
2008-01-15 18:15                                   ` Stephen Hemminger
2008-01-15 18:32                                     ` Eric Dumazet
2008-01-15 20:18                                 ` Robert Olsson
2008-01-15 21:16                                   ` Eric Dumazet
2008-01-15 17:59                               ` Robert Olsson
2008-01-15  6:49               ` [PATCH 3/6] [IPV4] trie: put leaf nodes in a slab cache Eric Dumazet
2008-01-15  7:29               ` David Miller
2008-01-15  5:00           ` [PATCH 2/6] [IPV4] fib hash|trie initialization Stephen Hemminger
2008-01-15  7:14             ` David Miller
2008-01-15  6:55           ` [PATCH] [IPV4] fib_trie: size and statistics Eric Dumazet
2008-01-15  7:28             ` David Miller
2008-01-15  7:12           ` David Miller
     [not found] ` <20080112064646.356466158@linux-foundation.org>
2008-01-13  4:56   ` [PATCH 5/9] use %u for unsigned printfs David Miller
     [not found] ` <20080112064646.432200237@linux-foundation.org>
2008-01-13  4:57   ` [PATCH 6/9] : fib_insert_node cleanup David Miller
     [not found] ` <20080112064646.507015655@linux-foundation.org>
2008-01-13  4:58   ` [PATCH 7/9] printk related cleanups David Miller
     [not found] ` <20080112064646.583836190@linux-foundation.org>
2008-01-13  5:23   ` [PATCH 8/9] add statistics David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=478C4EB7.6060807@cosmosbay.com \
    --to=dada1@cosmosbay.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=robert.olsson@its.uu.se \
    --cc=stephen.hemminger@vyatta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.