netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jarek Poplawski <jarkao2@gmail.com>
To: Robert Olsson <robert@robur.slu.se>
Cc: "Robert Olsson" <Robert.Olsson@data.slu.se>,
	"Eric Dumazet" <dada1@cosmosbay.com>,
	=?ISO-8859-2?Q?Pawe=B3_Staszewski?= <pstaszewski@itcare.pl>,
	"Robert Olsson" <robert.olsson@its.uu.se>,
	"Linux Network Development list" <netdev@vger.kernel.org>
Subject: Re: rib_trie / Fix inflate_threshold_root. Now=15 size=11 bits
Date: Fri, 26 Jun 2009 12:54:49 +0000	[thread overview]
Message-ID: <20090626125449.GA8897@ff.dom.local> (raw)
In-Reply-To: <19012.49700.908412.410984@robur.slu.se>

On Fri, Jun 26, 2009 at 02:42:12PM +0200, Robert Olsson wrote:
> 
> Jarek Poplawski writes:
> 
>  > >  But maybe memory allocation experts has some good suggestions.
>  > 
>  > Pawel has reported these problems for a long time:
>  > http://bugzilla.kernel.org/show_bug.cgi?id=6648
>  > 
>  > So, until it's fully investigated, it seems some 'fast' fix is needed
>  > here.
> 
>  We talked about having a fixed pre-allocated root-node long ago but it's only 
>  optimisation for routers w. full BGP. Best if memory problems got solved.
>  

I think the current process of rebalancing can allocate and hold
unnecessarily long a lot of 'temp' memory, so probably something
like the patch below could be useful. It should be applied to the
2.6.30 after two patches below (from 2.6.31-rc). (Alas I can't even
compile-test it now).

Cheers,
Jarek P.

--- (for testing)

 net/ipv4/fib_trie.c |   24 ++++++++++++++++++------
 1 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 012cf5a..c2fc862 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -366,6 +366,14 @@ static void __tnode_vfree(struct work_struct *arg)
 	vfree(tn);
 }
 
+static void __tnode_free(struct tnode *tn)
+{
+	if (size <= PAGE_SIZE)
+		kfree(tn);
+	else
+		vfree(tn);
+}
+
 static void __tnode_free_rcu(struct rcu_head *head)
 {
 	struct tnode *tn = container_of(head, struct tnode, rcu);
@@ -402,7 +410,7 @@ static void tnode_free_flush(void)
 	while ((tn = tnode_free_head)) {
 		tnode_free_head = tn->tnode_free;
 		tn->tnode_free = NULL;
-		tnode_free(tn);
+		__tnode_free(tn);
 	}
 }
 
@@ -1020,19 +1028,23 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 		tnode_put_child_reorg((struct tnode *)tp, cindex,
 				      (struct node *)tn, wasfull);
 
-		tp = node_parent((struct node *) tn);
+		synchronize_rcu();
 		tnode_free_flush();
+		tp = node_parent((struct node *) tn);
 		if (!tp)
 			break;
 		tn = tp;
 	}
 
 	/* Handle last (top) tnode */
-	if (IS_TNODE(tn))
+	if (IS_TNODE(tn)) {
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
-
-	rcu_assign_pointer(t->trie, (struct node *)tn);
-	tnode_free_flush();
+		rcu_assign_pointer(t->trie, (struct node *)tn);
+		synchronize_rcu();
+		tnode_free_flush();
+	} else {
+		rcu_assign_pointer(t->trie, (struct node *)tn);
+	}
 
 	return;
 }

---
commit e0f7cb8c8cc6cccce28d2ce39ad8c60d23c3799f
Author: Jarek Poplawski <jarkao2@gmail.com>
Date:   Mon Jun 15 02:31:29 2009 -0700

    ipv4: Fix fib_trie rebalancing
    
    While doing trie_rebalance(): resize(), inflate(), halve() RCU free
    tnodes before updating their parents. It depends on RCU delaying the
    real destruction, but if RCU readers start after call_rcu() and before
    parent update they could access freed memory.
    
    It is currently prevented with preempt_disable() on the update side,
    but it's not safe, except maybe classic RCU, plus it conflicts with
    memory allocations with GFP_KERNEL flag used from these functions.
    
    This patch explicitly delays freeing of tnodes by adding them to the
    list, which is flushed after the update is finished.
    
    Reported-by: Yan Zheng <zheng.yan@oracle.com>
    Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 538d2a9..d1a39b1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -123,6 +123,7 @@ struct tnode {
 	union {
 		struct rcu_head rcu;
 		struct work_struct work;
+		struct tnode *tnode_free;
 	};
 	struct node *child[0];
 };
@@ -161,6 +162,8 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 static struct node *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
+/* tnodes to free after resize(); protected by RTNL */
+static struct tnode *tnode_free_head;
 
 static struct kmem_cache *fn_alias_kmem __read_mostly;
 static struct kmem_cache *trie_leaf_kmem __read_mostly;
@@ -385,6 +388,29 @@ static inline void tnode_free(struct tnode *tn)
 		call_rcu(&tn->rcu, __tnode_free_rcu);
 }
 
+static void tnode_free_safe(struct tnode *tn)
+{
+	BUG_ON(IS_LEAF(tn));
+
+	if (node_parent((struct node *) tn)) {
+		tn->tnode_free = tnode_free_head;
+		tnode_free_head = tn;
+	} else {
+		tnode_free(tn);
+	}
+}
+
+static void tnode_free_flush(void)
+{
+	struct tnode *tn;
+
+	while ((tn = tnode_free_head)) {
+		tnode_free_head = tn->tnode_free;
+		tn->tnode_free = NULL;
+		tnode_free(tn);
+	}
+}
+
 static struct leaf *leaf_new(void)
 {
 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -495,7 +521,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 	/* No children */
 	if (tn->empty_children == tnode_child_length(tn)) {
-		tnode_free(tn);
+		tnode_free_safe(tn);
 		return NULL;
 	}
 	/* One child */
@@ -509,7 +535,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 			/* compress one level */
 			node_set_parent(n, NULL);
-			tnode_free(tn);
+			tnode_free_safe(tn);
 			return n;
 		}
 	/*
@@ -670,7 +696,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 			/* compress one level */
 
 			node_set_parent(n, NULL);
-			tnode_free(tn);
+			tnode_free_safe(tn);
 			return n;
 		}
 
@@ -756,7 +782,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 			put_child(t, tn, 2*i, inode->child[0]);
 			put_child(t, tn, 2*i+1, inode->child[1]);
 
-			tnode_free(inode);
+			tnode_free_safe(inode);
 			continue;
 		}
 
@@ -801,9 +827,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 		put_child(t, tn, 2*i, resize(t, left));
 		put_child(t, tn, 2*i+1, resize(t, right));
 
-		tnode_free(inode);
+		tnode_free_safe(inode);
 	}
-	tnode_free(oldtnode);
+	tnode_free_safe(oldtnode);
 	return tn;
 nomem:
 	{
@@ -885,7 +911,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 		put_child(t, newBinNode, 1, right);
 		put_child(t, tn, i/2, resize(t, newBinNode));
 	}
-	tnode_free(oldtnode);
+	tnode_free_safe(oldtnode);
 	return tn;
 nomem:
 	{
@@ -989,7 +1015,6 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	t_key cindex, key;
 	struct tnode *tp;
 
-	preempt_disable();
 	key = tn->key;
 
 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
@@ -1001,16 +1026,18 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 				      (struct node *)tn, wasfull);
 
 		tp = node_parent((struct node *) tn);
+		tnode_free_flush();
 		if (!tp)
 			break;
 		tn = tp;
 	}
 
 	/* Handle last (top) tnode */
-	if (IS_TNODE(tn))
+	if (IS_TNODE(tn)) {
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
+		tnode_free_flush();
+	}
 
-	preempt_enable();
 	return (struct node *)tn;
 }
 
---
commit 7b85576d15bf2574b0a451108f59f9ad4170dd3f
Author: Jarek Poplawski <jarkao2@gmail.com>
Date:   Thu Jun 18 00:28:51 2009 -0700

    ipv4: Fix fib_trie rebalancing, part 2
    
    My previous patch, which explicitly delays freeing of tnodes by adding
    them to the list to flush them after the update is finished, isn't
    strict enough. It treats exceptionally tnodes without parent, assuming
    they are newly created, so "invisible" for the read side yet.
    
    But the top tnode doesn't have parent as well, so we have to exclude
    all exceptions (at least until a better way is found). Additionally we
    need to move rcu assignment of this node before flushing, so the
    return type of the trie_rebalance() function is changed.
    
    Reported-by: Yan Zheng <zheng.yan@oracle.com>
    Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d1a39b1..012cf5a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -391,13 +391,8 @@ static inline void tnode_free(struct tnode *tn)
 static void tnode_free_safe(struct tnode *tn)
 {
 	BUG_ON(IS_LEAF(tn));
-
-	if (node_parent((struct node *) tn)) {
-		tn->tnode_free = tnode_free_head;
-		tnode_free_head = tn;
-	} else {
-		tnode_free(tn);
-	}
+	tn->tnode_free = tnode_free_head;
+	tnode_free_head = tn;
 }
 
 static void tnode_free_flush(void)
@@ -1009,7 +1004,7 @@ fib_find_node(struct trie *t, u32 key)
 	return NULL;
 }
 
-static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct tnode *tn)
 {
 	int wasfull;
 	t_key cindex, key;
@@ -1033,12 +1028,13 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	}
 
 	/* Handle last (top) tnode */
-	if (IS_TNODE(tn)) {
+	if (IS_TNODE(tn))
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
-		tnode_free_flush();
-	}
 
-	return (struct node *)tn;
+	rcu_assign_pointer(t->trie, (struct node *)tn);
+	tnode_free_flush();
+
+	return;
 }
 
 /* only used from updater-side */
@@ -1186,7 +1182,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 
 	/* Rebalance the trie */
 
-	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+	trie_rebalance(t, tp);
 done:
 	return fa_head;
 }
@@ -1605,7 +1601,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
 	if (tp) {
 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
 		put_child(t, (struct tnode *)tp, cindex, NULL);
-		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+		trie_rebalance(t, tp);
 	} else
 		rcu_assign_pointer(t->trie, NULL);
 

  reply	other threads:[~2009-06-26 12:54 UTC|newest]

Thread overview: 99+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-06-25 15:48 rib_trie / Fix inflate_threshold_root. Now=15 size=11 bits Paweł Staszewski
2009-06-25 21:19 ` Eric Dumazet
2009-06-25 21:52   ` Paweł Staszewski
2009-06-25 22:54     ` Eric Dumazet
2009-06-26 10:06       ` Paweł Staszewski
2009-06-26 10:34         ` Eric Dumazet
2009-06-26 10:47           ` Paweł Staszewski
2009-06-26 10:52             ` Eric Dumazet
2009-06-26 17:26               ` Paweł Staszewski
2009-06-26  8:03   ` Jarek Poplawski
2009-06-26  9:19     ` Robert Olsson
2009-06-26  9:37       ` Jarek Poplawski
2009-06-26 10:26         ` Jorge Boncompte [DTI2]
2009-06-26 12:42         ` Robert Olsson
2009-06-26 12:54           ` Jarek Poplawski [this message]
2009-06-26 13:28             ` Jarek Poplawski
2009-06-26 13:52               ` Robert Olsson
2009-06-26 15:10                 ` Jarek Poplawski
2009-06-26 15:30                   ` Paul E. McKenney
2009-06-26 15:54                     ` Jarek Poplawski
2009-06-26 16:15                       ` Jarek Poplawski
2009-06-26 16:23                         ` Paul E. McKenney
2009-06-26 16:45                           ` Jarek Poplawski
2009-06-26 17:05                             ` Paul E. McKenney
2009-06-26 18:05                               ` Jarek Poplawski
2009-06-26 18:21                                 ` Paul E. McKenney
2009-06-26 20:19                                   ` Jarek Poplawski
2009-06-26 20:26                                 ` Robert Olsson
2009-06-26 20:37                                   ` Jarek Poplawski
2009-06-26 21:20                                     ` Jarek Poplawski
2009-06-27 19:20       ` Jarek Poplawski
2009-06-27 20:51         ` Jarek Poplawski
2009-06-28  0:28           ` Paweł Staszewski
2009-06-28 11:11           ` Robert Olsson
2009-06-29  7:57             ` Paweł Staszewski
2009-06-28 11:04         ` Robert Olsson
2009-06-28 12:03           ` Jarek Poplawski
2009-06-28 14:35           ` Jarek Poplawski
2009-06-28 15:32             ` Paweł Staszewski
2009-06-28 15:48               ` Paweł Staszewski
2009-06-28 19:56                 ` Jarek Poplawski
2009-06-28 21:36                 ` Jarek Poplawski
2009-06-29  8:08                   ` Paweł Staszewski
2009-06-29  8:47                     ` Paweł Staszewski
2009-06-29  9:27                       ` Jarek Poplawski
2009-06-29  9:43                         ` Paweł Staszewski
2009-06-29  8:33                   ` [PATCH net-2.6] " Jarek Poplawski
2009-06-29  9:51                     ` Paweł Staszewski
2009-06-29 10:47                       ` Jarek Poplawski
2009-06-29 16:24                         ` Paweł Staszewski
2009-06-29 17:09                           ` Jarek Poplawski
2009-06-30  7:09                         ` Jarek Poplawski
2009-06-30 20:16                           ` Paweł Staszewski
2009-06-30 20:41                             ` Jarek Poplawski
2009-06-30 23:31                               ` Paweł Staszewski
2009-07-01  6:36                                 ` Jarek Poplawski
     [not found]                                   ` <20090701072409.GA12592@ff.dom.local>
2009-07-01  9:43                                     ` Paweł Staszewski
2009-07-01  9:50                                       ` Paweł Staszewski
2009-07-01 10:13                                       ` Jarek Poplawski
2009-07-01 11:04                                         ` Jarek Poplawski
2009-07-01 22:17                                           ` Paweł Staszewski
2009-07-02  5:32                                             ` Jarek Poplawski
2009-07-02  5:43                                               ` Paweł Staszewski
2009-07-02  6:00                                                 ` Jarek Poplawski
2009-07-02 15:31                                                   ` Robert Olsson
2009-07-02 19:06                                                     ` Jarek Poplawski
2009-07-02 21:32                                                       ` Robert Olsson
2009-07-02 22:13                                                         ` Jarek Poplawski
2009-07-05  0:26                                                   ` Paweł Staszewski
2009-07-05  0:30                                                     ` Paweł Staszewski
2009-07-05 16:20                                                       ` Jarek Poplawski
2009-07-05 17:32                                                         ` Jarek Poplawski
2009-07-05 21:32                                                           ` Paul E. McKenney
2009-07-05 22:23                                                             ` Jarek Poplawski
2009-07-05 23:53                                                               ` Paweł Staszewski
2009-07-06  9:02                                                                 ` Jarek Poplawski
2009-07-07 22:56                                                                   ` Paweł Staszewski
2009-07-07 23:50                                                                     ` Jarek Poplawski
2009-07-09 20:34                                                                       ` Paweł Staszewski
2009-07-14 19:41                                                                         ` [PATCH net-next] " Jarek Poplawski
2009-07-15  7:43                                                                           ` Robert Olsson
2009-07-15 13:05                                                                             ` Jarek Poplawski
2009-07-17  8:08                                                                               ` Robert Olsson
2009-07-20 14:41                                                                           ` David Miller
2009-07-07 23:23                                                                   ` [PATCH net-2.6] " Paweł Staszewski
2009-07-07 23:30                                                                     ` Paweł Staszewski
2009-07-14 18:33                                                             ` [PATCH net-next] " Jarek Poplawski
2009-07-20 14:41                                                               ` David Miller
2009-07-14 21:20                                                             ` [PATCH net-next] ipv4: fib_trie: Use tnode_get_child_rcu() and node_parent_rcu() in lookups Jarek Poplawski
2009-07-20 14:41                                                               ` David Miller
2009-07-05  0:31                                                     ` [PATCH net-2.6] Re: rib_trie / Fix inflate_threshold_root. Now=15 size=11 bits Paweł Staszewski
2009-07-05 12:56                                                     ` [PATCH -stable] " Jarek Poplawski
2009-07-05 13:08                                                     ` [PATCH v2 " Jarek Poplawski
2009-07-08  2:42                                                       ` David Miller
2009-07-08  6:44                                                         ` Jarek Poplawski
2009-06-29 10:58                       ` [PATCH net-2.6] " Jarek Poplawski
2009-06-30 19:48                         ` David Miller
2009-06-30 20:14                           ` Jarek Poplawski
2009-07-10 15:29                           ` Stephen Hemminger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090626125449.GA8897@ff.dom.local \
    --to=jarkao2@gmail.com \
    --cc=Robert.Olsson@data.slu.se \
    --cc=dada1@cosmosbay.com \
    --cc=netdev@vger.kernel.org \
    --cc=pstaszewski@itcare.pl \
    --cc=robert.olsson@its.uu.se \
    --cc=robert@robur.slu.se \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).