All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch] radix-tree: fix small lockless radix-tree bug
@ 2008-06-12 19:03 Nick Piggin
  2008-06-12 19:15 ` Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Nick Piggin @ 2008-06-12 19:03 UTC (permalink / raw)
  To: akpm, Peter Zijlstra, linux-kernel, paulmck

[-- Attachment #1: Type: text/plain, Size: 254 bytes --]

Hi guys,

Although this doesn't seem like cause for alarm (as per the analysis),
it may still be a good 2.6.26 candidate as we should have a few more
weeks of testing left.

It should definitely go in -mm with the lockless pagecache patch.

Thanks,
Nick

[-- Attachment #2: radix-tree-fixup.patch --]
[-- Type: text/x-diff, Size: 5689 bytes --]

When shrinking a radix-tree, we do it in a lockless manner by atomically
switching the root pointer away from the redundant node (one that only
has a single entry in the left most slot), and switching it over to its
lone child.

Because a lockless lookup may have got a reference to the parent and be
in the middle of deciding what to do with it while it is being swapped
away for its child. For this reason, we also have to keep it around and
in a valid state for the lookup to proceed and give a valid result, for
at least an RCU grace period. So we need to keep the child in the left
most slot there in case that is requested by the lookup.

This is all pretty standard RCU stuff. It is worth repeating because
in my eagerness to obey the radix tree node constructor scheme, I had
broken this by zeroing the radix tree node before the grace period.

Fix it by clearing those fields in the RCU callback. I would normally
want to rip out the constructor entirely, but radix tree nodes are one
of those places where they make sense (only few cachelines will be
touched soon after allocation).


This was never actually observed in any lockless pagecache testing or
using the test harness, but as a rare problem testing my scalable vmap
rewrite.

Fortunately, it is not a problem anywhere lockless pagecache is used in
mainline kernels (pagecache probe is not a guarantee, and brd does not
have concurrent lookups and deletes).

However, it would eventually pop up for someone using lockless pagecache :P

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/lib/radix-tree.c
===================================================================
--- linux-2.6.orig/lib/radix-tree.c	2008-06-13 04:26:31.000000000 +1000
+++ linux-2.6/lib/radix-tree.c	2008-06-13 04:31:38.000000000 +1000
@@ -88,6 +88,57 @@ static inline gfp_t root_gfp_mask(struct
 	return root->gfp_mask & __GFP_BITS_MASK;
 }
 
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	return test_bit(offset, node->tags[tag]);
+}
+
+static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline void root_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK;
+}
+
+static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+	int idx;
+	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+		if (node->tags[tag][idx])
+			return 1;
+	}
+	return 0;
+}
 /*
  * This assumes that the caller has performed appropriate preallocation, and
  * that the caller has pinned this thread of control to the current CPU.
@@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
 {
 	struct radix_tree_node *node =
 			container_of(head, struct radix_tree_node, rcu_head);
+
+	/*
+	 * must only free zeroed nodes into the slab. radix_tree_shrink
+	 * can leave us with a non-NULL entry in the first slot, so clear
+	 * that here to make sure.
+	 */
+	tag_clear(node, 0, 0);
+	tag_clear(node, 1, 0);
+	node->slots[0] = NULL;
+	node->count = 0;
+
 	kmem_cache_free(radix_tree_node_cachep, node);
 }
 
@@ -165,59 +227,6 @@ out:
 }
 EXPORT_SYMBOL(radix_tree_preload);
 
-static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	__set_bit(offset, node->tags[tag]);
-}
-
-static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	__clear_bit(offset, node->tags[tag]);
-}
-
-static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	return test_bit(offset, node->tags[tag]);
-}
-
-static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
-{
-	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
-}
-
-
-static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
-{
-	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
-}
-
-static inline void root_tag_clear_all(struct radix_tree_root *root)
-{
-	root->gfp_mask &= __GFP_BITS_MASK;
-}
-
-static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
-{
-	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
-}
-
-/*
- * Returns 1 if any slot in the node has this tag set.
- * Otherwise returns 0.
- */
-static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
-{
-	int idx;
-	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-		if (node->tags[tag][idx])
-			return 1;
-	}
-	return 0;
-}
-
 /*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
@@ -930,11 +939,6 @@ static inline void radix_tree_shrink(str
 			newptr = radix_tree_ptr_to_indirect(newptr);
 		root->rnode = newptr;
 		root->height--;
-		/* must only free zeroed nodes into the slab */
-		tag_clear(to_free, 0, 0);
-		tag_clear(to_free, 1, 0);
-		to_free->slots[0] = NULL;
-		to_free->count = 0;
 		radix_tree_node_free(to_free);
 	}
 }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch] radix-tree: fix small lockless radix-tree bug
  2008-06-12 19:03 [patch] radix-tree: fix small lockless radix-tree bug Nick Piggin
@ 2008-06-12 19:15 ` Peter Zijlstra
  2008-06-12 19:31 ` Andrew Morton
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2008-06-12 19:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: akpm, linux-kernel, paulmck

On Fri, 2008-06-13 at 05:03 +1000, Nick Piggin wrote:
> Hi guys,
> 
> Although this doesn't seem like cause for alarm (as per the analysis),
> it may still be a good 2.6.26 candidate as we should have a few more
> weeks of testing left.
> 
> It should definitely go in -mm with the lockless pagecache patch.

Ouch - good one, I'll back-port it to -rt.

This reminds me, I should get back to my radix-tree path compression
stuff one day.


> When shrinking a radix-tree, we do it in a lockless manner by atomically
> switching the root pointer away from the redundant node (one that only
> has a single entry in the left most slot), and switching it over to its
> lone child.
> 
> Because a lockless lookup may have got a reference to the parent and be
> in the middle of deciding what to do with it while it is being swapped
> away for its child. For this reason, we also have to keep it around and
> in a valid state for the lookup to proceed and give a valid result, for
> at least an RCU grace period. So we need to keep the child in the left
> most slot there in case that is requested by the lookup.
> 
> This is all pretty standard RCU stuff. It is worth repeating because
> in my eagerness to obey the radix tree node constructor scheme, I had
> broken this by zeroing the radix tree node before the grace period.
> 
> Fix it by clearing those fields in the RCU callback. I would normally
> want to rip out the constructor entirely, but radix tree nodes are one
> of those places where they make sense (only few cachelines will be
> touched soon after allocation).
> 
> 
> This was never actually observed in any lockless pagecache testing or
> using the test harness, but as a rare problem testing my scalable vmap
> rewrite.
> 
> Fortunately, it is not a problem anywhere lockless pagecache is used in
> mainline kernels (pagecache probe is not a guarantee, and brd does not
> have concurrent lookups and deletes).
> 
> However, it would eventually pop up for someone using lockless pagecache :P
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

> ---
> Index: linux-2.6/lib/radix-tree.c
> ===================================================================
> --- linux-2.6.orig/lib/radix-tree.c     2008-06-13 04:26:31.000000000
> +1000
> +++ linux-2.6/lib/radix-tree.c  2008-06-13 04:31:38.000000000 +1000
> @@ -88,6 +88,57 @@ static inline gfp_t root_gfp_mask(struct
>         return root->gfp_mask & __GFP_BITS_MASK;
>  }
>  
> +static inline void tag_set(struct radix_tree_node *node, unsigned int
> tag,
> +               int offset)
> +{
> +       __set_bit(offset, node->tags[tag]);
> +}
> +
> +static inline void tag_clear(struct radix_tree_node *node, unsigned
> int tag,
> +               int offset)
> +{
> +       __clear_bit(offset, node->tags[tag]);
> +}
> +
> +static inline int tag_get(struct radix_tree_node *node, unsigned int
> tag,
> +               int offset)
> +{
> +       return test_bit(offset, node->tags[tag]);
> +}
> +
> +static inline void root_tag_set(struct radix_tree_root *root,
> unsigned int tag)
> +{
> +       root->gfp_mask |= (__force gfp_t)(1 << (tag +
> __GFP_BITS_SHIFT));
> +}
> +
> +static inline void root_tag_clear(struct radix_tree_root *root,
> unsigned int tag)
> +{
> +       root->gfp_mask &= (__force gfp_t)~(1 << (tag +
> __GFP_BITS_SHIFT));
> +}
> +
> +static inline void root_tag_clear_all(struct radix_tree_root *root)
> +{
> +       root->gfp_mask &= __GFP_BITS_MASK;
> +}
> +
> +static inline int root_tag_get(struct radix_tree_root *root, unsigned
> int tag)
> +{
> +       return (__force unsigned)root->gfp_mask & (1 << (tag +
> __GFP_BITS_SHIFT));
> +}
> +
> +/*
> + * Returns 1 if any slot in the node has this tag set.
> + * Otherwise returns 0.
> + */
> +static inline int any_tag_set(struct radix_tree_node *node, unsigned
> int tag)
> +{
> +       int idx;
> +       for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
> +               if (node->tags[tag][idx])
> +                       return 1;
> +       }
> +       return 0;
> +}
>  /*
>   * This assumes that the caller has performed appropriate
> preallocation, and
>   * that the caller has pinned this thread of control to the current
> CPU.
> @@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
>  {
>         struct radix_tree_node *node =
>                         container_of(head, struct radix_tree_node,
> rcu_head);
> +
> +       /*
> +        * must only free zeroed nodes into the slab.
> radix_tree_shrink
> +        * can leave us with a non-NULL entry in the first slot, so
> clear
> +        * that here to make sure.
> +        */
> +       tag_clear(node, 0, 0);
> +       tag_clear(node, 1, 0);
> +       node->slots[0] = NULL;
> +       node->count = 0;
> +
>         kmem_cache_free(radix_tree_node_cachep, node);
>  }
>  
> @@ -165,59 +227,6 @@ out:
>  }
>  EXPORT_SYMBOL(radix_tree_preload);
>  
> -static inline void tag_set(struct radix_tree_node *node, unsigned int
> tag,
> -               int offset)
> -{
> -       __set_bit(offset, node->tags[tag]);
> -}
> -
> -static inline void tag_clear(struct radix_tree_node *node, unsigned
> int tag,
> -               int offset)
> -{
> -       __clear_bit(offset, node->tags[tag]);
> -}
> -
> -static inline int tag_get(struct radix_tree_node *node, unsigned int
> tag,
> -               int offset)
> -{
> -       return test_bit(offset, node->tags[tag]);
> -}
> -
> -static inline void root_tag_set(struct radix_tree_root *root,
> unsigned int tag)
> -{
> -       root->gfp_mask |= (__force gfp_t)(1 << (tag +
> __GFP_BITS_SHIFT));
> -}
> -
> -
> -static inline void root_tag_clear(struct radix_tree_root *root,
> unsigned int tag)
> -{
> -       root->gfp_mask &= (__force gfp_t)~(1 << (tag +
> __GFP_BITS_SHIFT));
> -}
> -
> -static inline void root_tag_clear_all(struct radix_tree_root *root)
> -{
> -       root->gfp_mask &= __GFP_BITS_MASK;
> -}
> -
> -static inline int root_tag_get(struct radix_tree_root *root, unsigned
> int tag)
> -{
> -       return (__force unsigned)root->gfp_mask & (1 << (tag +
> __GFP_BITS_SHIFT));
> -}
> -
> -/*
> - * Returns 1 if any slot in the node has this tag set.
> - * Otherwise returns 0.
> - */
> -static inline int any_tag_set(struct radix_tree_node *node, unsigned
> int tag)
> -{
> -       int idx;
> -       for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
> -               if (node->tags[tag][idx])
> -                       return 1;
> -       }
> -       return 0;
> -}
> -
>  /*
>   *     Return the maximum key which can be store into a
>   *     radix tree with height HEIGHT.
> @@ -930,11 +939,6 @@ static inline void radix_tree_shrink(str
>                         newptr = radix_tree_ptr_to_indirect(newptr);
>                 root->rnode = newptr;
>                 root->height--;
> -               /* must only free zeroed nodes into the slab */
> -               tag_clear(to_free, 0, 0);
> -               tag_clear(to_free, 1, 0);
> -               to_free->slots[0] = NULL;
> -               to_free->count = 0;
>                 radix_tree_node_free(to_free);
>         }
>  }


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch] radix-tree: fix small lockless radix-tree bug
  2008-06-12 19:03 [patch] radix-tree: fix small lockless radix-tree bug Nick Piggin
  2008-06-12 19:15 ` Peter Zijlstra
@ 2008-06-12 19:31 ` Andrew Morton
  2008-06-12 19:38   ` Peter Zijlstra
  2008-06-12 19:34 ` Andrew Morton
  2008-06-13  7:53 ` Paul E. McKenney
  3 siblings, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2008-06-12 19:31 UTC (permalink / raw)
  To: Nick Piggin; +Cc: peterz, linux-kernel, paulmck

On Fri, 13 Jun 2008 05:03:45 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> Hi guys,
> 
> Although this doesn't seem like cause for alarm (as per the analysis),
> it may still be a good 2.6.26 candidate as we should have a few more
> weeks of testing left.
> 
> It should definitely go in -mm with the lockless pagecache patch.
> 
> When shrinking a radix-tree, we do it in a lockless manner by atomically
> switching the root pointer away from the redundant node (one that only
> has a single entry in the left most slot), and switching it over to its
> lone child.
> 
> Because a lockless lookup may have got a reference to the parent and be
> in the middle of deciding what to do with it while it is being swapped
> away for its child. For this reason, we also have to keep it around and
> in a valid state for the lookup to proceed and give a valid result, for
> at least an RCU grace period. So we need to keep the child in the left
> most slot there in case that is requested by the lookup.
> 
> This is all pretty standard RCU stuff. It is worth repeating because
> in my eagerness to obey the radix tree node constructor scheme, I had
> broken this by zeroing the radix tree node before the grace period.
> 
> Fix it by clearing those fields in the RCU callback. I would normally
> want to rip out the constructor entirely, but radix tree nodes are one
> of those places where they make sense (only few cachelines will be
> touched soon after allocation).
> 
> 
> This was never actually observed in any lockless pagecache testing or
> using the test harness, but as a rare problem testing my scalable vmap
> rewrite.
> 
> Fortunately, it is not a problem anywhere lockless pagecache is used in
> mainline kernels (pagecache probe is not a guarantee, and brd does not
> have concurrent lookups and deletes).
> 
> However, it would eventually pop up for someone using lockless pagecache :P
> 

OK, I give up.  A cannot spot what you actually changed amongst all the
code motion?


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch] radix-tree: fix small lockless radix-tree bug
  2008-06-12 19:03 [patch] radix-tree: fix small lockless radix-tree bug Nick Piggin
  2008-06-12 19:15 ` Peter Zijlstra
  2008-06-12 19:31 ` Andrew Morton
@ 2008-06-12 19:34 ` Andrew Morton
  2008-06-12 19:47   ` Nick Piggin
  2008-06-13  7:53 ` Paul E. McKenney
  3 siblings, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2008-06-12 19:34 UTC (permalink / raw)
  To: Nick Piggin; +Cc: peterz, linux-kernel, paulmck

On Fri, 13 Jun 2008 05:03:45 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> @@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
>  {
>  	struct radix_tree_node *node =
>  			container_of(head, struct radix_tree_node, rcu_head);
> +
> +	/*
> +	 * must only free zeroed nodes into the slab. radix_tree_shrink
> +	 * can leave us with a non-NULL entry in the first slot, so clear
> +	 * that here to make sure.
> +	 */
> +	tag_clear(node, 0, 0);
> +	tag_clear(node, 1, 0);
> +	node->slots[0] = NULL;
> +	node->count = 0;
> +
>  	kmem_cache_free(radix_tree_node_cachep, node);
>  }

oic, that stuff got moved from the synchronous case into the RCU callback
case.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch] radix-tree: fix small lockless radix-tree bug
  2008-06-12 19:31 ` Andrew Morton
@ 2008-06-12 19:38   ` Peter Zijlstra
  0 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2008-06-12 19:38 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, linux-kernel, paulmck

On Thu, 2008-06-12 at 12:31 -0700, Andrew Morton wrote:
> On Fri, 13 Jun 2008 05:03:45 +1000
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> > Hi guys,
> > 
> > Although this doesn't seem like cause for alarm (as per the analysis),
> > it may still be a good 2.6.26 candidate as we should have a few more
> > weeks of testing left.
> > 
> > It should definitely go in -mm with the lockless pagecache patch.
> > 
> > When shrinking a radix-tree, we do it in a lockless manner by atomically
> > switching the root pointer away from the redundant node (one that only
> > has a single entry in the left most slot), and switching it over to its
> > lone child.
> > 
> > Because a lockless lookup may have got a reference to the parent and be
> > in the middle of deciding what to do with it while it is being swapped
> > away for its child. For this reason, we also have to keep it around and
> > in a valid state for the lookup to proceed and give a valid result, for
> > at least an RCU grace period. So we need to keep the child in the left
> > most slot there in case that is requested by the lookup.
> > 
> > This is all pretty standard RCU stuff. It is worth repeating because
> > in my eagerness to obey the radix tree node constructor scheme, I had
> > broken this by zeroing the radix tree node before the grace period.
> > 
> > Fix it by clearing those fields in the RCU callback. I would normally
> > want to rip out the constructor entirely, but radix tree nodes are one
> > of those places where they make sense (only few cachelines will be
> > touched soon after allocation).
> > 
> > 
> > This was never actually observed in any lockless pagecache testing or
> > using the test harness, but as a rare problem testing my scalable vmap
> > rewrite.
> > 
> > Fortunately, it is not a problem anywhere lockless pagecache is used in
> > mainline kernels (pagecache probe is not a guarantee, and brd does not
> > have concurrent lookups and deletes).
> > 
> > However, it would eventually pop up for someone using lockless pagecache :P
> > 
> 
> OK, I give up.  A cannot spot what you actually changed amongst all the
> code motion?

The two relevant hunks:

@@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
 {
        struct radix_tree_node *node =
                        container_of(head, struct radix_tree_node, rcu_head);
+
+       /*
+        * must only free zeroed nodes into the slab. radix_tree_shrink
+        * can leave us with a non-NULL entry in the first slot, so clear
+        * that here to make sure.
+        */
+       tag_clear(node, 0, 0);
+       tag_clear(node, 1, 0);
+       node->slots[0] = NULL;
+       node->count = 0;
+
        kmem_cache_free(radix_tree_node_cachep, node);
 }
 
@@ -930,11 +939,6 @@ static inline void radix_tree_shrink(str
                        newptr = radix_tree_ptr_to_indirect(newptr);
                root->rnode = newptr;
                root->height--;
-               /* must only free zeroed nodes into the slab */
-               tag_clear(to_free, 0, 0);
-               tag_clear(to_free, 1, 0);
-               to_free->slots[0] = NULL;
-               to_free->count = 0;
                radix_tree_node_free(to_free);
        }
 }

So instead of clearing the first slot on free, we delay it one grace
period and clear it later. So that anybody already having a pointer to
the node can correctly resolve the item.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch] radix-tree: fix small lockless radix-tree bug
  2008-06-12 19:34 ` Andrew Morton
@ 2008-06-12 19:47   ` Nick Piggin
  0 siblings, 0 replies; 7+ messages in thread
From: Nick Piggin @ 2008-06-12 19:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: peterz, linux-kernel, paulmck

On Friday 13 June 2008 05:34, Andrew Morton wrote:
> On Fri, 13 Jun 2008 05:03:45 +1000
>
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > @@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
> >  {
> >  	struct radix_tree_node *node =
> >  			container_of(head, struct radix_tree_node, rcu_head);
> > +
> > +	/*
> > +	 * must only free zeroed nodes into the slab. radix_tree_shrink
> > +	 * can leave us with a non-NULL entry in the first slot, so clear
> > +	 * that here to make sure.
> > +	 */
> > +	tag_clear(node, 0, 0);
> > +	tag_clear(node, 1, 0);
> > +	node->slots[0] = NULL;
> > +	node->count = 0;
> > +
> >  	kmem_cache_free(radix_tree_node_cachep, node);
> >  }
>
> oic, that stuff got moved from the synchronous case into the RCU callback
> case.

Yeah, it was annoying because the real change moved the first references
of a couple of those up. Then we have to move all of them to keep them
in one place.

The changelog I submitted looks like utter gibberish, btw. Rewritten one
here which should be a bit better.


We shrink a radix tree when its root node has only one child, in the left
most slot. The child becomes the new root node. To perform this operation
in a manner compatible with concurrent lockless lookups, we atomically
switch the root pointer from the parent to its child.

However a concurrent lockless lookup may now have loaded a pointer to the
parent (and is presently deciding what to do next). For this reason, we
also have to keep the parent node in a valid state after shrinking the
tree, until the next RCU grace period -- otherwise this lookup with the
parent pointer may not do the right thing. Notably, we need to keep the
child in the left most slot there in case that is requested by the lookup.

This is all pretty standard RCU stuff. It is worth repeating because
in my eagerness to obey the radix tree node constructor scheme, I had
broken it by zeroing the radix tree node before the grace period.

What could happen is that a lookup can load the parent pointer, then
decide it wants to follow the left most child slot, only to find the
slot contained NULL due to the concurrent shrinker having zeroed the
parent node before waiting for a grace period. The lookup would return
a false negative as a result.

Fix it by doing that clearing in the RCU callback. I would normally want
to rip out the constructor entirely, but radix tree nodes are one of those
places where they make sense (only few cachelines will be touched soon
after allocation).


This was never actually found in any lockless pagecache testing or by
the test harness, but by seeing the odd problem with my scalable vmap
rewrite. I have not tickled the test harness into reproducing it yet,
but I'll keep working at it.

Fortunately, it is not a problem anywhere lockless pagecache is used in
mainline kernels (pagecache probe is not a guarantee, and brd does not
have concurrent lookups and deletes).

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch] radix-tree: fix small lockless radix-tree bug
  2008-06-12 19:03 [patch] radix-tree: fix small lockless radix-tree bug Nick Piggin
                   ` (2 preceding siblings ...)
  2008-06-12 19:34 ` Andrew Morton
@ 2008-06-13  7:53 ` Paul E. McKenney
  3 siblings, 0 replies; 7+ messages in thread
From: Paul E. McKenney @ 2008-06-13  7:53 UTC (permalink / raw)
  To: Nick Piggin; +Cc: akpm, Peter Zijlstra, linux-kernel

On Fri, Jun 13, 2008 at 05:03:45AM +1000, Nick Piggin wrote:
> Hi guys,
> 
> Although this doesn't seem like cause for alarm (as per the analysis),
> it may still be a good 2.6.26 candidate as we should have a few more
> weeks of testing left.
> 
> It should definitely go in -mm with the lockless pagecache patch.

Good catch!!!

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Thanks,
> Nick

> When shrinking a radix-tree, we do it in a lockless manner by atomically
> switching the root pointer away from the redundant node (one that only
> has a single entry in the left most slot), and switching it over to its
> lone child.
> 
> Because a lockless lookup may have got a reference to the parent and be
> in the middle of deciding what to do with it while it is being swapped
> away for its child. For this reason, we also have to keep it around and
> in a valid state for the lookup to proceed and give a valid result, for
> at least an RCU grace period. So we need to keep the child in the left
> most slot there in case that is requested by the lookup.
> 
> This is all pretty standard RCU stuff. It is worth repeating because
> in my eagerness to obey the radix tree node constructor scheme, I had
> broken this by zeroing the radix tree node before the grace period.
> 
> Fix it by clearing those fields in the RCU callback. I would normally
> want to rip out the constructor entirely, but radix tree nodes are one
> of those places where they make sense (only few cachelines will be
> touched soon after allocation).
> 
> 
> This was never actually observed in any lockless pagecache testing or
> using the test harness, but as a rare problem testing my scalable vmap
> rewrite.
> 
> Fortunately, it is not a problem anywhere lockless pagecache is used in
> mainline kernels (pagecache probe is not a guarantee, and brd does not
> have concurrent lookups and deletes).
> 
> However, it would eventually pop up for someone using lockless pagecache :P
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>
> ---
> Index: linux-2.6/lib/radix-tree.c
> ===================================================================
> --- linux-2.6.orig/lib/radix-tree.c	2008-06-13 04:26:31.000000000 +1000
> +++ linux-2.6/lib/radix-tree.c	2008-06-13 04:31:38.000000000 +1000
> @@ -88,6 +88,57 @@ static inline gfp_t root_gfp_mask(struct
>  	return root->gfp_mask & __GFP_BITS_MASK;
>  }
> 
> +static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
> +		int offset)
> +{
> +	__set_bit(offset, node->tags[tag]);
> +}
> +
> +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
> +		int offset)
> +{
> +	__clear_bit(offset, node->tags[tag]);
> +}
> +
> +static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
> +		int offset)
> +{
> +	return test_bit(offset, node->tags[tag]);
> +}
> +
> +static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
> +{
> +	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
> +}
> +
> +static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
> +{
> +	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
> +}
> +
> +static inline void root_tag_clear_all(struct radix_tree_root *root)
> +{
> +	root->gfp_mask &= __GFP_BITS_MASK;
> +}
> +
> +static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
> +{
> +	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
> +}
> +
> +/*
> + * Returns 1 if any slot in the node has this tag set.
> + * Otherwise returns 0.
> + */
> +static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
> +{
> +	int idx;
> +	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
> +		if (node->tags[tag][idx])
> +			return 1;
> +	}
> +	return 0;
> +}
>  /*
>   * This assumes that the caller has performed appropriate preallocation, and
>   * that the caller has pinned this thread of control to the current CPU.
> @@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(str
>  {
>  	struct radix_tree_node *node =
>  			container_of(head, struct radix_tree_node, rcu_head);
> +
> +	/*
> +	 * must only free zeroed nodes into the slab. radix_tree_shrink
> +	 * can leave us with a non-NULL entry in the first slot, so clear
> +	 * that here to make sure.
> +	 */
> +	tag_clear(node, 0, 0);
> +	tag_clear(node, 1, 0);
> +	node->slots[0] = NULL;
> +	node->count = 0;
> +
>  	kmem_cache_free(radix_tree_node_cachep, node);
>  }
> 
> @@ -165,59 +227,6 @@ out:
>  }
>  EXPORT_SYMBOL(radix_tree_preload);
> 
> -static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
> -		int offset)
> -{
> -	__set_bit(offset, node->tags[tag]);
> -}
> -
> -static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
> -		int offset)
> -{
> -	__clear_bit(offset, node->tags[tag]);
> -}
> -
> -static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
> -		int offset)
> -{
> -	return test_bit(offset, node->tags[tag]);
> -}
> -
> -static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
> -{
> -	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
> -}
> -
> -
> -static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
> -{
> -	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
> -}
> -
> -static inline void root_tag_clear_all(struct radix_tree_root *root)
> -{
> -	root->gfp_mask &= __GFP_BITS_MASK;
> -}
> -
> -static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
> -{
> -	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
> -}
> -
> -/*
> - * Returns 1 if any slot in the node has this tag set.
> - * Otherwise returns 0.
> - */
> -static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
> -{
> -	int idx;
> -	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
> -		if (node->tags[tag][idx])
> -			return 1;
> -	}
> -	return 0;
> -}
> -
>  /*
>   *	Return the maximum key which can be store into a
>   *	radix tree with height HEIGHT.
> @@ -930,11 +939,6 @@ static inline void radix_tree_shrink(str
>  			newptr = radix_tree_ptr_to_indirect(newptr);
>  		root->rnode = newptr;
>  		root->height--;
> -		/* must only free zeroed nodes into the slab */
> -		tag_clear(to_free, 0, 0);
> -		tag_clear(to_free, 1, 0);
> -		to_free->slots[0] = NULL;
> -		to_free->count = 0;
>  		radix_tree_node_free(to_free);
>  	}
>  }


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2008-06-13  8:01 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-06-12 19:03 [patch] radix-tree: fix small lockless radix-tree bug Nick Piggin
2008-06-12 19:15 ` Peter Zijlstra
2008-06-12 19:31 ` Andrew Morton
2008-06-12 19:38   ` Peter Zijlstra
2008-06-12 19:34 ` Andrew Morton
2008-06-12 19:47   ` Nick Piggin
2008-06-13  7:53 ` Paul E. McKenney

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.