* [patch] mm: vmap area cache
From: Nick Piggin @ 2010-05-31 8:07 UTC
To: Steven Whitehouse, Minchan Kim, Andrew Morton, linux-mm
Hi Andrew,
Could you put this in your tree? It could do with a bit more testing. I
will follow up with any updates or results from Steven.
Thanks,
Nick
--
Provide a free area cache for the vmalloc virtual address allocator, based
on the approach taken in the user virtual memory allocator.
This reduces the number of rbtree operations and linear traversals over
the vmap extents to find a free area. The lazy vmap flushing makes this
problem worse because freed but not yet flushed vmaps tend to build up in
the address space between flushes.
Steven noticed a performance problem with GFS2. Results are as follows...
mm/vmalloc.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 78 insertions(+), 22 deletions(-)
Index: linux-2.6/mm/vmalloc.c
===================================================================
--- linux-2.6.orig/mm/vmalloc.c
+++ linux-2.6/mm/vmalloc.c
@@ -262,8 +262,14 @@ struct vmap_area {
};
static DEFINE_SPINLOCK(vmap_area_lock);
-static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+
+static struct rb_node *free_vmap_cache;
+static unsigned long cached_hole_size;
+static unsigned long cached_start;
+static unsigned long cached_align;
+
static unsigned long vmap_area_pcpu_hole;
static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -332,9 +338,11 @@ static struct vmap_area *alloc_vmap_area
struct rb_node *n;
unsigned long addr;
int purged = 0;
+ struct vmap_area *first;
BUG_ON(!size);
BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(!is_power_of_2(align));
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
@@ -342,17 +350,39 @@ static struct vmap_area *alloc_vmap_area
return ERR_PTR(-ENOMEM);
retry:
- addr = ALIGN(vstart, align);
-
spin_lock(&vmap_area_lock);
- if (addr + size - 1 < addr)
- goto overflow;
+ /* invalidate cache if we have more permissive parameters */
+ if (!free_vmap_cache ||
+ size <= cached_hole_size ||
+ vstart < cached_start ||
+ align < cached_align) {
+nocache:
+ cached_hole_size = 0;
+ free_vmap_cache = NULL;
+ }
+ /* record if we encounter less permissive parameters */
+ cached_start = vstart;
+ cached_align = align;
+
+ /* find starting point for our search */
+ if (free_vmap_cache) {
+ first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ if (addr < vstart)
+ goto nocache;
+ if (addr + size - 1 < addr)
+ goto overflow;
+
+ } else {
+ addr = ALIGN(vstart, align);
+ if (addr + size - 1 < addr)
+ goto overflow;
- /* XXX: could have a last_hole cache */
- n = vmap_area_root.rb_node;
- if (n) {
- struct vmap_area *first = NULL;
+ n = vmap_area_root.rb_node;
+ if (!n)
+ goto found;
+ first = NULL;
do {
struct vmap_area *tmp;
tmp = rb_entry(n, struct vmap_area, rb_node);
@@ -369,26 +399,36 @@ retry:
if (!first)
goto found;
- if (first->va_end < addr) {
- n = rb_next(&first->rb_node);
- if (n)
- first = rb_entry(n, struct vmap_area, rb_node);
- else
- goto found;
- }
-
- while (addr + size > first->va_start && addr + size <= vend) {
- addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ if (first->va_start < addr) {
+ addr = ALIGN(max(first->va_end + PAGE_SIZE, addr), align);
if (addr + size - 1 < addr)
goto overflow;
-
n = rb_next(&first->rb_node);
if (n)
first = rb_entry(n, struct vmap_area, rb_node);
else
goto found;
}
+ BUG_ON(first->va_start < addr);
+ if (addr + cached_hole_size < first->va_start)
+ cached_hole_size = first->va_start - addr;
}
+
+ /* from the starting point, walk areas until a suitable hole is found */
+ while (addr + size > first->va_start && addr + size <= vend) {
+ if (addr + cached_hole_size < first->va_start)
+ cached_hole_size = first->va_start - addr;
+ addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ if (addr + size - 1 < addr)
+ goto overflow;
+
+ n = rb_next(&first->rb_node);
+ if (n)
+ first = rb_entry(n, struct vmap_area, rb_node);
+ else
+ goto found;
+ }
+
found:
if (addr + size > vend) {
overflow:
@@ -406,14 +446,17 @@ overflow:
return ERR_PTR(-EBUSY);
}
- BUG_ON(addr & (align-1));
-
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
__insert_vmap_area(va);
+ free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
+ BUG_ON(va->va_start & (align-1));
+ BUG_ON(va->va_start < vstart);
+ BUG_ON(va->va_end > vend);
+
return va;
}
@@ -427,6 +470,19 @@ static void rcu_free_va(struct rcu_head
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+
+ if (free_vmap_cache) {
+ if (va->va_end < cached_start) {
+ free_vmap_cache = NULL;
+ } else {
+ struct vmap_area *cache;
+ cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ if (va->va_start <= cache->va_start) {
+ free_vmap_cache = rb_prev(&va->rb_node);
+ cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ }
+ }
+ }
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
* Re: [patch] mm: vmap area cache
From: Minchan Kim @ 2010-05-31 13:21 UTC
To: Nick Piggin; +Cc: Steven Whitehouse, Andrew Morton, linux-mm
On Mon, May 31, 2010 at 06:07:57PM +1000, Nick Piggin wrote:
> Hi Andrew,
>
> Could you put this in your tree? It could do with a bit more testing. I
> will follow up with any updates or results from Steven.
>
> Thanks,
> Nick
> --
>
> Provide a free area cache for the vmalloc virtual address allocator, based
> on the approach taken in the user virtual memory allocator.
>
> This reduces the number of rbtree operations and linear traversals over
> the vmap extents to find a free area. The lazy vmap flushing makes this
> problem worse because freed but not yet flushed vmaps tend to build up in
> the address space between flushes.
>
> Steven noticed a performance problem with GFS2. Results are as follows...
>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
* Re: [patch] mm: vmap area cache
From: Andrew Morton @ 2010-06-02 21:49 UTC
To: Nick Piggin; +Cc: Steven Whitehouse, Minchan Kim, linux-mm
On Mon, 31 May 2010 18:07:57 +1000
Nick Piggin <npiggin@suse.de> wrote:
> Hi Andrew,
>
> Could you put this in your tree? It could do with a bit more testing. I
> will follow up with any updates or results from Steven.
>
> Thanks,
> Nick
> --
>
> Provide a free area cache for the vmalloc virtual address allocator, based
> on the approach taken in the user virtual memory allocator.
>
> This reduces the number of rbtree operations and linear traversals over
> the vmap extents to find a free area. The lazy vmap flushing makes this
> problem worse because freed but not yet flushed vmaps tend to build up in
> the address space between flushes.
>
> Steven noticed a performance problem with GFS2. Results are as follows...
>
>
>
changelog got truncated - the "results" and the signoff are missing.
> --- linux-2.6.orig/mm/vmalloc.c
> +++ linux-2.6/mm/vmalloc.c
> @@ -262,8 +262,14 @@ struct vmap_area {
> };
>
> static DEFINE_SPINLOCK(vmap_area_lock);
> -static struct rb_root vmap_area_root = RB_ROOT;
> static LIST_HEAD(vmap_area_list);
> +static struct rb_root vmap_area_root = RB_ROOT;
> +
> +static struct rb_node *free_vmap_cache;
> +static unsigned long cached_hole_size;
> +static unsigned long cached_start;
> +static unsigned long cached_align;
> +
> static unsigned long vmap_area_pcpu_hole;
>
> static struct vmap_area *__find_vmap_area(unsigned long addr)
> @@ -332,9 +338,11 @@ static struct vmap_area *alloc_vmap_area
> struct rb_node *n;
> unsigned long addr;
> int purged = 0;
> + struct vmap_area *first;
>
> BUG_ON(!size);
> BUG_ON(size & ~PAGE_MASK);
> + BUG_ON(!is_power_of_2(align));
Worried. How do we know this won't trigger?
> va = kmalloc_node(sizeof(struct vmap_area),
> gfp_mask & GFP_RECLAIM_MASK, node);
> @@ -342,17 +350,39 @@ static struct vmap_area *alloc_vmap_area
> return ERR_PTR(-ENOMEM);
>
> retry:
> - addr = ALIGN(vstart, align);
> -
> spin_lock(&vmap_area_lock);
> - if (addr + size - 1 < addr)
> - goto overflow;
> + /* invalidate cache if we have more permissive parameters */
> + if (!free_vmap_cache ||
> + size <= cached_hole_size ||
> + vstart < cached_start ||
> + align < cached_align) {
> +nocache:
> + cached_hole_size = 0;
> + free_vmap_cache = NULL;
> + }
> + /* record if we encounter less permissive parameters */
> + cached_start = vstart;
> + cached_align = align;
> +
> + /* find starting point for our search */
> + if (free_vmap_cache) {
> + first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
> + addr = ALIGN(first->va_end + PAGE_SIZE, align);
> + if (addr < vstart)
> + goto nocache;
> + if (addr + size - 1 < addr)
> + goto overflow;
Some comments attached to the `if' tests would make it easier to
understand what's going on.
> +
> + } else {
> + addr = ALIGN(vstart, align);
> + if (addr + size - 1 < addr)
> + goto overflow;
> - /* XXX: could have a last_hole cache */
> - n = vmap_area_root.rb_node;
> - if (n) {
> - struct vmap_area *first = NULL;
> + n = vmap_area_root.rb_node;
> + if (!n)
> + goto found;
>
> + first = NULL;
> do {
> struct vmap_area *tmp;
> tmp = rb_entry(n, struct vmap_area, rb_node);
this?
--- a/mm/vmalloc.c~mm-vmap-area-cache-fix
+++ a/mm/vmalloc.c
@@ -265,6 +265,7 @@ static DEFINE_SPINLOCK(vmap_area_lock);
static LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
+/* The vmap cache globals are protected by vmap_area_lock */
static struct rb_node *free_vmap_cache;
static unsigned long cached_hole_size;
static unsigned long cached_start;
_
* Re: [patch] mm: vmap area cache
From: Nick Piggin @ 2010-06-03 13:55 UTC
To: Andrew Morton; +Cc: Steven Whitehouse, Minchan Kim, linux-mm
On Wed, Jun 02, 2010 at 02:49:05PM -0700, Andrew Morton wrote:
> On Mon, 31 May 2010 18:07:57 +1000
> Nick Piggin <npiggin@suse.de> wrote:
>
> > Hi Andrew,
> >
> > Could you put this in your tree? It could do with a bit more testing. I
> > will follow up with any updates or results from Steven.
> >
> > Thanks,
> > Nick
> > --
> >
> > Provide a free area cache for the vmalloc virtual address allocator, based
> > on the approach taken in the user virtual memory allocator.
> >
> > This reduces the number of rbtree operations and linear traversals over
> > the vmap extents to find a free area. The lazy vmap flushing makes this
> > problem worse because freed but not yet flushed vmaps tend to build up in
> > the address space between flushes.
> >
> > Steven noticed a performance problem with GFS2. Results are as follows...
> >
> >
> >
>
> changelog got truncated - the "results" and the signoff are missing.
Yes, I was going to add them when Steven gets a chance to retest his
performance case. Indications from earlier iterations are that it should
solve the regression.
I had hoped to just get some wider testing in -mm before getting the
results and asking to push it upstream.
> > --- linux-2.6.orig/mm/vmalloc.c
> > +++ linux-2.6/mm/vmalloc.c
> > @@ -262,8 +262,14 @@ struct vmap_area {
> > };
> >
> > static DEFINE_SPINLOCK(vmap_area_lock);
> > -static struct rb_root vmap_area_root = RB_ROOT;
> > static LIST_HEAD(vmap_area_list);
> > +static struct rb_root vmap_area_root = RB_ROOT;
> > +
> > +static struct rb_node *free_vmap_cache;
> > +static unsigned long cached_hole_size;
> > +static unsigned long cached_start;
> > +static unsigned long cached_align;
> > +
> > static unsigned long vmap_area_pcpu_hole;
> >
> > static struct vmap_area *__find_vmap_area(unsigned long addr)
> > @@ -332,9 +338,11 @@ static struct vmap_area *alloc_vmap_area
> > struct rb_node *n;
> > unsigned long addr;
> > int purged = 0;
> > + struct vmap_area *first;
> >
> > BUG_ON(!size);
> > BUG_ON(size & ~PAGE_MASK);
> > + BUG_ON(!is_power_of_2(align));
>
> Worried. How do we know this won't trigger?
I'd be very sure that nobody relies on it unless the caller is buggy
anyway. I mean, such alignment hardly means anything because the
caller doesn't know what address it will get.
I just put it in there because in my vmalloc user test harness, non
power of 2 alignments were the only case I encountered where allocator
behaviour was different before/after this patch. I don't think it is
a valid test case but I just wanted to be satisfied we don't have
weird callers.
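[Editor's aside: the reason a power-of-two alignment matters is that the
kernel's ALIGN() macro rounds up with mask arithmetic, which is only
correct when the alignment is a power of two -- precisely what the new
BUG_ON() asserts. A minimal user-space sketch of the failure mode
(illustrative only, not part of the patch):

    #include <assert.h>

    /* same shape as the kernel's ALIGN(x, a) */
    static unsigned long align_up(unsigned long x, unsigned long a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    /* mirrors the kernel's is_power_of_2() */
    static int is_pow2(unsigned long n)
    {
            return n != 0 && (n & (n - 1)) == 0;
    }

    int main(void)
    {
            assert(is_pow2(8) && !is_pow2(6));
            assert(align_up(13, 8) == 16); /* power of two: correct round-up */
            assert(align_up(7, 6) == 8);   /* 6 not a power of two: 8 is not a multiple of 6 */
            return 0;
    }

So with a non-power-of-two align, ALIGN() can silently return an address
that is not a multiple of the requested alignment; the BUG_ON just makes
such callers visible.]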
> > va = kmalloc_node(sizeof(struct vmap_area),
> > gfp_mask & GFP_RECLAIM_MASK, node);
> > @@ -342,17 +350,39 @@ static struct vmap_area *alloc_vmap_area
> > return ERR_PTR(-ENOMEM);
> >
> > retry:
> > - addr = ALIGN(vstart, align);
> > -
> > spin_lock(&vmap_area_lock);
> > - if (addr + size - 1 < addr)
> > - goto overflow;
> > + /* invalidate cache if we have more permissive parameters */
> > + if (!free_vmap_cache ||
> > + size <= cached_hole_size ||
> > + vstart < cached_start ||
> > + align < cached_align) {
> > +nocache:
> > + cached_hole_size = 0;
> > + free_vmap_cache = NULL;
> > + }
> > + /* record if we encounter less permissive parameters */
> > + cached_start = vstart;
> > + cached_align = align;
> > +
> > + /* find starting point for our search */
> > + if (free_vmap_cache) {
> > + first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
> > + addr = ALIGN(first->va_end + PAGE_SIZE, align);
> > + if (addr < vstart)
> > + goto nocache;
> > + if (addr + size - 1 < addr)
> > + goto overflow;
>
> Some comments attached to the `if' tests would make it easier to
> understand what's going on.
OK, I'll try to come up with something.
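[Editor's aside: the sort of annotation being asked for might look like
this -- hypothetical wording, not what was eventually committed:

    /* find starting point for our search */
    if (free_vmap_cache) {
            first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
            addr = ALIGN(first->va_end + PAGE_SIZE, align);
            if (addr < vstart)          /* cache sits below the requested
                                           range; fall back to full search */
                    goto nocache;
            if (addr + size - 1 < addr) /* addr + size wrapped around the
                                           end of the address space */
                    goto overflow;
    ...
]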
>
> > +
> > + } else {
> > + addr = ALIGN(vstart, align);
> > + if (addr + size - 1 < addr)
> > + goto overflow;
> > - /* XXX: could have a last_hole cache */
> > - n = vmap_area_root.rb_node;
> > - if (n) {
> > - struct vmap_area *first = NULL;
> > + n = vmap_area_root.rb_node;
> > + if (!n)
> > + goto found;
> >
> > + first = NULL;
> > do {
> > struct vmap_area *tmp;
> > tmp = rb_entry(n, struct vmap_area, rb_node);
>
> this?
Yes, thanks.
> --- a/mm/vmalloc.c~mm-vmap-area-cache-fix
> +++ a/mm/vmalloc.c
> @@ -265,6 +265,7 @@ static DEFINE_SPINLOCK(vmap_area_lock);
> static LIST_HEAD(vmap_area_list);
> static struct rb_root vmap_area_root = RB_ROOT;
>
> +/* The vmap cache globals are protected by vmap_area_lock */
> static struct rb_node *free_vmap_cache;
> static unsigned long cached_hole_size;
> static unsigned long cached_start;
> _
* Re: [patch] mm: vmap area cache
From: Steven Whitehouse @ 2010-06-25 13:00 UTC
To: Nick Piggin; +Cc: Andrew Morton, Minchan Kim, linux-mm
Hi,
Barry Marson has now tested your patch and it seems to work just fine.
Sorry for the delay,
Steve.
* Re: [patch] mm: vmap area cache
From: Nick Piggin @ 2010-06-26 8:31 UTC
To: Steven Whitehouse; +Cc: Andrew Morton, Minchan Kim, linux-mm
On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> Hi,
>
> Barry Marson has now tested your patch and it seems to work just fine.
> Sorry for the delay,
>
> Steve.
Hi Steve,
Thanks for that, do you mean that it has solved the regression?
Thanks,
Nick
* Re: [patch] mm: vmap area cache
From: Steven Whitehouse @ 2010-06-28 8:37 UTC
To: Nick Piggin; +Cc: Andrew Morton, Minchan Kim, linux-mm
Hi,
On Sat, 2010-06-26 at 18:31 +1000, Nick Piggin wrote:
> On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > Hi,
> >
> > Barry Marson has now tested your patch and it seems to work just fine.
> > Sorry for the delay,
> >
> > Steve.
>
> Hi Steve,
>
> Thanks for that, do you mean that it has solved the regression?
>
> Thanks,
> Nick
>
Yes, that's what I have heard from Barry. He said that it was pretty
close to the expected performance but did not give any figures. The fact
that his test actually completes shows that most of the problem has been
solved,
Steve.
* Re: [patch] mm: vmap area cache
From: Nick Piggin @ 2010-06-28 8:45 UTC
To: Steven Whitehouse; +Cc: Andrew Morton, Minchan Kim, linux-mm
On Mon, Jun 28, 2010 at 09:37:42AM +0100, Steven Whitehouse wrote:
> Hi,
>
> On Sat, 2010-06-26 at 18:31 +1000, Nick Piggin wrote:
> > On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > > Hi,
> > >
> > > Barry Marson has now tested your patch and it seems to work just fine.
> > > Sorry for the delay,
> > >
> > > Steve.
> >
> > Hi Steve,
> >
> > Thanks for that, do you mean that it has solved the regression?
> >
> > Thanks,
> > Nick
> >
>
> > Yes, that's what I have heard from Barry. He said that it was pretty
> close to the expected performance but did not give any figures. The fact
> that his test actually completes shows that most of the problem has been
> solved,
Thanks, so I think it's good to go.
It is interesting that it is just "close" to expected performance.
The lazy vunmap patch is preventing a global IPI and TLB flush on
all CPUs for every vfree() (amortizing it down to basically nothing
if you are doing just small vmaps).
Unless you are testing on a UP machine, I would have expected
improved performance if you are doing a lot of vmalloc/vfree activity.
It could be that the search is still taking some time, and is
outweighing the gains.
We have a few options for being cleverer, so if you're ever interested
to get more detailed results and find a bit more performance here,
let me know.
And thanks for all the reporting and testing so far.
* Re: [patch] mm: vmap area cache
From: Steven Whitehouse @ 2010-06-28 9:05 UTC
To: Nick Piggin; +Cc: Andrew Morton, Minchan Kim, linux-mm
Hi,
On Mon, 2010-06-28 at 18:45 +1000, Nick Piggin wrote:
> On Mon, Jun 28, 2010 at 09:37:42AM +0100, Steven Whitehouse wrote:
> > Hi,
> >
> > On Sat, 2010-06-26 at 18:31 +1000, Nick Piggin wrote:
> > > On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > > > Hi,
> > > >
> > > > Barry Marson has now tested your patch and it seems to work just fine.
> > > > Sorry for the delay,
> > > >
> > > > Steve.
> > >
> > > Hi Steve,
> > >
> > > Thanks for that, do you mean that it has solved the regression?
> > >
> > > Thanks,
> > > Nick
> > >
> >
> > Yes, that's what I have heard from Barry. He said that it was pretty
> > close to the expected performance but did not give any figures. The fact
> > that his test actually completes shows that most of the problem has been
> > solved,
>
> Thanks, so I think it's good to go.
>
> It is interesting that it is just "close" to expected performance.
> The lazy vunmap patch is preventing a global IPI and TLB flush on
> all CPUs for every vfree() (amortizing it down to basically nothing
> if you are doing just small vmaps).
>
> Unless you are testing on a UP machine, I would have expected
> improved performance if you are doing a lot of vmalloc/vfree activity.
> It could be that the search is still taking some time, and is
> outweighing the gains.
>
I gather that the machine has 8 cores over 4 cpus. On the other hand,
there is only one vmalloc call per gfs2 readdir and it might be that
beyond solving the initial regression, the performance of vmalloc is
masked by something else in the readdir path.
> We have a few options for being cleverer, so if you're ever interested
> to get more detailed results and find a bit more performance here,
> let me know.
>
> And thanks for all the reporting and testing so far.
>
Ok. We'll see what we can do. The bottleneck here is the particular test
set up we are using which is in demand from several directions and
cannot easily be duplicated. If I can get more detailed results I'll let
you know,
Steve.
* Re: [patch] mm: vmap area cache
From: Andrew Morton @ 2010-06-30 23:26 UTC
To: Nick Piggin; +Cc: Steven Whitehouse, Minchan Kim, linux-mm
On Sat, 26 Jun 2010 18:31:22 +1000
Nick Piggin <npiggin@suse.de> wrote:
> On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > Hi,
> >
> > Barry Marson has now tested your patch and it seems to work just fine.
> > Sorry for the delay,
> >
> > Steve.
>
> Hi Steve,
>
> Thanks for that, do you mean that it has solved the regression?
Nick, can we please have an updated changelog for this patch? I didn't
even know it fixed a regression (what regression?). Barry's tested-by:
would be nice too, along with any quantitative results from that.
Thanks.
* Re: [patch] mm: vmap area cache
From: Nick Piggin @ 2010-07-01 7:50 UTC
To: Andrew Morton
Cc: Steven Whitehouse, Minchan Kim, linux-mm, Avi Kivity, stable
On Wed, Jun 30, 2010 at 04:26:02PM -0700, Andrew Morton wrote:
> On Sat, 26 Jun 2010 18:31:22 +1000
> Nick Piggin <npiggin@suse.de> wrote:
>
> > On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > > Hi,
> > >
> > > Barry Marson has now tested your patch and it seems to work just fine.
> > > Sorry for the delay,
> > >
> > > Steve.
> >
> > Hi Steve,
> >
> > Thanks for that, do you mean that it has solved the regression?
>
> Nick, can we please have an updated changelog for this patch? I didn't
> even know it fixed a regression (what regression?). Barry's tested-by:
> would be nice too, along with any quantitative results from that.
>
> Thanks.
Sure. It is a performance regression caused by the lazy vunmap patches
which went in a while back. So it's appropriate for 2.6.36, and then
probably distros will want to backport it, if not -stable.
How's this?
--
mm: vmalloc add a free area cache for vmaps
Provide a free area cache for the vmalloc virtual address allocator,
based on the algorithm used by the user virtual memory allocator.
This reduces the number of rbtree operations and linear traversals over
the vmap extents in order to find a free area, by starting off at the
last point that a free area was found.
The free area cache is reset if areas are freed behind it, or if we are
searching for a smaller area or alignment than last time. So allocation
patterns are not changed (verified by corner-case and random test cases
in userspace testing).
This solves a regression caused by the lazy vunmap TLB purging introduced
in db64fe02 (mm: rewrite vmap layer). That patch leaves extents in the
vmap allocator after they are vunmapped, until a significant number
accumulate that can be flushed in a single batch. So in a workload that
vmallocs and vfrees frequently, a chain of extents builds up from the
VMALLOC_START address, which has to be iterated over each time (giving
O(n) behaviour).
After this patch, the search will start from where it left off, giving
closer to an amortized O(1).
This is verified to solve regressions reported by Steven in GFS2, and by
Avi in KVM.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reported-and-tested-by: Steven Whitehouse <swhiteho@redhat.com>
Reported-and-tested-by: Avi Kivity <avi@redhat.com>
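[Editor's aside: a simplified user-space model of the cached search; the
names and the plain linked list are illustrative only, the real code walks
the rbtree/list in alloc_vmap_area():

    /* allocated extents, sorted by address, covering [start, end) */
    struct extent {
            unsigned long start, end;
            struct extent *next;
    };

    /* where the previous search left off (free_vmap_cache in the patch) */
    static struct extent *free_cache;

    /*
     * Find the start of a hole of at least 'size' bytes. Without the
     * cache every call walks from 'head'; with it, back-to-back
     * allocations resume where the previous one finished. Alignment,
     * range limits, the hole below the first extent and the
     * cached_hole_size optimization are all omitted for brevity.
     */
    static unsigned long find_hole(struct extent *head, unsigned long size)
    {
            struct extent *e = free_cache ? free_cache : head;

            if (!e)
                    return 0;       /* empty list: everything is free */

            while (e->next && e->next->start - e->end < size)
                    e = e->next;    /* hole between e and e->next too small */

            free_cache = e;         /* the next search resumes here */
            return e->end;          /* fitting hole, or space past the tail */
    }

As the changelog says, the cache is only a starting-point hint: it must be
reset whenever areas are freed behind it, or when a request is more
permissive than the last one (a size that could fit in an already-skipped
hole, a lower start, or a smaller alignment), which is what the
invalidation logic in alloc_vmap_area() and __free_vmap_area() does.]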
* Re: [patch] mm: vmap area cache
From: Steven Whitehouse @ 2010-07-01 8:49 UTC
To: Andrew Morton; +Cc: Nick Piggin, Minchan Kim, linux-mm, Barry J. Marson
Hi,
On Wed, 2010-06-30 at 16:26 -0700, Andrew Morton wrote:
> On Sat, 26 Jun 2010 18:31:22 +1000
> Nick Piggin <npiggin@suse.de> wrote:
>
> > On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > > Hi,
> > >
> > > Barry Marson has now tested your patch and it seems to work just fine.
> > > Sorry for the delay,
> > >
> > > Steve.
> >
> > Hi Steve,
> >
> > Thanks for that, do you mean that it has solved the regression?
>
> Nick, can we please have an updated changelog for this patch? I didn't
> even know it fixed a regression (what regression?). Barry's tested-by:
> would be nice too, along with any quantitative results from that.
>
> Thanks.
Barry is running a benchmark test against GFS2 which simulates NFS
activity on the filesystem. Without this patch, the GFS2 ->readdir()
function (the only part of GFS2 which uses vmalloc) runs so slowly that
the test does not complete. With the patch, the test runs at the same
speed as it did on earlier kernels.
I don't have an exact pointer to when the regression was introduced, but
it was after RHEL5 branched.
I've cc'd Barry so that he can add his Tested-By: if he is happy to do
so,
Steve.
* Re: [patch] mm: vmap area cache
From: Nick Piggin @ 2010-07-01 9:02 UTC
To: Steven Whitehouse
Cc: Andrew Morton, Minchan Kim, linux-mm, Barry J. Marson, avi
On Thu, Jul 01, 2010 at 09:49:14AM +0100, Steven Whitehouse wrote:
> Hi,
>
> On Wed, 2010-06-30 at 16:26 -0700, Andrew Morton wrote:
> > On Sat, 26 Jun 2010 18:31:22 +1000
> > Nick Piggin <npiggin@suse.de> wrote:
> >
> > > On Fri, Jun 25, 2010 at 02:00:17PM +0100, Steven Whitehouse wrote:
> > > > Hi,
> > > >
> > > > Barry Marson has now tested your patch and it seems to work just fine.
> > > > Sorry for the delay,
> > > >
> > > > Steve.
> > >
> > > Hi Steve,
> > >
> > > Thanks for that, do you mean that it has solved the regression?
> >
> > Nick, can we please have an updated changelog for this patch? I didn't
> > even know it fixed a regression (what regression?). Barry's tested-by:
> > would be nice too, along with any quantitative results from that.
> >
> > Thanks.
>
> Barry is running a benchmark test against GFS2 which simulates NFS
> activity on the filesystem. Without this patch, the GFS2 ->readdir()
> function (the only part of GFS2 which uses vmalloc) runs so slowly that
> the test does not complete. With the patch, the test runs the same speed
> as it did on earlier kernels.
It would have been due to the commit I referenced.
> I don't have an exact pointer to when the regression was introduced, but
> it was after RHEL5 branched.
OK the patch should be pretty fine to go into RHEL5, I'd think.
Interesting that it slowed down so much for you. I would say this is due
to a few differences between your testing and mine.
Firstly, I was using a 64 CPU machine, and hammering vmap flushing on
all CPUs. TLB broadcasting and flushing cost is going to be much much
higher because there is an O(n^2) effect (N CPUs worth of work, each
unit of work requires TLB flush to N CPUs). Interconnect cost would be
much higher too compared to your 2s8c machine. So the cost of searching
vmaps would be more hidden by the gains from avoiding flushing.
Secondly, you were testing with probably 4K vmallocs, whereas I was using
64K vmallocs on a 16KB page size machine with XFS. So in your testing
there would be significantly more vmaps building up, by a factor of 10.
Your workload is similar to Avi's as well.
So in summary, I should have paid attention to the search complexity
aspect and designed cases specifically to test that aspect. Oh well...
thanks for the reporting and testing :)
* Re: [patch] mm: vmap area cache
From: Steven Whitehouse @ 2010-07-14 8:55 UTC
To: Nick Piggin; +Cc: Andrew Morton, Minchan Kim, linux-mm, Barry J. Marson, avi
Hi,
During testing of the most recent version of this patch we've hit a kernel bug
in alloc_vmap_area():
        }
412         BUG_ON(first->va_start < addr);     <- Bug triggers here
413         if (addr + cached_hole_size < first->va_start)
414                 cached_hole_size = first->va_start - addr;
415     }
This appears to be caused by a call from:
[<c050f870>] ? kmem_cache_alloc_notrace+0xa0/0xb0
[<c0503695>] ? __get_vm_area_node+0xc5/0x1c0
[<c0458625>] ? walk_system_ram_range+0xa5/0x1c0
[<c050383e>] ? get_vm_area_caller+0x4e/0x60
[<f827328c>] ? intel_opregion_init+0xbc/0x510 [i915]
[<c0430ce0>] ? __ioremap_caller+0x2b0/0x420
[<f827328c>] ? intel_opregion_init+0xbc/0x510 [i915]
[<c0430f98>] ? ioremap_nocache+0x18/0x20
[<f827328c>] ? intel_opregion_init+0xbc/0x510 [i915]
[<f827328c>] ? intel_opregion_init+0xbc/0x510 [i915]
[<c04cdc7c>] ? delayed_slow_work_enqueue+0xcc/0xf0
[<f80f7c20>] ? drm_kms_helper_poll_init+0xc0/0x130 [drm_kms_helper]
[<f8248ec7>] ? i915_driver_load+0x757/0x10c0 [i915]
[<f82471b0>] ? i915_vga_set_decode+0x0/0x20 [i915]
[<f81c5675>] ? drm_sysfs_device_add+0x75/0xa0 [drm]
[<f81c33d7>] ? drm_get_dev+0x2b7/0x4c0 [drm]
[<c05f6986>] ? pci_match_device+0x16/0xc0
[<c05f68ab>] ? local_pci_probe+0xb/0x10
[<c05f76b1>] ? pci_device_probe+0x61/0x80
[<c06a34f7>] ? driver_probe_device+0x87/0x290
etc.
So I guess that there might be something odd about that ioremap. It
triggers on every boot of the machine in question. It looks to me as if
perhaps it's found a cached entry which doesn't fit in some way, but I'm
not certain of that yet,
Steve.