SLUB tbench regression due to page allocator deficiency

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* SLUB tbench regression due to page allocator deficiency
@ 2008-02-09 21:45 Christoph Lameter
  2008-02-09 22:35 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-09 21:45 UTC (permalink / raw)
  To: Mel Gorman; +Cc: linux-mm, akpm, Nick Piggin, Pekka J Enberg

I have been chasing the tbench regression (1-4%) for two weeks now and 
even after I added statistics I could only verify that behavior was just 
optimal.

None of the tricks that I threw at the problem changed anything until I 
realized that the tbench load depends heavily on 4k allocations that SLUB 
hands off to the page allocator (SLAB handles 4k itself). I extended the 
kmalloc array to 4k and I got:

christoph@stapp:~$ slabinfo -AD
Name                   Objects    Alloc     Free   %Fast
:0004096                   180 665259550 665259415  99  99
skbuff_fclone_cache         46 665196592 665196592  99  99
:0000192                  2575 31232665 31230129  99  99
:0001024                   854 31204838 31204006  99  99
vm_area_struct            1093   108941   107954  91  17
dentry                    7738    26248    18544  92  43
:0000064                  2179    19208    17287  97  73

So the kmalloc-4096 is heavily used. If I give the 4k objects a reasonable 
allocation size in slub (PAGE_ALLOC_COSTLY_ORDER) then the fastpath of 
SLUB becomes effective for 4k allocs and then SLUB is faster than SLAB 
here.

Performance on tbench (Dual Quad 8p 8G):

SLAB		2223.32 MB/sec
SLUB unmodified	2144.36 MB/sec
SLUB+patch	2245.56 MB/sec (stats still active so this isnt optimal yet)

4k allocations cannot optimally be handled by SLUB if we are restricted to 
order 0 allocs because the fastpath only handles fractions of one 
allocation unit and if the allocation unit is 4k then we only have one 
object per slab.

Isnt there a way that we can make the page allocator handle PAGE_SIZEd 
allocations in such a way that is competitive with the slab allocators? 
The cycle count for an allocation needs to be <100 not just below 1000 as 
it is now.

---
 include/linux/slub_def.h |    6 +++---
 mm/slub.c                |   25 +++++++++++++++++--------
 2 files changed, 20 insertions(+), 11 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2008-02-09 13:04:48.464203968 -0800
+++ linux-2.6/include/linux/slub_def.h	2008-02-09 13:08:37.413120259 -0800
@@ -110,7 +110,7 @@ struct kmem_cache {
  * We keep the general caches in an array of slab caches that are used for
  * 2^x bytes of allocations.
  */
-extern struct kmem_cache kmalloc_caches[PAGE_SHIFT];
+extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1];
 
 /*
  * Sorry that the following has to be that ugly but some versions of GCC
@@ -191,7 +191,7 @@ void *__kmalloc(size_t size, gfp_t flags
 static __always_inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
-		if (size > PAGE_SIZE / 2)
+		if (size > PAGE_SIZE)
 			return (void *)__get_free_pages(flags | __GFP_COMP,
 							get_order(size));
 
@@ -214,7 +214,7 @@ void *kmem_cache_alloc_node(struct kmem_
 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	if (__builtin_constant_p(size) &&
-		size <= PAGE_SIZE / 2 && !(flags & SLUB_DMA)) {
+		size <= PAGE_SIZE && !(flags & SLUB_DMA)) {
 			struct kmem_cache *s = kmalloc_slab(size);
 
 		if (!s)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-02-09 13:04:48.472203975 -0800
+++ linux-2.6/mm/slub.c	2008-02-09 13:14:43.786633258 -0800
@@ -1919,6 +1919,15 @@ static inline int calculate_order(int si
 	int fraction;
 
 	/*
+	 * Cover up bad performance of page allocator fastpath vs
+	 * slab allocator fastpaths. Take the largest order reasonable
+	 * in order to be able to avoid partial list overhead.
+	 *
+	 * This yields 8 4k objects per 32k slab allocation.
+	 */
+	if (size == PAGE_SIZE)
+		return PAGE_ALLOC_COSTLY_ORDER;
+	/*
 	 * Attempt to find best configuration for a slab. This
 	 * works by first attempting to generate a layout with
 	 * the best configuration and backing off gradually.
@@ -2484,11 +2493,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
  *		Kmalloc subsystem
  *******************************************************************/
 
-struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
+struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);
 
 #ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
 #endif
 
 static int __init setup_slub_min_order(char *str)
@@ -2670,7 +2679,7 @@ void *__kmalloc(size_t size, gfp_t flags
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return (void *)__get_free_pages(flags | __GFP_COMP,
 							get_order(size));
 
@@ -2688,7 +2697,7 @@ void *__kmalloc_node(size_t size, gfp_t 
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return (void *)__get_free_pages(flags | __GFP_COMP,
 							get_order(size));
 
@@ -3001,7 +3010,7 @@ void __init kmem_cache_init(void)
 		caches++;
 	}
 
-	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
+	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
 			"kmalloc", 1 << i, GFP_KERNEL);
 		caches++;
@@ -3028,7 +3037,7 @@ void __init kmem_cache_init(void)
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
-	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
+	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
 			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
 
@@ -3218,7 +3227,7 @@ void *__kmalloc_track_caller(size_t size
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
 							get_order(size));
 	s = get_slab(size, gfpflags);
@@ -3234,7 +3243,7 @@ void *__kmalloc_node_track_caller(size_t
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
 							get_order(size));
 	s = get_slab(size, gfpflags);


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-09 21:45 SLUB tbench regression due to page allocator deficiency Christoph Lameter
@ 2008-02-09 22:35 ` Andrew Morton
  2008-02-10  0:19   ` Christoph Lameter
  2008-02-11 13:50 ` Mel Gorman
  2008-02-13 11:15 ` Mel Gorman
  2 siblings, 1 reply; 24+ messages in thread
From: Andrew Morton @ 2008-02-09 22:35 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Mel Gorman, linux-mm, Nick Piggin, Pekka J Enberg

On Sat, 9 Feb 2008 13:45:11 -0800 (PST) Christoph Lameter <clameter@sgi.com> wrote:

> Isnt there a way that we can make the page allocator handle PAGE_SIZEd 
> allocations in such a way that is competitive with the slab allocators? 
> The cycle count for an allocation needs to be <100 not just below 1000 as 
> it is now.
> 

Well.  Where are the cycles spent?

We are notorious for sucking but I don't think even we suck enough to have
left a 10x optimisation opportunity in the core page allocator ;)

>  include/linux/slub_def.h |    6 +++---
>  mm/slub.c                |   25 +++++++++++++++++--------

I am worrried by a patch which squeezes a few percent out of tbench.  Does
it improve real things?  Does anything regress?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-09 22:35 ` Andrew Morton
@ 2008-02-10  0:19   ` Christoph Lameter
  2008-02-10  2:45     ` Nick Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2008-02-10  0:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Mel Gorman, linux-mm, Nick Piggin, Pekka J Enberg

On Sat, 9 Feb 2008, Andrew Morton wrote:

> On Sat, 9 Feb 2008 13:45:11 -0800 (PST) Christoph Lameter <clameter@sgi.com> wrote:
> 
> > Isnt there a way that we can make the page allocator handle PAGE_SIZEd 
> > allocations in such a way that is competitive with the slab allocators? 
> > The cycle count for an allocation needs to be <100 not just below 1000 as 
> > it is now.
> Well.  Where are the cycles spent?

No idea. This is from some measurements I took with my page allocator 
benchmarks. For the tests see the code at
git://git.kernel.org/pub/scm/linux/kernel/git/christoph/vm.git tests

We do a gazillion of tests before doing anything. Most of that is NUMA 
stuff it seems but even in SMP this is still signficant.

> We are notorious for sucking but I don't think even we suck enough to have
> left a 10x optimisation opportunity in the core page allocator ;)

The regression only occurs if there is intensive allocation and freeing of 
pages. If there is a contiguous stream of allocations then there will be 
no regression since the slab allocators will have to go to the page 
allocator to get new pages. So the suckiness gets pushed under the carpet.

The SLUB fastpath takes around 40-50 cycles if things align right.
SLAB takes around 80-100 cycles.
The page allocator fastpath takes 342 cycles(!) at its best (Note kernel 
compiled for SMP no NUMA!)

It seems that we may increase kernel performance in general if we would 
come up with a better fastpath. That would not only improve slub but the 
kernel in general.

> >  include/linux/slub_def.h |    6 +++---
> >  mm/slub.c                |   25 +++++++++++++++++--------
> 
> I am worrried by a patch which squeezes a few percent out of tbench.  Does
> it improve real things?  Does anything regress?

It uses an order 3 alloc to get a big allocation unit to be able to stuff 
8 4k pages into it. Should improve networking. Howeve an order 3 
alloc is considered to be not good by many.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10  0:19   ` Christoph Lameter
@ 2008-02-10  2:45     ` Nick Piggin
  2008-02-10  3:36       ` Christoph Lameter
  2008-02-10  3:39       ` Christoph Lameter
  0 siblings, 2 replies; 24+ messages in thread
From: Nick Piggin @ 2008-02-10  2:45 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Sat, Feb 09, 2008 at 04:19:39PM -0800, Christoph Lameter wrote:
> On Sat, 9 Feb 2008, Andrew Morton wrote:
> 
> > On Sat, 9 Feb 2008 13:45:11 -0800 (PST) Christoph Lameter <clameter@sgi.com> wrote:
> > 
> > > Isnt there a way that we can make the page allocator handle PAGE_SIZEd 
> > > allocations in such a way that is competitive with the slab allocators? 
> > > The cycle count for an allocation needs to be <100 not just below 1000 as 
> > > it is now.
> > Well.  Where are the cycles spent?
> 
> No idea. This is from some measurements I took with my page allocator 
> benchmarks. For the tests see the code at
> git://git.kernel.org/pub/scm/linux/kernel/git/christoph/vm.git tests
> 
> We do a gazillion of tests before doing anything. Most of that is NUMA 
> stuff it seems but even in SMP this is still signficant.
> 
> > We are notorious for sucking but I don't think even we suck enough to have
> > left a 10x optimisation opportunity in the core page allocator ;)
> 
> The regression only occurs if there is intensive allocation and freeing of 
> pages. If there is a contiguous stream of allocations then there will be 
> no regression since the slab allocators will have to go to the page 
> allocator to get new pages. So the suckiness gets pushed under the carpet.
> 
> The SLUB fastpath takes around 40-50 cycles if things align right.
> SLAB takes around 80-100 cycles.
> The page allocator fastpath takes 342 cycles(!) at its best (Note kernel 
> compiled for SMP no NUMA!)

What kind of allocating and freeing of pages are you talking about? Are
you just measuring single threaded performance?

I haven't looked at the page allocator for a while, but last time I did
there are quirks in the pcp lists where say if the freed page is
considered cold but the allocation wants a hot page, then it always
goes to the page zone.

Other things you can do like not looking at the watermarks if the zone
has pcp pages avoids cacheline bouncing on SMP. 

I had a set of patches do to various little optimisations like that, but
I don't actually know if they would help you significantly or not.

I could try a bit of profiling if you tell me what specific test you
are interested in?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10  2:45     ` Nick Piggin
@ 2008-02-10  3:36       ` Christoph Lameter
  2008-02-10  3:39       ` Christoph Lameter
  1 sibling, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-10  3:36 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Sun, 10 Feb 2008, Nick Piggin wrote:

> What kind of allocating and freeing of pages are you talking about? Are
> you just measuring single threaded performance?

The tests that I did do measure a couple of scenarios. tbench seems to 
free/release page size chunks quite a bit and benefits from SLAB queueing
up the pages. tbench stays on each processor it seems, so very limited 
contention effects. The page allocator problem is simply caused by too
many instructions that need to run in order to get a page.

> Other things you can do like not looking at the watermarks if the zone
> has pcp pages avoids cacheline bouncing on SMP. 
> 
> I had a set of patches do to various little optimisations like that, but
> I don't actually know if they would help you significantly or not.
> 
> I could try a bit of profiling if you tell me what specific test you
> are interested in?

Run tbench with SLUB and tbench will hit the page allocator hard with 
page sized allocations. If you apply the patch that I provided in this 
thread then these will go away. SLUB will reduce the load on the page 
allocator like SLAB. The SLUB fastpath will stand in for the page 
allocator fastpath.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10  2:45     ` Nick Piggin
  2008-02-10  3:36       ` Christoph Lameter
@ 2008-02-10  3:39       ` Christoph Lameter
  2008-02-10 23:24         ` Nick Piggin
  2008-02-11  7:18         ` Nick Piggin
  1 sibling, 2 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-10  3:39 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Sun, 10 Feb 2008, Nick Piggin wrote:

> What kind of allocating and freeing of pages are you talking about? Are
> you just measuring single threaded performance?

What I did was (on an 8p, may want to tune this to the # procs you have):

1. Run tbench_srv on console

2. run tbench 8 from an ssh session

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10  3:39       ` Christoph Lameter
@ 2008-02-10 23:24         ` Nick Piggin
  2008-02-11 19:14           ` Christoph Lameter
  2008-02-11 22:03           ` Christoph Lameter
  2008-02-11  7:18         ` Nick Piggin
  1 sibling, 2 replies; 24+ messages in thread
From: Nick Piggin @ 2008-02-10 23:24 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Sat, Feb 09, 2008 at 07:39:17PM -0800, Christoph Lameter wrote:
> On Sun, 10 Feb 2008, Nick Piggin wrote:
> 
> > What kind of allocating and freeing of pages are you talking about? Are
> > you just measuring single threaded performance?
> 
> What I did was (on an 8p, may want to tune this to the # procs you have):
> 
> 1. Run tbench_srv on console
> 
> 2. run tbench 8 from an ssh session

OK, that's easy... You did it with an SMP kernel, right? (I only have a
8p NUMA, but I should be able to turn on cacheline interleaving and
run an SMP kernel on it).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10 23:24         ` Nick Piggin
@ 2008-02-11 19:14           ` Christoph Lameter
  2008-02-11 22:03           ` Christoph Lameter
  1 sibling, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-11 19:14 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Mon, 11 Feb 2008, Nick Piggin wrote:

> OK, that's easy... You did it with an SMP kernel, right? (I only have a
> 8p NUMA, but I should be able to turn on cacheline interleaving and
> run an SMP kernel on it).

Correct.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10 23:24         ` Nick Piggin
  2008-02-11 19:14           ` Christoph Lameter
@ 2008-02-11 22:03           ` Christoph Lameter
  1 sibling, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-11 22:03 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Mon, 11 Feb 2008, Nick Piggin wrote:

> OK, that's easy... You did it with an SMP kernel, right? (I only have a
> 8p NUMA, but I should be able to turn on cacheline interleaving and
> run an SMP kernel on it).

I did a test with your patchset and got

Throughput 2165.97 MB/sec 8 procs

So that is a 1% improvement against upstream but still 3-4% off my patch.

Earlier numbers:

SLAB            2223.32 MB/sec
SLUB unmodified 2144.36 MB/sec
SLUB+patch      2245.56 MB/sec (stats still active so this isnt optimal yet)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-10  3:39       ` Christoph Lameter
  2008-02-10 23:24         ` Nick Piggin
@ 2008-02-11  7:18         ` Nick Piggin
  2008-02-11 19:21           ` Christoph Lameter
  1 sibling, 1 reply; 24+ messages in thread
From: Nick Piggin @ 2008-02-11  7:18 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Sat, Feb 09, 2008 at 07:39:17PM -0800, Christoph Lameter wrote:
> On Sun, 10 Feb 2008, Nick Piggin wrote:
> 
> > What kind of allocating and freeing of pages are you talking about? Are
> > you just measuring single threaded performance?
> 
> What I did was (on an 8p, may want to tune this to the # procs you have):
> 
> 1. Run tbench_srv on console
> 
> 2. run tbench 8 from an ssh session

OK, it's a bit variable, so I used 20 10 second runs and took the average.
With this patch, I got a 1% increase of that average (with 2.6.25-rc1 and
slub).

It avoids some branches and tests; doesn't check the watermarks if there
are pcp pages; avoids atomic refcounting operations in the caller requests
it (this is really annoying because it adds another branch -- I don't think
we should be funneling all these options through flags, rather provide a
few helpers or something for it).

I don't know if this will get back all the regression, but it should help
(although I guess we should do the same refcounting for slab, so that
might speed up a bit too).

BTW. could you please make kmalloc-2048 just use order-0 allocations by
default, like kmalloc-1024 and kmalloc-4096, and kmalloc-2048 with slub.

Thanks,
Nick

---
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/include/linux/gfp.h	2008-02-11 11:08:00.000000000 +1100
@@ -50,8 +50,9 @@
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
 #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
 #define __GFP_MOVABLE	((__force gfp_t)0x100000u)  /* Page is movable */
+#define __GFP_NOREFS	((__force gfp_t)0x200000u)  /* Page is not refcounted */
 
-#define __GFP_BITS_SHIFT 21	/* Room for 21 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 24	/* Room for 24 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -218,6 +219,7 @@
 #define __get_dma_pages(gfp_mask, order) \
 		__get_free_pages((gfp_mask) | GFP_DMA,(order))
 
+extern void FASTCALL(__free_pages_noref(struct page *page, unsigned int order));
 extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
 extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
 extern void FASTCALL(free_hot_page(struct page *page));
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/mm/page_alloc.c	2008-02-11 11:08:00.000000000 +1100
@@ -449,7 +449,7 @@
 	zone->free_area[order].nr_free++;
 }
 
-static inline int free_pages_check(struct page *page)
+static inline void free_pages_check(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -467,12 +467,6 @@
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
-	/*
-	 * For now, we report if PG_reserved was found set, but do not
-	 * clear it, and do not free the page.  But we shall soon need
-	 * to do more, for when the ZERO_PAGE count wraps negative.
-	 */
-	return PageReserved(page);
 }
 
 /*
@@ -517,12 +511,9 @@
 {
 	unsigned long flags;
 	int i;
-	int reserved = 0;
 
 	for (i = 0 ; i < (1 << order) ; ++i)
-		reserved += free_pages_check(page + i);
-	if (reserved)
-		return;
+		free_pages_check(page + i);
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -598,7 +589,7 @@
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static void prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -616,18 +607,12 @@
 			1 << PG_buddy ))))
 		bad_page(page);
 
-	/*
-	 * For now, we report if PG_reserved was found set, but do not
-	 * clear it, and do not allocate the page: as a safety net.
-	 */
-	if (PageReserved(page))
-		return 1;
-
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
-	set_page_refcounted(page);
+	if (!(gfp_flags & __GFP_NOREFS))
+		set_page_refcounted(page);
 
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
@@ -637,8 +622,6 @@
 
 	if (order && (gfp_flags & __GFP_COMP))
 		prep_compound_page(page, order);
-
-	return 0;
 }
 
 /*
@@ -983,8 +966,7 @@
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	if (free_pages_check(page))
-		return;
+	free_pages_check(page);
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
@@ -1037,13 +1019,43 @@
 		set_page_refcounted(page + i);
 }
 
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
+#define ALLOC_HARDER		0x10 /* try to alloc harder */
+#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+
+static int alloc_watermarks_ok(struct zonelist *zonelist, struct zone *zone,
+			int order, gfp_t gfp_mask, int alloc_flags)
+{
+	int classzone_idx = zone_idx(zonelist->zones[0]);
+	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+		unsigned long mark;
+		if (alloc_flags & ALLOC_WMARK_MIN)
+			mark = zone->pages_min;
+		else if (alloc_flags & ALLOC_WMARK_LOW)
+			mark = zone->pages_low;
+		else
+			mark = zone->pages_high;
+		if (!zone_watermark_ok(zone, order, mark,
+			    classzone_idx, alloc_flags)) {
+			if (!zone_reclaim_mode ||
+			    !zone_reclaim(zone, gfp_mask, order))
+				return 0;
+		}
+	}
+	return 1;
+}
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
 static struct page *buffered_rmqueue(struct zonelist *zonelist,
-			struct zone *zone, int order, gfp_t gfp_flags)
+		struct zone *zone, int order, gfp_t gfp_flags, int alloc_flags)
 {
 	unsigned long flags;
 	struct page *page;
@@ -1051,14 +1063,15 @@
 	int cpu;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
 
-again:
-	cpu  = get_cpu();
+	local_irq_save(flags);
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
+		cpu  = smp_processor_id();
 		pcp = &zone_pcp(zone, cpu)->pcp;
-		local_irq_save(flags);
 		if (!pcp->count) {
+			if (!alloc_watermarks_ok(zonelist, zone, order, gfp_flags, alloc_flags))
+				goto failed;
 			pcp->count = rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
 			if (unlikely(!pcp->count))
@@ -1078,7 +1091,8 @@
 
 		/* Allocate more to the pcp list if necessary */
 		if (unlikely(&page->lru == &pcp->list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
+			if (alloc_watermarks_ok(zonelist, zone, order, gfp_flags, alloc_flags))
+				pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
 			page = list_entry(pcp->list.next, struct page, lru);
 		}
@@ -1086,7 +1100,9 @@
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
-		spin_lock_irqsave(&zone->lock, flags);
+		if (!alloc_watermarks_ok(zonelist, zone, order, gfp_flags, alloc_flags))
+			goto failed;
+		spin_lock(&zone->lock);
 		page = __rmqueue(zone, order, migratetype);
 		spin_unlock(&zone->lock);
 		if (!page)
@@ -1096,27 +1112,16 @@
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
-	if (prep_new_page(page, order, gfp_flags))
-		goto again;
+	prep_new_page(page, order, gfp_flags);
 	return page;
 
 failed:
 	local_irq_restore(flags);
-	put_cpu();
 	return NULL;
 }
 
-#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
-#define ALLOC_HARDER		0x10 /* try to alloc harder */
-#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
-
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
 static struct fail_page_alloc_attr {
@@ -1374,7 +1379,6 @@
 {
 	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
@@ -1409,26 +1413,10 @@
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
-		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
-			unsigned long mark;
-			if (alloc_flags & ALLOC_WMARK_MIN)
-				mark = zone->pages_min;
-			else if (alloc_flags & ALLOC_WMARK_LOW)
-				mark = zone->pages_low;
-			else
-				mark = zone->pages_high;
-			if (!zone_watermark_ok(zone, order, mark,
-				    classzone_idx, alloc_flags)) {
-				if (!zone_reclaim_mode ||
-				    !zone_reclaim(zone, gfp_mask, order))
-					goto this_zone_full;
-			}
-		}
-
-		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, zone, order, gfp_mask, alloc_flags);
 		if (page)
 			break;
-this_zone_full:
+
 		if (NUMA_BUILD)
 			zlc_mark_zone_full(zonelist, z);
 try_next_zone:
@@ -1669,7 +1657,6 @@
 		return (unsigned long) page_address(page);
 	return 0;
 }
-
 EXPORT_SYMBOL(get_zeroed_page);
 
 void __pagevec_free(struct pagevec *pvec)
@@ -1680,16 +1667,20 @@
 		free_hot_cold_page(pvec->pages[i], pvec->cold);
 }
 
-void __free_pages(struct page *page, unsigned int order)
+void __free_pages_noref(struct page *page, unsigned int order)
 {
-	if (put_page_testzero(page)) {
-		if (order == 0)
-			free_hot_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	if (likely(order == 0))
+		free_hot_page(page);
+	else
+		__free_pages_ok(page, order);
 }
+EXPORT_SYMBOL(__free_pages_noref);
 
+void __free_pages(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page))
+		__free_pages_noref(page, order);
+}
 EXPORT_SYMBOL(__free_pages);
 
 void free_pages(unsigned long addr, unsigned int order)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/mm/slub.c	2008-02-11 11:08:00.000000000 +1100
@@ -1078,6 +1078,7 @@
 	struct page *page;
 	int pages = 1 << s->order;
 
+	flags |= __GFP_NOREFS;
 	if (s->order)
 		flags |= __GFP_COMP;
 
@@ -1175,7 +1176,7 @@
 		-pages);
 
 	page->mapping = NULL;
-	__free_pages(page, s->order);
+	__free_pages_noref(page, s->order);
 }
 
 static void rcu_free_slab(struct rcu_head *h)
@@ -2671,7 +2672,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(flags | __GFP_COMP,
+		return (void *)__get_free_pages(flags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 
 	s = get_slab(size, flags);
@@ -2689,7 +2690,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(flags | __GFP_COMP,
+		return (void *)__get_free_pages(flags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 
 	s = get_slab(size, flags);
@@ -2752,7 +2753,10 @@
 
 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
-		put_page(page);
+		unsigned int order = 0;
+		if (unlikely(PageCompound(page)))
+			order = compound_order(page);
+		__free_pages_noref(page, order);
 		return;
 	}
 	slab_free(page->slab, page, object, __builtin_return_address(0));
@@ -3219,7 +3223,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
+		return (void *)__get_free_pages(gfpflags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 	s = get_slab(size, gfpflags);
 
@@ -3235,7 +3239,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
+		return (void *)__get_free_pages(gfpflags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 	s = get_slab(size, gfpflags);
 
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/include/linux/slub_def.h	2008-02-11 11:08:00.000000000 +1100
@@ -192,7 +192,7 @@
 {
 	if (__builtin_constant_p(size)) {
 		if (size > PAGE_SIZE / 2)
-			return (void *)__get_free_pages(flags | __GFP_COMP,
+			return (void *)__get_free_pages(flags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 
 		if (!(flags & SLUB_DMA)) {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-11  7:18         ` Nick Piggin
@ 2008-02-11 19:21           ` Christoph Lameter
  2008-02-11 23:40             ` Nick Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2008-02-11 19:21 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Mon, 11 Feb 2008, Nick Piggin wrote:

> OK, it's a bit variable, so I used 20 10 second runs and took the average.
> With this patch, I got a 1% increase of that average (with 2.6.25-rc1 and
> slub).
> 
> It avoids some branches and tests; doesn't check the watermarks if there
> are pcp pages; avoids atomic refcounting operations in the caller requests
> it (this is really annoying because it adds another branch -- I don't think
> we should be funneling all these options through flags, rather provide a
> few helpers or something for it).

Hmmm... That is a bit weak. The slub patch gets you around 3-5%. I thought 
maybe we could do something like the slub cmpxchg_local fastpath for the 
page allocator?

> I don't know if this will get back all the regression, but it should help
> (although I guess we should do the same refcounting for slab, so that
> might speed up a bit too).
> 
> BTW. could you please make kmalloc-2048 just use order-0 allocations by
> default, like kmalloc-1024 and kmalloc-4096, and kmalloc-2048 with slub.

The mininum number of objects per slab is currently 4 that means that 1k 
slabs can use order 0 allocs but 2k slabs must use order 2 in order to get 
4 objects. If I reduce that then the performance for 2k slabs may become 
a problem. The fastpath use will be reduced to 50% since every other 
allocation will have to go to the page allocator. Maybe we can do that 
if the page allocator performance is up to snuff.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-11 19:21           ` Christoph Lameter
@ 2008-02-11 23:40             ` Nick Piggin
  2008-02-11 23:42               ` Christoph Lameter
  0 siblings, 1 reply; 24+ messages in thread
From: Nick Piggin @ 2008-02-11 23:40 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Mon, Feb 11, 2008 at 11:21:59AM -0800, Christoph Lameter wrote:
> On Mon, 11 Feb 2008, Nick Piggin wrote:
> 
> > OK, it's a bit variable, so I used 20 10 second runs and took the average.
> > With this patch, I got a 1% increase of that average (with 2.6.25-rc1 and
> > slub).
> > 
> > It avoids some branches and tests; doesn't check the watermarks if there
> > are pcp pages; avoids atomic refcounting operations in the caller requests
> > it (this is really annoying because it adds another branch -- I don't think
> > we should be funneling all these options through flags, rather provide a
> > few helpers or something for it).
> 
> Hmmm... That is a bit weak. The slub patch gets you around 3-5%. I thought 
> maybe we could do something like the slub cmpxchg_local fastpath for the 
> page allocator?

It might be possible but would take quite a bit of rework (eg. have a
look at pcp->count and the horrible anti fragmentation loops).


> > I don't know if this will get back all the regression, but it should help
> > (although I guess we should do the same refcounting for slab, so that
> > might speed up a bit too).
> > 
> > BTW. could you please make kmalloc-2048 just use order-0 allocations by
> > default, like kmalloc-1024 and kmalloc-4096, and kmalloc-2048 with slub.
> 
> The mininum number of objects per slab is currently 4 that means that 1k 
> slabs can use order 0 allocs but 2k slabs must use order 2 in order to get 
> 4 objects. If I reduce that then the performance for 2k slabs may become 
> a problem.

It doesn't make sense that 4K allocations use order-0 but 2K do not. And
it is a regression because slab uses order-0 for 2K.


> The fastpath use will be reduced to 50% since every other 
> allocation will have to go to the page allocator. Maybe we can do that 
> if the page allocator performance is up to snuff.

The page allocator has to do quite a lot more than the slab allocator
does. It has to check watermarks and all the NUMA and zone and anti
fragmentation stuff, and does quite a lot of branches and stores to
tes tand set up the struct page.

So it's never going to be as fast as a simple slab allocation.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-11 23:40             ` Nick Piggin
@ 2008-02-11 23:42               ` Christoph Lameter
  2008-02-11 23:56                 ` Nick Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2008-02-11 23:42 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Tue, 12 Feb 2008, Nick Piggin wrote:

> It might be possible but would take quite a bit of rework (eg. have a
> look at pcp->count and the horrible anti fragmentation loops).

Yeah. May influece the way we have to handle freelists. Sigh.

> > The fastpath use will be reduced to 50% since every other 
> > allocation will have to go to the page allocator. Maybe we can do that 
> > if the page allocator performance is up to snuff.
> 
> The page allocator has to do quite a lot more than the slab allocator
> does. It has to check watermarks and all the NUMA and zone and anti
> fragmentation stuff, and does quite a lot of branches and stores to
> tes tand set up the struct page.
> 
> So it's never going to be as fast as a simple slab allocation.

Well but does it have to do all of that on *each* allocation? The slab 
allocators also do quite a number of things including NUMA handling but 
all of that is in the slow path and its not done for every single 
allocation.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-11 23:42               ` Christoph Lameter
@ 2008-02-11 23:56                 ` Nick Piggin
  2008-02-12  0:08                   ` Christoph Lameter
                                     ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Nick Piggin @ 2008-02-11 23:56 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Mon, Feb 11, 2008 at 03:42:34PM -0800, Christoph Lameter wrote:
> On Tue, 12 Feb 2008, Nick Piggin wrote:
> 
> > It might be possible but would take quite a bit of rework (eg. have a
> > look at pcp->count and the horrible anti fragmentation loops).
> 
> Yeah. May influece the way we have to handle freelists. Sigh.
> 
> > > The fastpath use will be reduced to 50% since every other 
> > > allocation will have to go to the page allocator. Maybe we can do that 
> > > if the page allocator performance is up to snuff.
> > 
> > The page allocator has to do quite a lot more than the slab allocator
> > does. It has to check watermarks and all the NUMA and zone and anti
> > fragmentation stuff, and does quite a lot of branches and stores to
> > tes tand set up the struct page.
> > 
> > So it's never going to be as fast as a simple slab allocation.
> 
> Well but does it have to do all of that on *each* allocation?

NUMA -- because of policies and zlc
cpuset -- zone softwall stuff
Anti fragmentation -- we can't allocate pages of a different migration type
watermarks -- because of watermarks

Actually eg. my patch to avoid watermark checking for fastpath allocations
is not even very nice itself, because it can result in problems like Peter
wsa running into with slab because one PF_MEMALLOC task can refill a batch
and others can use up the rest of the memory even if they aren't allowed
to.

struct page initialization and checking -- yeah we could skip most of
this in the allocation fastpath. I don't think such a patch would be too
popular though. 

So yeah there are non-trivial issues.


> The slab 
> allocators also do quite a number of things including NUMA handling but 
> all of that is in the slow path and its not done for every single 
> allocation.

Yeah but a lot of things it either doesn't have to worry about (eg. zones,
anti fragmentation), or it sweeps under the carpet (policies, watermarks).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-11 23:56                 ` Nick Piggin
@ 2008-02-12  0:08                   ` Christoph Lameter
  2008-02-12  6:06                   ` Fastpath prototype? Christoph Lameter
  2008-02-13 18:33                   ` SLUB tbench regression due to page allocator deficiency Paul Jackson
  2 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-12  0:08 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Tue, 12 Feb 2008, Nick Piggin wrote:

> > Well but does it have to do all of that on *each* allocation?
> 
> NUMA -- because of policies and zlc
> cpuset -- zone softwall stuff
> Anti fragmentation -- we can't allocate pages of a different migration type
> watermarks -- because of watermarks
> 
> Actually eg. my patch to avoid watermark checking for fastpath allocations
> is not even very nice itself, because it can result in problems like Peter
> wsa running into with slab because one PF_MEMALLOC task can refill a batch
> and others can use up the rest of the memory even if they aren't allowed
> to.
> 
> struct page initialization and checking -- yeah we could skip most of
> this in the allocation fastpath. I don't think such a patch would be too
> popular though. 
> 
> So yeah there are non-trivial issues.

We just need to make sure that repeated allocations of the same type do 
not require checking on every alloc.

So if there are no memory policies in effect and no cpuset redirection 
(can be figured out via a thread flag) and if we are doing the same 
allocation as last time (likely) then take the fastpath.

> > The slab 
> > allocators also do quite a number of things including NUMA handling but 
> > all of that is in the slow path and its not done for every single 
> > allocation.
> 
> Yeah but a lot of things it either doesn't have to worry about (eg. zones,
> anti fragmentation), or it sweeps under the carpet (policies, watermarks).

Most of these things should be handled in the page allocator right. But 
the page allocator needs to be effective at these allocations. IMHO a 
slowdown up to factor 10 vs. the slabs is not acceptable (and that was 
just an SMP test, NUMA is likely much worse!).

The pcps are likely also not effective because they use linked lists 
meaning memory in 3 pages has to be updated to extract a page from the 
lists.

So I think what would be useful is to have a single linked list of limited 
size that is taken off the freelist. All the pages are of the same flavor 
and watermark checks are only performed when the pages are taken off.

Then just do

	if (no policies or cpuset and freelist contains right flavor and is not empty)
		acquire from freelist
	else
		refill freelist

That should be doable in less than 100 cycles.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Fastpath prototype?
  2008-02-11 23:56                 ` Nick Piggin
  2008-02-12  0:08                   ` Christoph Lameter
@ 2008-02-12  6:06                   ` Christoph Lameter
  2008-02-12 10:40                     ` Andi Kleen
  2008-02-13 18:33                   ` SLUB tbench regression due to page allocator deficiency Paul Jackson
  2 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2008-02-12  6:06 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

This patch preserves the performance while only needing order 0 allocs. 
Pretty primitive.

---
 include/linux/gfp.h      |    4 ++
 include/linux/slub_def.h |    6 ++-
 mm/page_alloc.c          |   94 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/slub.c                |    5 ++
 4 files changed, 107 insertions(+), 2 deletions(-)

Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h	2008-02-11 20:24:37.550298970 -0800
+++ linux-2.6/include/linux/gfp.h	2008-02-11 20:24:56.655574504 -0800
@@ -231,4 +231,8 @@ void drain_zone_pages(struct zone *zone,
 void drain_all_pages(void);
 void drain_local_pages(void *dummy);
 
+struct page *fast_alloc(gfp_t gfp_mask);
+void fast_free(struct page *page);
+void flush_fast_pages(void);
+void *fast_alloc_addr(gfp_t gfp_mask);
 #endif /* __LINUX_GFP_H */
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2008-02-11 20:24:37.558299609 -0800
+++ linux-2.6/mm/page_alloc.c	2008-02-11 21:51:46.806898404 -0800
@@ -1485,6 +1485,7 @@ restart:
 	if (page)
 		goto got_pg;
 
+	flush_fast_pages();
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
 	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
@@ -4550,3 +4551,96 @@ __offline_isolated_pages(unsigned long s
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 #endif
+
+#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
+#define PAGE_ALLOC_FASTPATH
+#endif
+
+DEFINE_PER_CPU(struct page *, fast_pages);
+
+void flush_fast_pages_cpu(void *dummy)
+{
+	struct page *page;
+
+	while (__get_cpu_var(fast_pages)) {
+		page = __get_cpu_var(fast_pages);
+#ifdef PAGE_ALLOC_FASTPATH
+		if (cmpxchg_local(&__get_cpu_var(fast_pages), page,
+				(struct page *)page->lru.next) != page)
+			continue;
+#else
+		__get_cpu_var(fast_pages) = (struct page *)page->lru.next;
+#endif
+		__free_page(page);
+	}
+}
+
+void flush_fast_pages(void)
+{
+	printk("flush_fast_pages\n");
+	on_each_cpu(flush_fast_pages_cpu, NULL, 0, 1);
+}
+
+struct page *fast_alloc(gfp_t mask)
+{
+	struct page *page;
+
+#ifdef PAGE_ALLOC_FASTPATH
+	do {
+		page = __get_cpu_var(fast_pages);
+		if (unlikely(!page))
+			goto slow;
+
+	} while (unlikely(cmpxchg_local(&__get_cpu_var(fast_pages),
+			page, (struct page *)page->lru.next) != page));
+#else
+	unsigned long flags;
+
+	local_irq_save(flags);
+	page = __get_cpu_var(fast_pages);
+	if (unlikely(!page)) {
+		local_irq_restore(flags);
+		goto slow;
+	}
+
+	__get_cpu_var(fast_pages) = (struct page *)page->lru.next;
+	local_irq_restore(flags);
+#endif
+	if (unlikely(mask & __GFP_ZERO))
+		memset(page_address(page), 0, PAGE_SIZE);
+	return page;
+
+slow:
+	return alloc_page(mask);
+}
+
+void fast_free(struct page *page)
+{
+#ifdef PAGE_ALLOC_FASTPAH
+	struct page *old;
+
+	do {
+		p = &__get_cpu_var(fast_pages);
+		old = *p;
+		page->lru.next = (void *)old;
+	} while (unlikely(cmpxchg_local(p, old, page) != old));
+#else
+	unsigned long flags;
+
+	local_irq_save(flags);
+	page->lru.next = (void *)__get_cpu_var(fast_pages);
+	__get_cpu_var(fast_pages) = page;
+	local_irq_restore(flags);
+#endif
+}
+
+void *fast_alloc_addr(gfp_t gfp_mask)
+{
+	struct page * page;
+
+	page = fast_alloc(gfp_mask);
+	if (likely(page))
+		return (void *) page_address(page);
+
+	return NULL;
+}
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-02-11 20:24:37.570300573 -0800
+++ linux-2.6/mm/slub.c	2008-02-11 20:24:56.655574504 -0800
@@ -2750,7 +2750,10 @@ void kfree(const void *x)
 
 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
-		put_page(page);
+		if (unlikely(PageCompound(page)))
+			put_page(page);
+		else
+			fast_free(page);
 		return;
 	}
 	slab_free(page->slab, page, object, __builtin_return_address(0));
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2008-02-11 20:24:55.111479643 -0800
+++ linux-2.6/include/linux/slub_def.h	2008-02-11 20:27:19.076283534 -0800
@@ -190,7 +190,11 @@ void *__kmalloc(size_t size, gfp_t flags
 
 static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
 {
-	return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size));
+	if (size <= PAGE_SIZE)
+		return fast_alloc_addr(flags);
+	else
+		return (void *)__get_free_pages(flags | __GFP_COMP,
+							get_order(size));
 }
 
 static __always_inline void *kmalloc(size_t size, gfp_t flags)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: Fastpath prototype?
  2008-02-12  6:06                   ` Fastpath prototype? Christoph Lameter
@ 2008-02-12 10:40                     ` Andi Kleen
  2008-02-12 20:10                       ` Christoph Lameter
  0 siblings, 1 reply; 24+ messages in thread
From: Andi Kleen @ 2008-02-12 10:40 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Tuesday 12 February 2008 07:06:48 Christoph Lameter wrote:
> This patch preserves the performance while only needing order 0 allocs. 
> Pretty primitive.

The per CPU caches in the zone were originally intended to be exactly
such a fast path.

That is why I find your patch pretty ironic. 

I can understand it because a lot of the page_alloc.c code is frankly
bizarre now (the file could probably really need a rewrite) and it doesn't
surprise me that the old fast path is not very fast anymore.

But if you add another fast path you should first remove the old one 
at least.

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: Fastpath prototype?
  2008-02-12 10:40                     ` Andi Kleen
@ 2008-02-12 20:10                       ` Christoph Lameter
  2008-02-12 22:31                         ` Christoph Lameter
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2008-02-12 20:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Nick Piggin, Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Tue, 12 Feb 2008, Andi Kleen wrote:

> But if you add another fast path you should first remove the old one 
> at least.

Definitely thinking about that. We could just drop the pcp stuff. The 
current page allocator "fastpath" causes a 5% regression on my in kernel 
page allocator benchmarks and also on Mel's tests. The current pcp 
page queuing is mainly useful to hold off zone lock contention.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: Fastpath prototype?
  2008-02-12 20:10                       ` Christoph Lameter
@ 2008-02-12 22:31                         ` Christoph Lameter
  2008-02-13 11:38                           ` Andi Kleen
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2008-02-12 22:31 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Nick Piggin, Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

Here is a patch to remove the pcp lists (just in case someone wants to toy 
around with these things too). It hits tbench/SLUB badly because that 
relies heavily on effective caching by the page allocator.

tbench/SLUB:  726.25 MB/sec

Even adding the fast path prototype (covers only slab allocs >=4K 
allocs) yields only 1825.68 MB/sec

I guess these results indicate that tbench would improve even more if we 
had a better fastpath.

A lot of the ugly NUMA stuff (page draining etc) would go away with the 
pcps. Also we could likely simplify NUMA bootstrap.


---
 include/linux/gfp.h    |    5 
 include/linux/mmzone.h |    8 -
 kernel/sysctl.c        |   12 --
 mm/page_alloc.c        |  272 +------------------------------------------------
 mm/vmstat.c            |   39 -------
 5 files changed, 12 insertions(+), 324 deletions(-)

Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h	2008-02-12 14:06:48.883814096 -0800
+++ linux-2.6/include/linux/gfp.h	2008-02-12 14:25:11.185781673 -0800
@@ -227,8 +227,7 @@ extern void FASTCALL(free_cold_page(stru
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
-void drain_all_pages(void);
-void drain_local_pages(void *dummy);
+static inline void drain_all_pages(void) {}
+static inline void drain_local_pages(void *dummy) {}
 
 #endif /* __LINUX_GFP_H */
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2008-02-07 23:28:11.328553973 -0800
+++ linux-2.6/include/linux/mmzone.h	2008-02-12 14:06:53.599840561 -0800
@@ -105,15 +105,7 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
-struct per_cpu_pages {
-	int count;		/* number of pages in the list */
-	int high;		/* high watermark, emptying needed */
-	int batch;		/* chunk size for buddy add/remove */
-	struct list_head list;	/* the list of pages */
-};
-
 struct per_cpu_pageset {
-	struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
 	s8 expire;
 #endif
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2008-02-12 14:06:48.895814167 -0800
+++ linux-2.6/mm/page_alloc.c	2008-02-12 14:25:11.213781917 -0800
@@ -475,35 +475,6 @@ static inline int free_pages_check(struc
 	return PageReserved(page);
 }
 
-/*
- * Frees a list of pages. 
- * Assumes all pages on list are in same zone, and of same order.
- * count is the number of pages to free.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
- */
-static void free_pages_bulk(struct zone *zone, int count,
-					struct list_head *list, int order)
-{
-	spin_lock(&zone->lock);
-	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
-	zone->pages_scanned = 0;
-	while (count--) {
-		struct page *page;
-
-		VM_BUG_ON(list_empty(list));
-		page = list_entry(list->prev, struct page, lru);
-		/* have to delete it as __free_one_page list manipulates */
-		list_del(&page->lru);
-		__free_one_page(page, zone, order);
-	}
-	spin_unlock(&zone->lock);
-}
-
 static void free_one_page(struct zone *zone, struct page *page, int order)
 {
 	spin_lock(&zone->lock);
@@ -832,110 +803,6 @@ static struct page *__rmqueue(struct zon
 	return page;
 }
 
-/* 
- * Obtain a specified number of elements from the buddy allocator, all under
- * a single hold of the lock, for efficiency.  Add them to the supplied list.
- * Returns the number of new pages which were placed at *list.
- */
-static int rmqueue_bulk(struct zone *zone, unsigned int order, 
-			unsigned long count, struct list_head *list,
-			int migratetype)
-{
-	int i;
-	
-	spin_lock(&zone->lock);
-	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype);
-		if (unlikely(page == NULL))
-			break;
-
-		/*
-		 * Split buddy pages returned by expand() are received here
-		 * in physical page order. The page is added to the callers and
-		 * list and the list head then moves forward. From the callers
-		 * perspective, the linked list is ordered by page number in
-		 * some conditions. This is useful for IO devices that can
-		 * merge IO requests if the physical pages are ordered
-		 * properly.
-		 */
-		list_add(&page->lru, list);
-		set_page_private(page, migratetype);
-		list = &page->lru;
-	}
-	spin_unlock(&zone->lock);
-	return i;
-}
-
-#ifdef CONFIG_NUMA
-/*
- * Called from the vmstat counter updater to drain pagesets of this
- * currently executing processor on remote nodes after they have
- * expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
- */
-void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
-{
-	unsigned long flags;
-	int to_drain;
-
-	local_irq_save(flags);
-	if (pcp->count >= pcp->batch)
-		to_drain = pcp->batch;
-	else
-		to_drain = pcp->count;
-	free_pages_bulk(zone, to_drain, &pcp->list, 0);
-	pcp->count -= to_drain;
-	local_irq_restore(flags);
-}
-#endif
-
-/*
- * Drain pages of the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
- */
-static void drain_pages(unsigned int cpu)
-{
-	unsigned long flags;
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		if (!populated_zone(zone))
-			continue;
-
-		pset = zone_pcp(zone, cpu);
-
-		pcp = &pset->pcp;
-		local_irq_save(flags);
-		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-		pcp->count = 0;
-		local_irq_restore(flags);
-	}
-}
-
-/*
- * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- */
-void drain_local_pages(void *arg)
-{
-	drain_pages(smp_processor_id());
-}
-
-/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
- */
-void drain_all_pages(void)
-{
-	on_each_cpu(drain_local_pages, NULL, 0, 1);
-}
-
 #ifdef CONFIG_HIBERNATION
 
 void mark_free_pages(struct zone *zone)
@@ -978,7 +845,6 @@ void mark_free_pages(struct zone *zone)
 static void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
 	unsigned long flags;
 
 	if (PageAnon(page))
@@ -992,21 +858,11 @@ static void free_hot_cold_page(struct pa
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
 	local_irq_save(flags);
 	__count_vm_event(PGFREE);
-	if (cold)
-		list_add_tail(&page->lru, &pcp->list);
-	else
-		list_add(&page->lru, &pcp->list);
 	set_page_private(page, get_pageblock_migratetype(page));
-	pcp->count++;
-	if (pcp->count >= pcp->high) {
-		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
-		pcp->count -= pcp->batch;
-	}
+	free_one_page(zone, page, 0);
 	local_irq_restore(flags);
-	put_cpu();
 }
 
 void free_hot_page(struct page *page)
@@ -1047,56 +903,18 @@ static struct page *buffered_rmqueue(str
 {
 	unsigned long flags;
 	struct page *page;
-	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
 
 again:
-	cpu  = get_cpu();
-	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
-
-		pcp = &zone_pcp(zone, cpu)->pcp;
-		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			if (unlikely(!pcp->count))
-				goto failed;
-		}
-
-		/* Find a page of the appropriate migrate type */
-		if (cold) {
-			list_for_each_entry_reverse(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		} else {
-			list_for_each_entry(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		}
-
-		/* Allocate more to the pcp list if necessary */
-		if (unlikely(&page->lru == &pcp->list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			page = list_entry(pcp->list.next, struct page, lru);
-		}
-
-		list_del(&page->lru);
-		pcp->count--;
-	} else {
-		spin_lock_irqsave(&zone->lock, flags);
-		page = __rmqueue(zone, order, migratetype);
-		spin_unlock(&zone->lock);
-		if (!page)
-			goto failed;
-	}
+	spin_lock_irqsave(&zone->lock, flags);
+	page = __rmqueue(zone, order, migratetype);
+	spin_unlock(&zone->lock);
+	if (!page)
+		goto failed;
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1786,7 +1604,6 @@ void si_meminfo_node(struct sysinfo *val
  */
 void show_free_areas(void)
 {
-	int cpu;
 	struct zone *zone;
 
 	for_each_zone(zone) {
@@ -1794,17 +1611,6 @@ void show_free_areas(void)
 			continue;
 
 		show_node(zone);
-		printk("%s per-cpu:\n", zone->name);
-
-		for_each_online_cpu(cpu) {
-			struct per_cpu_pageset *pageset;
-
-			pageset = zone_pcp(zone, cpu);
-
-			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
-			       cpu, pageset->pcp.high,
-			       pageset->pcp.batch, pageset->pcp.count);
-		}
 	}
 
 	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
@@ -2597,37 +2403,11 @@ static int zone_batchsize(struct zone *z
 	return batch;
 }
 
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+inline void setup_pageset(struct per_cpu_pageset *p)
 {
-	struct per_cpu_pages *pcp;
-
 	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp;
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
 }
 
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp;
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
-}
-
-
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
@@ -2669,11 +2449,7 @@ static int __cpuinit process_zones(int c
 		if (!zone_pcp(zone, cpu))
 			goto bad;
 
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
+		setup_pageset(zone_pcp(zone, cpu));
 	}
 
 	return 0;
@@ -2798,9 +2574,9 @@ static __meminit void zone_pcp_init(stru
 #ifdef CONFIG_NUMA
 		/* Early boot. Slab allocator not functional yet */
 		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
+		setup_pageset(&boot_pageset[cpu]);
 #else
-		setup_pageset(zone_pcp(zone,cpu), batch);
+		setup_pageset(zone_pcp(zone,cpu));
 #endif
 	}
 	if (zone->present_pages)
@@ -3971,8 +3747,6 @@ static int page_alloc_cpu_notify(struct 
 	int cpu = (unsigned long)hcpu;
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		drain_pages(cpu);
-
 		/*
 		 * Spill the event counters of the dead processor
 		 * into the current processors event counters.
@@ -4236,32 +4010,6 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	unsigned int cpu;
-	int ret;
-
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
-		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
-			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
-		}
-	}
-	return 0;
-}
-
 int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2008-02-07 23:28:12.596577762 -0800
+++ linux-2.6/mm/vmstat.c	2008-02-12 14:06:53.619840675 -0800
@@ -317,37 +317,7 @@ void refresh_cpu_vm_stats(int cpu)
 				local_irq_restore(flags);
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_diff[i] += v;
-#ifdef CONFIG_NUMA
-				/* 3 seconds idle till flush */
-				p->expire = 3;
-#endif
 			}
-#ifdef CONFIG_NUMA
-		/*
-		 * Deal with draining the remote pageset of this
-		 * processor
-		 *
-		 * Check if there are pages remaining in this pageset
-		 * if not then there is nothing to expire.
-		 */
-		if (!p->expire || !p->pcp.count)
-			continue;
-
-		/*
-		 * We never drain zones local to this processor.
-		 */
-		if (zone_to_nid(zone) == numa_node_id()) {
-			p->expire = 0;
-			continue;
-		}
-
-		p->expire--;
-		if (p->expire)
-			continue;
-
-		if (p->pcp.count)
-			drain_zone_pages(zone, &p->pcp);
-#endif
 	}
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -685,15 +655,6 @@ static void zoneinfo_show_print(struct s
 		struct per_cpu_pageset *pageset;
 
 		pageset = zone_pcp(zone, i);
-		seq_printf(m,
-			   "\n    cpu: %i"
-			   "\n              count: %i"
-			   "\n              high:  %i"
-			   "\n              batch: %i",
-			   i,
-			   pageset->pcp.count,
-			   pageset->pcp.high,
-			   pageset->pcp.batch);
 #ifdef CONFIG_SMP
 		seq_printf(m, "\n  vm stats threshold: %d",
 				pageset->stat_threshold);
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c	2008-02-12 14:11:38.553441536 -0800
+++ linux-2.6/kernel/sysctl.c	2008-02-12 14:11:56.161540376 -0800
@@ -75,7 +75,6 @@ extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
-extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
@@ -100,7 +99,6 @@ static int one_hundred = 100;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
@@ -1012,16 +1010,6 @@ static struct ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
-	{
-		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
-		.procname	= "percpu_pagelist_fraction",
-		.data		= &percpu_pagelist_fraction,
-		.maxlen		= sizeof(percpu_pagelist_fraction),
-		.mode		= 0644,
-		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &min_percpu_pagelist_fract,
-	},
 #ifdef CONFIG_MMU
 	{
 		.ctl_name	= VM_MAX_MAP_COUNT,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: Fastpath prototype?
  2008-02-12 22:31                         ` Christoph Lameter
@ 2008-02-13 11:38                           ` Andi Kleen
  2008-02-13 20:09                             ` Christoph Lameter
  0 siblings, 1 reply; 24+ messages in thread
From: Andi Kleen @ 2008-02-13 11:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

Christoph Lameter wrote:
> Here is a patch to remove the pcp lists (just in case someone wants to toy 
> around with these things too). It hits tbench/SLUB badly because that 
> relies heavily on effective caching by the page allocator
> 
> tbench/SLUB:  726.25 MB/sec
> 
> Even adding the fast path prototype (covers only slab allocs >=4K 
> allocs) yields only 1825.68 MB/sec

So why is the new fast path slower than the old one? Because it is not
NUMA aware?

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: Fastpath prototype?
  2008-02-13 11:38                           ` Andi Kleen
@ 2008-02-13 20:09                             ` Christoph Lameter
  0 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2008-02-13 20:09 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Nick Piggin, Andrew Morton, Mel Gorman, linux-mm, Pekka J Enberg

On Wed, 13 Feb 2008, Andi Kleen wrote:

> > tbench/SLUB:  726.25 MB/sec
> > 
> > Even adding the fast path prototype (covers only slab allocs >=4K 
> > allocs) yields only 1825.68 MB/sec
> 
> So why is the new fast path slower than the old one? Because it is not
> NUMA aware?

No because it only covers kmalloc-4096 and nothing else. All other 
allocations go to the now queueless page allocator.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-11 23:56                 ` Nick Piggin
  2008-02-12  0:08                   ` Christoph Lameter
  2008-02-12  6:06                   ` Fastpath prototype? Christoph Lameter
@ 2008-02-13 18:33                   ` Paul Jackson
  2 siblings, 0 replies; 24+ messages in thread
From: Paul Jackson @ 2008-02-13 18:33 UTC (permalink / raw)
  To: Nick Piggin; +Cc: clameter, akpm, mel, linux-mm, penberg

Nick, listing things __alloc_pages() does:
> cpuset -- zone softwall stuff

That should be off the fast path.  So long as there is enough memory
(above watermarks on some node in current->mems_allowed) then there
should be no need to consider the cpuset softwall details.  Notice
that the first invocation of get_page_from_freelist() in __alloc_pages()
has the __GFP_HARDWALL included, bypassing the cpuset software code.

==

Is there someway to get profiling data from __alloc_pages() and
get_page_from_freelist(), for Christoph's test case.  I am imagining
publishing a listing of that code, with a column to the right
indicating how many times each line of code was executed, during the
interesting portion of running such a test case.

That would shine a bright light on any line(s) of code that get
executed way more often than seem necessary for such a load, and enable
the cast of dozens of us who have hacked this code at sometime to
notice opportunities for sucking less.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-09 21:45 SLUB tbench regression due to page allocator deficiency Christoph Lameter
  2008-02-09 22:35 ` Andrew Morton
@ 2008-02-11 13:50 ` Mel Gorman
  2008-02-13 11:15 ` Mel Gorman
  2 siblings, 0 replies; 24+ messages in thread
From: Mel Gorman @ 2008-02-11 13:50 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, akpm, Nick Piggin, Pekka J Enberg

On (09/02/08 13:45), Christoph Lameter didst pronounce:
> I have been chasing the tbench regression (1-4%) for two weeks now and 
> even after I added statistics I could only verify that behavior was just 
> optimal.
> 
> None of the tricks that I threw at the problem changed anything until I 
> realized that the tbench load depends heavily on 4k allocations that SLUB 
> hands off to the page allocator (SLAB handles 4k itself). I extended the 
> kmalloc array to 4k and I got:

This poked me into checking the results I got when comparing SLAB/SLUB
in 2.6.24. I ran a fairly large set of benchmarks and then failed to
follow up on it :/

The results I got for tbench were considerably worse than 4% (sorry for
the wide output). This

bl6-13/report.txt:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-debug)
bl6-13/report.txt-                     Min                         Average                     Max                         Std. Deviation              
bl6-13/report.txt-                     --------------------------- --------------------------- --------------------------- ----------------------------
bl6-13/report.txt-clients-1              176.37/156.81   (-12.47%)   186.57/173.51   ( -7.53%)   204.71/209.94   (  2.49%)     9.51/13.64    ( -43.38%)
bl6-13/report.txt-clients-2              319.70/282.60   (-13.13%)   347.16/313.87   (-10.61%)   414.66/343.35   (-20.77%)    21.12/12.45    (  41.04%)
bl6-13/report.txt-clients-4              854.17/685.53   (-24.60%)  1024.46/845.32   (-21.19%)  1067.28/905.61   (-17.85%)    44.97/46.26    (  -2.87%)
bl6-13/report.txt-clients-8              974.06/835.80   (-16.54%)  1010.90/882.97   (-14.49%)  1027.36/917.22   (-12.01%)    13.68/19.84    ( -45.00%)
bl6-13/report.txt-
--
elm3a203/report.txt:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-debug)
elm3a203/report.txt-                     Min                         Average                     Max                         Std. Deviation              
elm3a203/report.txt-                     --------------------------- --------------------------- --------------------------- ----------------------------
elm3a203/report.txt-clients-1              111.25/97.59    (-13.99%)   112.30/99.66    (-12.68%)   113.25/101.21   (-11.89%)     0.49/0.78     ( -59.29%)
elm3a203/report.txt-clients-1              112.28/97.39    (-15.29%)   113.13/99.68    (-13.50%)   113.79/100.58   (-13.13%)     0.32/0.87     (-176.48%)
elm3a203/report.txt-clients-2              149.01/131.90   (-12.97%)   151.04/136.51   (-10.64%)   152.52/139.26   ( -9.53%)     0.97/1.51     ( -55.79%)
elm3a203/report.txt-clients-4              145.94/130.05   (-12.22%)   147.62/132.33   (-11.56%)   148.92/134.26   (-10.92%)     0.88/1.10     ( -25.10%)
elm3a203/report.txt-
--
elm3b133/report.txt:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-debug)
elm3b133/report.txt-                     Min                         Average                     Max                         Std. Deviation              
elm3b133/report.txt-                     --------------------------- --------------------------- --------------------------- ----------------------------
elm3b133/report.txt-clients-1               28.17/26.95    ( -4.53%)    28.34/27.25    ( -4.01%)    28.53/27.38    ( -4.22%)     0.09/0.10     (  -5.42%)
elm3b133/report.txt-clients-2               52.55/50.61    ( -3.83%)    53.20/51.28    ( -3.74%)    54.47/51.82    ( -5.11%)     0.49/0.33     (  32.41%)
elm3b133/report.txt-clients-4              111.15/105.14   ( -5.71%)   113.29/107.29   ( -5.59%)   114.16/108.58   ( -5.13%)     0.69/0.91     ( -32.14%)
elm3b133/report.txt-clients-8              109.63/104.37   ( -5.04%)   110.14/104.78   ( -5.12%)   110.80/105.43   ( -5.10%)     0.25/0.27     (  -8.94%)
elm3b133/report.txt-
--
elm3b19/report.txt:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-debug)
elm3b19/report.txt-                     Min                         Average                     Max                         Std. Deviation              
elm3b19/report.txt-                     --------------------------- --------------------------- --------------------------- ----------------------------
elm3b19/report.txt-clients-1              118.85/0.00     (  0.00%)   123.72/115.79   ( -6.84%)   131.11/129.94   ( -0.90%)     3.77/26.77    (-609.67%)
elm3b19/report.txt-clients-1              118.68/117.89   ( -0.67%)   124.65/123.52   ( -0.91%)   137.54/132.09   ( -4.13%)     5.52/4.20     (  23.78%)
elm3b19/report.txt-clients-2              223.73/211.77   ( -5.64%)   339.06/334.21   ( -1.45%)   367.83/357.20   ( -2.97%)    38.36/30.30    (  21.03%)
elm3b19/report.txt-clients-4              320.07/316.04   ( -1.28%)   331.93/324.42   ( -2.31%)   341.92/332.29   ( -2.90%)     5.51/4.07     (  26.03%)
elm3b19/report.txt-
--
elm3b6/report.txt:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-debug)
elm3b6/report.txt-                     Min                         Average                     Max                         Std. Deviation              
elm3b6/report.txt-                     --------------------------- --------------------------- --------------------------- ----------------------------
elm3b6/report.txt-clients-1              148.01/140.79   ( -5.13%)   156.19/153.67   ( -1.64%)   182.30/185.11   (  1.52%)     9.76/13.84    ( -41.84%)
elm3b6/report.txt-clients-2              251.07/253.07   (  0.79%)   292.60/286.59   ( -2.10%)   338.81/360.93   (  6.13%)    22.58/21.48    (   4.85%)
elm3b6/report.txt-clients-4              673.43/523.51   (-28.64%)   784.56/761.89   ( -2.98%)   846.40/818.38   ( -3.42%)    36.95/82.30    (-122.75%)
elm3b6/report.txt-clients-8              652.73/700.72   (  6.85%)   783.54/772.22   ( -1.47%)   833.56/812.21   ( -2.63%)    47.45/27.48    (  42.09%)
elm3b6/report.txt-
--
gekko-lp1/report.txt:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-debug)
gekko-lp1/report.txt-                     Min                         Average                     Max                         Std. Deviation              
gekko-lp1/report.txt-                     --------------------------- --------------------------- --------------------------- ----------------------------
gekko-lp1/report.txt-clients-1              170.56/163.15   ( -4.55%)   206.59/194.96   ( -5.96%)   221.27/206.46   ( -7.17%)    17.06/13.59    (  20.34%)
gekko-lp1/report.txt-clients-2              302.55/277.45   ( -9.05%)   319.14/306.39   ( -4.16%)   328.74/313.01   ( -5.03%)     6.21/8.18     ( -31.81%)
gekko-lp1/report.txt-clients-4              467.98/393.05   (-19.06%)   490.42/464.23   ( -5.64%)   503.74/477.13   ( -5.58%)    10.49/17.68    ( -68.61%)
gekko-lp1/report.txt-clients-8              469.16/447.00   ( -4.96%)   492.14/468.37   ( -5.07%)   498.79/472.47   ( -5.57%)     7.08/5.61     (  20.79%)
gekko-lp1/report.txt-

I think I didn't look too closely because kernbench was generally ok,
hackbench showed gains and losses depending on the machine and as TBench
has historically been a bit all over the place. That was a mistake
though as there was a definite slow-up even with the variances taken
into account.

> 
> christoph@stapp:~$ slabinfo -AD
> Name                   Objects    Alloc     Free   %Fast
> :0004096                   180 665259550 665259415  99  99
> skbuff_fclone_cache         46 665196592 665196592  99  99
> :0000192                  2575 31232665 31230129  99  99
> :0001024                   854 31204838 31204006  99  99
> vm_area_struct            1093   108941   107954  91  17
> dentry                    7738    26248    18544  92  43
> :0000064                  2179    19208    17287  97  73
> 
> So the kmalloc-4096 is heavily used. If I give the 4k objects a reasonable 
> allocation size in slub (PAGE_ALLOC_COSTLY_ORDER) then the fastpath of 
> SLUB becomes effective for 4k allocs and then SLUB is faster than SLAB 
> here.
> 
> Performance on tbench (Dual Quad 8p 8G):
> 
> SLAB		2223.32 MB/sec
> SLUB unmodified	2144.36 MB/sec
> SLUB+patch	2245.56 MB/sec (stats still active so this isnt optimal yet)
> 

I'll run tests for this patch and see what it looks like.

> 4k allocations cannot optimally be handled by SLUB if we are restricted to 
> order 0 allocs because the fastpath only handles fractions of one 
> allocation unit and if the allocation unit is 4k then we only have one 
> object per slab.
> 
> Isnt there a way that we can make the page allocator handle PAGE_SIZEd 
> allocations in such a way that is competitive with the slab allocators? 

Probably. It's been on my TODO list for an age to see what can be done.

> The cycle count for an allocation needs to be <100 not just below 1000 as 
> it is now.
> 
> ---
>  include/linux/slub_def.h |    6 +++---
>  mm/slub.c                |   25 +++++++++++++++++--------
>  2 files changed, 20 insertions(+), 11 deletions(-)
> 
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h	2008-02-09 13:04:48.464203968 -0800
> +++ linux-2.6/include/linux/slub_def.h	2008-02-09 13:08:37.413120259 -0800
> @@ -110,7 +110,7 @@ struct kmem_cache {
>   * We keep the general caches in an array of slab caches that are used for
>   * 2^x bytes of allocations.
>   */
> -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT];
> +extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1];
>  
>  /*
>   * Sorry that the following has to be that ugly but some versions of GCC
> @@ -191,7 +191,7 @@ void *__kmalloc(size_t size, gfp_t flags
>  static __always_inline void *kmalloc(size_t size, gfp_t flags)
>  {
>  	if (__builtin_constant_p(size)) {
> -		if (size > PAGE_SIZE / 2)
> +		if (size > PAGE_SIZE)
>  			return (void *)__get_free_pages(flags | __GFP_COMP,
>  							get_order(size));
>  
> @@ -214,7 +214,7 @@ void *kmem_cache_alloc_node(struct kmem_
>  static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
>  {
>  	if (__builtin_constant_p(size) &&
> -		size <= PAGE_SIZE / 2 && !(flags & SLUB_DMA)) {
> +		size <= PAGE_SIZE && !(flags & SLUB_DMA)) {
>  			struct kmem_cache *s = kmalloc_slab(size);
>  
>  		if (!s)
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2008-02-09 13:04:48.472203975 -0800
> +++ linux-2.6/mm/slub.c	2008-02-09 13:14:43.786633258 -0800
> @@ -1919,6 +1919,15 @@ static inline int calculate_order(int si
>  	int fraction;
>  
>  	/*
> +	 * Cover up bad performance of page allocator fastpath vs
> +	 * slab allocator fastpaths. Take the largest order reasonable
> +	 * in order to be able to avoid partial list overhead.
> +	 *
> +	 * This yields 8 4k objects per 32k slab allocation.
> +	 */
> +	if (size == PAGE_SIZE)
> +		return PAGE_ALLOC_COSTLY_ORDER;
> +	/*
>  	 * Attempt to find best configuration for a slab. This
>  	 * works by first attempting to generate a layout with
>  	 * the best configuration and backing off gradually.
> @@ -2484,11 +2493,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
>   *		Kmalloc subsystem
>   *******************************************************************/
>  
> -struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
> +struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
>  EXPORT_SYMBOL(kmalloc_caches);
>  
>  #ifdef CONFIG_ZONE_DMA
> -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
> +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
>  #endif
>  
>  static int __init setup_slub_min_order(char *str)
> @@ -2670,7 +2679,7 @@ void *__kmalloc(size_t size, gfp_t flags
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(flags | __GFP_COMP,
>  							get_order(size));
>  
> @@ -2688,7 +2697,7 @@ void *__kmalloc_node(size_t size, gfp_t 
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(flags | __GFP_COMP,
>  							get_order(size));
>  
> @@ -3001,7 +3010,7 @@ void __init kmem_cache_init(void)
>  		caches++;
>  	}
>  
> -	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
> +	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
>  		create_kmalloc_cache(&kmalloc_caches[i],
>  			"kmalloc", 1 << i, GFP_KERNEL);
>  		caches++;
> @@ -3028,7 +3037,7 @@ void __init kmem_cache_init(void)
>  	slab_state = UP;
>  
>  	/* Provide the correct kmalloc names now that the caches are up */
> -	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
> +	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
>  		kmalloc_caches[i]. name =
>  			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
>  
> @@ -3218,7 +3227,7 @@ void *__kmalloc_track_caller(size_t size
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
>  							get_order(size));
>  	s = get_slab(size, gfpflags);
> @@ -3234,7 +3243,7 @@ void *__kmalloc_node_track_caller(size_t
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
>  							get_order(size));
>  	s = get_slab(size, gfpflags);
> 
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: SLUB tbench regression due to page allocator deficiency
  2008-02-09 21:45 SLUB tbench regression due to page allocator deficiency Christoph Lameter
  2008-02-09 22:35 ` Andrew Morton
  2008-02-11 13:50 ` Mel Gorman
@ 2008-02-13 11:15 ` Mel Gorman
  2 siblings, 0 replies; 24+ messages in thread
From: Mel Gorman @ 2008-02-13 11:15 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, akpm, Nick Piggin, Pekka J Enberg

On (09/02/08 13:45), Christoph Lameter didst pronounce:
> I have been chasing the tbench regression (1-4%) for two weeks now and 
> even after I added statistics I could only verify that behavior was just 
> optimal.
> 
> None of the tricks that I threw at the problem changed anything until I 
> realized that the tbench load depends heavily on 4k allocations that SLUB 
> hands off to the page allocator (SLAB handles 4k itself). I extended the 
> kmalloc array to 4k and I got:
> 
> christoph@stapp:~$ slabinfo -AD
> Name                   Objects    Alloc     Free   %Fast
> :0004096                   180 665259550 665259415  99  99
> skbuff_fclone_cache         46 665196592 665196592  99  99
> :0000192                  2575 31232665 31230129  99  99
> :0001024                   854 31204838 31204006  99  99
> vm_area_struct            1093   108941   107954  91  17
> dentry                    7738    26248    18544  92  43
> :0000064                  2179    19208    17287  97  73
> 
> So the kmalloc-4096 is heavily used. If I give the 4k objects a reasonable 
> allocation size in slub (PAGE_ALLOC_COSTLY_ORDER) then the fastpath of 
> SLUB becomes effective for 4k allocs and then SLUB is faster than SLAB 
> here.
> 
> Performance on tbench (Dual Quad 8p 8G):
> 
> SLAB		2223.32 MB/sec
> SLUB unmodified	2144.36 MB/sec
> SLUB+patch	2245.56 MB/sec (stats still active so this isnt optimal yet)
> 

I ran similar tests for tbench and also sysbench as it is closer to a
real workload. I have results from other tests as well although the oddest
was HackBench which showed +/- 30% performance gains/losses depending on
the machine.

In the results, the tbench comparisons are between slab and SLUB-lameter
which is the first patch posted in this thread. I posted up the figures
of slab vs slub-vanilla already. sysbench compares 2.6.23, 2.6.24-slab,
2.6.24-slub-vanilla and 2.6.24-slub-lameter.

Short answer: slub-lameter appears to be usually a win in many cases over
slab. However, such different behaviours between machines on even small
tests is something to be wary of. In one machine, this patch makes SLUB
slower but it was not typical.

Note that sysbench was not run everywhere as some crinkles in the
automation that prevent it running everywhere are still being ironed
out. Incidentally, its scalability sucks. Over 8 threads, performance
starts dropping sharply but I haven't checked out a different userspace
allocator yet as the system malloc was identified as a problem in the past
(http://ozlabs.org/~anton/linux/sysbench/).

elm3a238:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
elm3a238-             Min                         Average                     Max                         Std. Deviation              
elm3a238-             --------------------------- --------------------------- --------------------------- ----------------------------
elm3a238-clients-1       80.00/82.66    (  3.22%)    84.22/83.43    ( -0.95%)    84.77/83.84    ( -1.11%)     1.00/0.25     (  75.20%)
elm3a238-clients-1       84.00/83.01    ( -1.20%)    84.41/83.42    ( -1.18%)    84.87/83.73    ( -1.36%)     0.20/0.22     ( -11.47%)
elm3a238-clients-2      115.71/114.89   ( -0.71%)   117.03/115.25   ( -1.55%)   117.71/115.79   ( -1.65%)     0.41/0.25     (  39.51%)
elm3a238-clients-4      116.37/113.40   ( -2.63%)   116.81/113.78   ( -2.66%)   117.24/114.24   ( -2.62%)     0.24/0.21     (  13.45%)
elm3a238-
sysbench: http://www.csn.ul.ie/~mel/postings/tsysbench-20080213/elm3a238-comparison.ps

Still showing a small regression against SLAB here. However, sysbench tells
a different story. 2.6.23 was fastest but 2.6.24-slub-vanilla was faster
than slab on 2.6.24. This patch made sysbench at least slower on this machine.
==

elm3a69:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
elm3a69-             Min                         Average                     Max                         Std. Deviation              
elm3a69-             --------------------------- --------------------------- --------------------------- ----------------------------
elm3a69-clients-1      174.21/173.15   ( -0.62%)   174.83/174.13   ( -0.41%)   175.52/175.22   ( -0.17%)     0.39/0.50     ( -29.83%)
elm3a69-clients-1      173.73/173.94   (  0.12%)   175.10/174.35   ( -0.43%)   175.97/174.92   ( -0.60%)     0.52/0.22     (  57.76%)
elm3a69-clients-2      261.58/256.71   ( -1.90%)   299.03/301.13   (  0.70%)   319.82/318.36   ( -0.46%)    23.85/23.56    (   1.22%)
elm3a69-clients-4      312.55/308.88   ( -1.19%)   316.44/313.57   ( -0.92%)   319.31/316.10   ( -1.02%)     1.61/1.76     (  -9.28%)
elm3a69-
sysbench: unavailable

The patch makes SLUB and SLAB comparable on this machine. For example,
with 2 clients, it was previously a 4.24% regression and here it shows a
0.70% gain. However, the difference between kernels is within the standard
deviation of multiple runs so the only conclusion is to say that with the
patch the two allocators become comparable.

==
elm3b133:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
elm3b133-             Min                         Average                     Max                         Std. Deviation              
elm3b133-             --------------------------- --------------------------- --------------------------- ----------------------------
elm3b133-clients-1       28.00/28.25    (  0.89%)    28.23/28.46    (  0.81%)    28.42/28.63    (  0.73%)     0.09/0.11     ( -22.52%)
elm3b133-clients-2       52.59/52.70    (  0.20%)    53.33/53.86    (  0.98%)    53.93/54.97    (  1.89%)     0.45/0.52     ( -16.18%)
elm3b133-clients-4      111.24/110.89   ( -0.31%)   112.75/114.51   (  1.53%)   114.07/115.77   (  1.46%)     0.78/1.38     ( -76.86%)
elm3b133-clients-8      110.03/110.13   (  0.09%)   110.57/110.99   (  0.38%)   111.06/111.55   (  0.44%)     0.25/0.33     ( -34.68%)
sysbench: unavailable

The patch is clearly a win on this machine. Regressions were between
3.7% and 5.6%. With the patch applied, it's mainly gains.

==
elm3b19:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
elm3b19-             Min                         Average                     Max                         Std. Deviation              
elm3b19-             --------------------------- --------------------------- --------------------------- ----------------------------
elm3b19-clients-1      115.39/119.30   (  3.27%)   118.53/124.61   (  4.88%)   126.01/158.10   ( 20.29%)     2.79/8.61     (-208.87%)
elm3b19-clients-1      116.56/117.51   (  0.81%)   120.83/123.23   (  1.95%)   131.91/130.33   ( -1.21%)     3.68/3.24     (  11.85%)
elm3b19-clients-2      255.60/350.33   ( 27.04%)   345.43/365.53   (  5.50%)   366.62/375.17   (  2.28%)    27.64/7.05     (  74.49%)
elm3b19-clients-4      323.56/324.04   (  0.15%)   334.94/334.64   ( -0.09%)   344.60/339.71   ( -1.44%)     4.98/3.73     (  25.05%)
sysbench: unavalable

The patch is even more clearly a win on this machine for tbench. Went from
losses of between 0.9% and 6.8% to decent gains in some cases.

==
elm3b6:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
elm3b6-             Min                         Average                     Max                         Std. Deviation              
elm3b6-             --------------------------- --------------------------- --------------------------- ----------------------------
elm3b6-clients-1      138.88/143.41   (  3.16%)   152.46/154.96   (  1.62%)   176.49/179.71   (  1.79%)    10.05/10.84    (  -7.88%)
elm3b6-clients-2      264.83/263.46   ( -0.52%)   289.35/290.40   (  0.36%)   316.65/337.88   (  6.28%)    12.49/19.85    ( -58.99%)
elm3b6-clients-4      704.21/642.25   ( -9.65%)   751.06/764.37   (  1.74%)   778.11/812.44   (  4.23%)    20.93/45.31    (-116.53%)
elm3b6-clients-8      635.54/650.29   (  2.27%)   732.18/745.58   (  1.80%)   799.16/794.92   ( -0.53%)    49.19/42.10    (  14.43%)
sysbench:http://www.csn.ul.ie/~mel/postings/tsysbench-20080213/elm3b6-comparison.ps

Again solid gains. Went from losses of around 2% to gains of of around
1%. sysbench is less clear cut but slub-lameter inches slightly ahead more
often than not versus slab.

==
gekko-lp1:TBench Throughput Comparisons (2.6.24.2-slab/2.6.24.2-slub-lameter)
gekko-lp1-             Min                         Average                     Max                         Std. Deviation              
gekko-lp1-             --------------------------- --------------------------- --------------------------- ----------------------------
gekko-lp1-clients-1      169.17/176.54   (  4.18%)   198.45/213.08   (  6.87%)   219.36/227.24   (  3.47%)    16.80/14.73    (  12.31%)
gekko-lp1-clients-2      308.51/319.39   (  3.41%)   323.06/329.19   (  1.86%)   333.15/337.09   (  1.17%)     7.04/4.43     (  37.10%)
gekko-lp1-clients-4      465.10/390.12   (-19.22%)   494.48/493.46   ( -0.21%)   508.70/516.23   (  1.46%)    11.28/33.51    (-196.97%)
gekko-lp1-clients-8      476.20/435.68   ( -9.30%)   494.68/505.39   (  2.12%)   504.11/513.86   (  1.90%)     8.89/16.92    ( -90.46%)
sysbench: unavailable

Once again, losses of up to 6% without the patch to gains of 6.8% with. Win.

==
gekko-lp4:TBench Throughput Comparisons (2.6.24.2-slab/2.6.24.2-slub-lameter)
gekko-lp4-             Min                         Average                     Max                         Std. Deviation              
gekko-lp4-             --------------------------- --------------------------- --------------------------- ----------------------------
gekko-lp4-clients-1      167.17/190.43   ( 12.21%)   167.96/190.72   ( 11.93%)   169.27/191.04   ( 11.40%)     0.42/0.18     (  58.78%)
gekko-lp4-clients-1      166.89/190.70   ( 12.48%)   167.88/191.25   ( 12.22%)   169.13/192.56   ( 12.17%)     0.47/0.52     (  -9.98%)
gekko-lp4-clients-2      250.79/300.33   ( 16.50%)   257.55/305.09   ( 15.58%)   260.71/309.14   ( 15.67%)     2.50/2.61     (  -4.49%)
gekko-lp4-clients-4      258.46/297.84   ( 13.22%)   259.18/303.32   ( 14.55%)   259.76/307.08   ( 15.41%)     0.44/2.62     (-494.48%)
sysbench: http://www.csn.ul.ie/~mel/postings/tsysbench-20080213/gekko-lp4-comparison.ps

Big gains here in tbench with the patch. Annoyingly, I find as I write this
I don't have tbench figures comparing slab with vanilla slub but I have no
reason to believe there is anything anomalous there. sysbench shows that SLUB
wins big over 2.6.24-slab on this machine although oddly it's only comparable
with 2.6.23-slab. The patch does not show any significant difference between
the two slub comparisons. So, on this machine the patch doesn't hurt.
==

bl6-13:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
bl6-13-             Min                         Average                     Max                         Std. Deviation              
bl6-13-             --------------------------- --------------------------- --------------------------- ----------------------------
bl6-13-clients-1      178.31/164.15   ( -8.63%)   204.12/186.17   ( -9.64%)   265.64/230.74   (-15.12%)    24.44/19.71    (  19.35%)
bl6-13-clients-2      305.58/304.75   ( -0.27%)   346.98/341.09   ( -1.72%)   406.30/405.27   ( -0.25%)    19.77/25.33    ( -28.15%)
bl6-13-clients-4      868.05/839.52   ( -3.40%)   990.49/893.36   (-10.87%)  1054.08/970.95   ( -8.56%)    47.59/37.87    (  20.42%)
bl6-13-clients-8      927.69/770.35   (-20.42%)  1003.17/894.60   (-12.14%)  1030.28/930.52   (-10.72%)    23.46/33.32    ( -42.05%)
sysbench: http://www.csn.ul.ie/~mel/postings/tsysbench-20080213/bl6-13-comparison.ps

Even with the patch, tbench blows on this machine. Without the patch it was
between 7% and 21% regression though so it's still an improvement. It's
worth noting that this machine routinely shows up big difference between
kernel versions with all small benchmarks so it's hard to draw a conclusion
from tbench here. sysbench shows significant gains over 2.6.23-slab in all
cases. 2.6.24-slab is marginally better than slub and the patch makes no
big difference to sysbench on this machine. Like, gekko-lp4 - the patch
doesn't hurt.
==

elm3a203:TBench Throughput Comparisons (2.6.24-slab/2.6.24-slub-lameter)
elm3a203-             Min                         Average                     Max                         Std. Deviation              
elm3a203-             --------------------------- --------------------------- --------------------------- ----------------------------
elm3a203-clients-1      111.14/108.56   ( -2.37%)   112.46/109.65   ( -2.56%)   113.08/110.40   ( -2.43%)     0.50/0.38     (  23.06%)
elm3a203-clients-1      111.57/108.41   ( -2.91%)   112.64/109.63   ( -2.75%)   113.48/110.56   ( -2.64%)     0.49/0.50     (  -1.53%)
elm3a203-clients-2      148.50/144.59   ( -2.70%)   151.82/147.48   ( -2.94%)   153.75/149.10   ( -3.12%)     1.27/1.31     (  -2.91%)
elm3a203-clients-4      146.39/140.32   ( -4.32%)   148.80/142.32   ( -4.55%)   150.70/143.56   ( -4.97%)     0.87/0.87     (  -0.99%)
elm3a203-
sysbench: unavailable

SLUB is a loss on this machine but similar to bl6-13, it went from regressions
of 10-13% to regressions of 2-4% so it is still an improvement.

==

So at the end of all that, it is very clear that modifications to this path
are as not as clear-cut a win/loss as one might like. Despite the lack of
clarity, the patch appears to be a plus on the balance in many cases so

Acked-by: Mel Gorman <mel@csn.ul.ie>

> 4k allocations cannot optimally be handled by SLUB if we are restricted to 
> order 0 allocs because the fastpath only handles fractions of one 
> allocation unit and if the allocation unit is 4k then we only have one 
> object per slab.
> 
> Isnt there a way that we can make the page allocator handle PAGE_SIZEd 
> allocations in such a way that is competitive with the slab allocators? 
> The cycle count for an allocation needs to be <100 not just below 1000 as 
> it is now.
> 
> ---
>  include/linux/slub_def.h |    6 +++---
>  mm/slub.c                |   25 +++++++++++++++++--------
>  2 files changed, 20 insertions(+), 11 deletions(-)
> 
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h	2008-02-09 13:04:48.464203968 -0800
> +++ linux-2.6/include/linux/slub_def.h	2008-02-09 13:08:37.413120259 -0800
> @@ -110,7 +110,7 @@ struct kmem_cache {
>   * We keep the general caches in an array of slab caches that are used for
>   * 2^x bytes of allocations.
>   */
> -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT];
> +extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1];
>  
>  /*
>   * Sorry that the following has to be that ugly but some versions of GCC
> @@ -191,7 +191,7 @@ void *__kmalloc(size_t size, gfp_t flags
>  static __always_inline void *kmalloc(size_t size, gfp_t flags)
>  {
>  	if (__builtin_constant_p(size)) {
> -		if (size > PAGE_SIZE / 2)
> +		if (size > PAGE_SIZE)
>  			return (void *)__get_free_pages(flags | __GFP_COMP,
>  							get_order(size));
>  
> @@ -214,7 +214,7 @@ void *kmem_cache_alloc_node(struct kmem_
>  static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
>  {
>  	if (__builtin_constant_p(size) &&
> -		size <= PAGE_SIZE / 2 && !(flags & SLUB_DMA)) {
> +		size <= PAGE_SIZE && !(flags & SLUB_DMA)) {
>  			struct kmem_cache *s = kmalloc_slab(size);
>  
>  		if (!s)
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2008-02-09 13:04:48.472203975 -0800
> +++ linux-2.6/mm/slub.c	2008-02-09 13:14:43.786633258 -0800
> @@ -1919,6 +1919,15 @@ static inline int calculate_order(int si
>  	int fraction;
>  
>  	/*
> +	 * Cover up bad performance of page allocator fastpath vs
> +	 * slab allocator fastpaths. Take the largest order reasonable
> +	 * in order to be able to avoid partial list overhead.
> +	 *
> +	 * This yields 8 4k objects per 32k slab allocation.
> +	 */
> +	if (size == PAGE_SIZE)
> +		return PAGE_ALLOC_COSTLY_ORDER;
> +	/*
>  	 * Attempt to find best configuration for a slab. This
>  	 * works by first attempting to generate a layout with
>  	 * the best configuration and backing off gradually.
> @@ -2484,11 +2493,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
>   *		Kmalloc subsystem
>   *******************************************************************/
>  
> -struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
> +struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
>  EXPORT_SYMBOL(kmalloc_caches);
>  
>  #ifdef CONFIG_ZONE_DMA
> -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
> +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
>  #endif
>  
>  static int __init setup_slub_min_order(char *str)
> @@ -2670,7 +2679,7 @@ void *__kmalloc(size_t size, gfp_t flags
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(flags | __GFP_COMP,
>  							get_order(size));
>  
> @@ -2688,7 +2697,7 @@ void *__kmalloc_node(size_t size, gfp_t 
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(flags | __GFP_COMP,
>  							get_order(size));
>  
> @@ -3001,7 +3010,7 @@ void __init kmem_cache_init(void)
>  		caches++;
>  	}
>  
> -	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
> +	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
>  		create_kmalloc_cache(&kmalloc_caches[i],
>  			"kmalloc", 1 << i, GFP_KERNEL);
>  		caches++;
> @@ -3028,7 +3037,7 @@ void __init kmem_cache_init(void)
>  	slab_state = UP;
>  
>  	/* Provide the correct kmalloc names now that the caches are up */
> -	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
> +	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
>  		kmalloc_caches[i]. name =
>  			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
>  
> @@ -3218,7 +3227,7 @@ void *__kmalloc_track_caller(size_t size
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
>  							get_order(size));
>  	s = get_slab(size, gfpflags);
> @@ -3234,7 +3243,7 @@ void *__kmalloc_node_track_caller(size_t
>  {
>  	struct kmem_cache *s;
>  
> -	if (unlikely(size > PAGE_SIZE / 2))
> +	if (unlikely(size > PAGE_SIZE))
>  		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
>  							get_order(size));
>  	s = get_slab(size, gfpflags);
> 
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2008-02-13 20:09 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-02-09 21:45 SLUB tbench regression due to page allocator deficiency Christoph Lameter
2008-02-09 22:35 ` Andrew Morton
2008-02-10  0:19   ` Christoph Lameter
2008-02-10  2:45     ` Nick Piggin
2008-02-10  3:36       ` Christoph Lameter
2008-02-10  3:39       ` Christoph Lameter
2008-02-10 23:24         ` Nick Piggin
2008-02-11 19:14           ` Christoph Lameter
2008-02-11 22:03           ` Christoph Lameter
2008-02-11  7:18         ` Nick Piggin
2008-02-11 19:21           ` Christoph Lameter
2008-02-11 23:40             ` Nick Piggin
2008-02-11 23:42               ` Christoph Lameter
2008-02-11 23:56                 ` Nick Piggin
2008-02-12  0:08                   ` Christoph Lameter
2008-02-12  6:06                   ` Fastpath prototype? Christoph Lameter
2008-02-12 10:40                     ` Andi Kleen
2008-02-12 20:10                       ` Christoph Lameter
2008-02-12 22:31                         ` Christoph Lameter
2008-02-13 11:38                           ` Andi Kleen
2008-02-13 20:09                             ` Christoph Lameter
2008-02-13 18:33                   ` SLUB tbench regression due to page allocator deficiency Paul Jackson
2008-02-11 13:50 ` Mel Gorman
2008-02-13 11:15 ` Mel Gorman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).