public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Use numa policy API for boot time policy
@ 2004-06-05  1:43 Andi Kleen
  2004-06-05  1:56 ` Manfred Spraul
  0 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2004-06-05  1:43 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, manfred


Suggested by Manfred Spraul.

__get_free_pages had a hack to do node interleaving allocation at boot time.
This patch sets an interleave process policy using the NUMA API for init
and the idle threads instead. Before entering the user space init the policy
is reset to default again. Result is the same.

Advantage is less code and removing of a check from a fast path.

Removes more code than it adds.

I verified that the memory distribution after boot is roughly the same.

diff -u linux-2.6.7rc2-work/include/linux/mempolicy.h-o linux-2.6.7rc2-work/include/linux/mempolicy.h
--- linux-2.6.7rc2-work/include/linux/mempolicy.h-o	2004-05-31 23:22:36.000000000 +0200
+++ linux-2.6.7rc2-work/include/linux/mempolicy.h	2004-06-05 00:40:54.000000000 +0200
@@ -153,6 +153,9 @@
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
+extern void numa_default_policy(void);
+extern void numa_policy_init(void);
+
 #else
 
 struct mempolicy {};
@@ -215,6 +218,14 @@
 #define vma_policy(vma) NULL
 #define vma_set_policy(vma, pol) do {} while(0)
 
+static inline void numa_policy_init(void)
+{
+}
+
+static inline void numa_default_policy(void)
+{
+}
+
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff -u linux-2.6.7rc2-work/mm/page_alloc.c-o linux-2.6.7rc2-work/mm/page_alloc.c
--- linux-2.6.7rc2-work/mm/page_alloc.c-o	2004-05-31 23:22:37.000000000 +0200
+++ linux-2.6.7rc2-work/mm/page_alloc.c	2004-05-31 23:35:32.000000000 +0200
@@ -732,53 +732,12 @@
 
 EXPORT_SYMBOL(__alloc_pages);
 
-#ifdef CONFIG_NUMA
-/* Early boot: Everything is done by one cpu, but the data structures will be
- * used by all cpus - spread them on all nodes.
- */
-static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
-{
-static int nodenr;
-	int i = nodenr;
-	struct page *page;
-
-	for (;;) {
-		if (i > nodenr + numnodes)
-			return 0;
-		if (node_present_pages(i%numnodes)) {
-			struct zone **z;
-			/* The node contains memory. Check that there is
-			 * memory in the intended zonelist.
-			 */
-			z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
-			while (*z) {
-				if ( (*z)->free_pages > (1UL<<order))
-					goto found_node;
-				z++;
-			}
-		}
-		i++;
-	}
-found_node:
-	nodenr = i+1;
-	page = alloc_pages_node(i%numnodes, gfp_mask, order);
-	if (!page)
-		return 0;
-	return (unsigned long) page_address(page);
-}
-#endif
-
 /*
  * Common helper functions.
  */
 fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 {
 	struct page * page;
-
-#ifdef CONFIG_NUMA
-	if (unlikely(system_state == SYSTEM_BOOTING))
-		return get_boot_pages(gfp_mask, order);
-#endif
 	page = alloc_pages(gfp_mask, order);
 	if (!page)
 		return 0;
diff -u linux-2.6.7rc2-work/mm/mempolicy.c-o linux-2.6.7rc2-work/mm/mempolicy.c
--- linux-2.6.7rc2-work/mm/mempolicy.c-o	2004-05-31 23:22:55.000000000 +0200
+++ linux-2.6.7rc2-work/mm/mempolicy.c	2004-06-05 00:50:50.000000000 +0200
@@ -1001,7 +1001,8 @@
 	up(&p->sem);
 }
 
-static __init int numa_policy_init(void)
+/* assumes fs == KERNEL_DS */
+void __init numa_policy_init(void)
 {
 	policy_cache = kmem_cache_create("numa_policy",
 					 sizeof(struct mempolicy),
@@ -1010,6 +1011,17 @@
 	sn_cache = kmem_cache_create("shared_policy_node",
 				     sizeof(struct sp_node),
 				     0, SLAB_PANIC, NULL, NULL);
-	return 0;
+
+	/* Set interleaving policy for system init. This way not all 
+	   the data structures allocated at system boot end up in node zero. */
+
+	if (sys_set_mempolicy(MPOL_INTERLEAVE, &node_online_map, MAX_NUMNODES) < 0) 
+		printk("numa_policy_init: interleaving failed\n");
+}
+
+/* Reset policy of current process to default.
+ * Assumes fs == KERNEL_DS */
+void __init numa_default_policy(void)
+{
+	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
 }
-module_init(numa_policy_init);
diff -u linux-2.6.7rc2-work/init/main.c-o linux-2.6.7rc2-work/init/main.c
--- linux-2.6.7rc2-work/init/main.c-o	2004-05-31 23:22:55.000000000 +0200
+++ linux-2.6.7rc2-work/init/main.c	2004-06-02 03:45:14.000000000 +0200
@@ -43,6 +43,7 @@
 #include <linux/efi.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
+#include <linux/mempolicy.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -385,6 +386,7 @@
 static void noinline rest_init(void)
 {
 	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
+	numa_default_policy();
 	unlock_kernel();
  	cpu_idle();
 } 
@@ -456,6 +458,7 @@
 #endif
 	mem_init();
 	kmem_cache_init();
+	numa_policy_init();
 	if (late_time_init)
 		late_time_init();
 	calibrate_delay();
@@ -645,6 +648,7 @@
 	free_initmem();
 	unlock_kernel();
 	system_state = SYSTEM_RUNNING;
+	numa_default_policy();
 
 	if (sys_open("/dev/console", O_RDWR, 0) < 0)
 		printk("Warning: unable to open an initial console.\n");

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05  1:43 [PATCH] Use numa policy API for boot time policy Andi Kleen
@ 2004-06-05  1:56 ` Manfred Spraul
  2004-06-05  2:18   ` Andi Kleen
  0 siblings, 1 reply; 11+ messages in thread
From: Manfred Spraul @ 2004-06-05  1:56 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel

Andi Kleen wrote:

>Suggested by Manfred Spraul.
>
>__get_free_pages had a hack to do node interleaving allocation at boot time.
>This patch sets an interleave process policy using the NUMA API for init
>and the idle threads instead. Before entering the user space init the policy
>is reset to default again. Result is the same.
>
>Advantage is less code and removing of a check from a fast path.
>
>Removes more code than it adds.
>
>I verified that the memory distribution after boot is roughly the same.
>
>  
>
Does it work for order != 0 allocations? It's important that the big 
hash tables do not end up all in node 0. AFAICS alloc_pages_current() 
calls interleave_nodes() only for order==0 allocs.

--
    Manfred


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05  1:56 ` Manfred Spraul
@ 2004-06-05  2:18   ` Andi Kleen
  2004-06-05  2:32     ` Anton Blanchard
  2004-06-05 10:20     ` Manfred Spraul
  0 siblings, 2 replies; 11+ messages in thread
From: Andi Kleen @ 2004-06-05  2:18 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: akpm, linux-kernel

On Sat, 05 Jun 2004 03:56:53 +0200
Manfred Spraul <manfred@colorfullife.com> wrote:

> Andi Kleen wrote:
> 
> >Suggested by Manfred Spraul.
> >
> >__get_free_pages had a hack to do node interleaving allocation at boot time.
> >This patch sets an interleave process policy using the NUMA API for init
> >and the idle threads instead. Before entering the user space init the policy
> >is reset to default again. Result is the same.
> >
> >Advantage is less code and removing of a check from a fast path.
> >
> >Removes more code than it adds.
> >
> >I verified that the memory distribution after boot is roughly the same.
> >
> >  
> >
> Does it work for order != 0 allocations? It's important that the big 
> hash tables do not end up all in node 0. AFAICS alloc_pages_current() 
> calls interleave_nodes() only for order==0 allocs.

That's correct. It will only work for order 0 allocations.

But it sounds quite bogus anyways to move the complete hash tables
to another node. It would probably be better to use vmalloc() 
and an interleaving mapping for it. Then you would get the NUMA bandwidth 
benefit even for accessing single tables.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05  2:18   ` Andi Kleen
@ 2004-06-05  2:32     ` Anton Blanchard
  2004-06-05 10:22       ` Andi Kleen
  2004-06-05 10:20     ` Manfred Spraul
  1 sibling, 1 reply; 11+ messages in thread
From: Anton Blanchard @ 2004-06-05  2:32 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Manfred Spraul, akpm, linux-kernel

 
Hi,

> That's correct. It will only work for order 0 allocations.
> 
> But it sounds quite bogus anyways to move the complete hash tables
> to another node anyways. It would probably be better to use vmalloc() 
> and a interleaving mapping for it. Then you would get the NUMA bandwidth 
> benefit even for accessing single tables.

I posted some before and after numbers when we merged Manfreds patch,
it would be interesting to see the same thing with your patch applied.

Im not only worried about NUMA bandwidth but keeping the amount of
memory left in all the nodes reasonably even. Allocating all the big
hashes on node 0 will decrease that balance.

Anton

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05  2:18   ` Andi Kleen
  2004-06-05  2:32     ` Anton Blanchard
@ 2004-06-05 10:20     ` Manfred Spraul
  2004-06-05 10:33       ` Andi Kleen
  1 sibling, 1 reply; 11+ messages in thread
From: Manfred Spraul @ 2004-06-05 10:20 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel

Andi Kleen wrote:

>On Sat, 05 Jun 2004 03:56:53 +0200
>Manfred Spraul <manfred@colorfullife.com> wrote:
>  
>
>>Does it work for order != 0 allocations? It's important that the big 
>>hash tables do not end up all in node 0. AFAICS alloc_pages_current() 
>>calls interleave_nodes() only for order==0 allocs.
>>    
>>
>
>That's correct. It will only work for order 0 allocations.
>
>  
>
What's the purpose of the "&& order == 0)" test for MPOL_INTERLEAVE in 
alloc_pages_current?
What would break if it's removed?

And what about in_interrupt() allocations? During boot everything should 
be interleaved - I'd modify default_policy to MPOL_INTERLEAVE instead of 
setting process affinity.

--
    Manfred

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05  2:32     ` Anton Blanchard
@ 2004-06-05 10:22       ` Andi Kleen
  2004-06-05 10:48         ` Andi Kleen
  2004-06-09 15:44         ` Anton Blanchard
  0 siblings, 2 replies; 11+ messages in thread
From: Andi Kleen @ 2004-06-05 10:22 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: manfred, akpm, linux-kernel

On Sat, 5 Jun 2004 12:32:12 +1000
Anton Blanchard <anton@samba.org> wrote:

>  
> Hi,
> 
> > That's correct. It will only work for order 0 allocations.
> > 
> > But it sounds quite bogus anyways to move the complete hash tables
> > to another node anyways. It would probably be better to use vmalloc() 
> > and a interleaving mapping for it. Then you would get the NUMA bandwidth 
> > benefit even for accessing single tables.
> 
> I posted some before and after numbers when we merged Manfreds patch,
> it would be interesting to see the same thing with your patch applied.
> 
> Im not only worried about NUMA bandwidth but keeping the amount of
> memory left in all the nodes reasonably even. Allocating all the big
> hashes on node 0 will decrease that balance.

It would be a one-liner change to allow process policy interleaving 
for orders > 0 in mempolicy. But I'm not sure how useful it is, since
the granularity would be really bad.

Have you ever tried to switch to implement a vmalloc_interleave() for these
tables instead? My bet is that it will perform better.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05 10:20     ` Manfred Spraul
@ 2004-06-05 10:33       ` Andi Kleen
  0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2004-06-05 10:33 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: akpm, linux-kernel

On Sat, 05 Jun 2004 12:20:53 +0200
Manfred Spraul <manfred@colorfullife.com> wrote:

> Andi Kleen wrote:
> 
> >On Sat, 05 Jun 2004 03:56:53 +0200
> >Manfred Spraul <manfred@colorfullife.com> wrote:
> >  
> >
> >>Does it work for order != 0 allocations? It's important that the big 
> >>hash tables do not end up all in node 0. AFAICS alloc_pages_current() 
> >>calls interleave_nodes() only for order==0 allocs.
> >>    
> >>
> >
> >That's correct. It will only work for order 0 allocations.
> >
> >  
> >
> What's the purpose of the "&& order == 0)" test for MPOL_INTERLEAVE in 
> alloc_pages_current?
> What would break if it's removed?

Nothing. Just the interleaving will not be very good.
Only the vma interleaving relies on order 0 right now.

But I would really try to use vmalloc() for this. In fact you don't
even need vmalloc_interleaved(), because the normal vmalloc allocation
together with the interleave policy should do the right thing.

> 
> And what about in_interrupt() allocations? During boot everything should 
> be interleaved - I'd modify default_policy to MPOL_INTERLEAVE instead of 
> setting process affinity.

Better don't do that. It may break some subtle assumptions.

I guess the in_interrupt() allocations will have to live with that.
They should be relatively rare.

In theory you could add a system_state == SYSTEM_BOOTING check again,
but polluting the fast path for this would be imho overkill.

-Andi
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05 10:22       ` Andi Kleen
@ 2004-06-05 10:48         ` Andi Kleen
  2004-06-09 15:44         ` Anton Blanchard
  1 sibling, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2004-06-05 10:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: anton, manfred, akpm, linux-kernel

On Sat, 5 Jun 2004 12:22:39 +0200
Andi Kleen <ak@suse.de> wrote:


> Have you ever tried to switch to implement a vmalloc_interleave() for these
> tables instead? My bet is that it will perform better.

Actually vmalloc_interleaved() is not needed. With process interleaving
policy an ordinary vmalloc() should do that already.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-05 10:22       ` Andi Kleen
  2004-06-05 10:48         ` Andi Kleen
@ 2004-06-09 15:44         ` Anton Blanchard
  2004-06-09 15:56           ` Andi Kleen
  1 sibling, 1 reply; 11+ messages in thread
From: Anton Blanchard @ 2004-06-09 15:44 UTC (permalink / raw)
  To: Andi Kleen; +Cc: manfred, akpm, linux-kernel

 
> It would be a one liner change to allow process policy interleaving 
> for orders > 0 in mempolicy. But I'm not sure how useful it is, since
> the granuality would be really bad.

OK. Id like to take a quick look at order > 0 allocations during boot
to see if its worth it. The ppc64 page size is small and we might be
doing a significant number of order 1 allocations.

> Have you ever tried to switch to implement a vmalloc_interleave() for these
> tables instead? My bet is that it will perform better.

Im warming to this idea. We would need a per arch override, since there
is a trade off here between interleaving and TLB usage.

We also have a problem in 2.6 on our bigger machines where our dcache
hash and inode hash cache are limited to MAX_ORDER (16MB on ppc64). By
using vmalloc would allow us to interleave the memory and allocate more
than 16MB for those hashes.

Anton

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-09 15:44         ` Anton Blanchard
@ 2004-06-09 15:56           ` Andi Kleen
  2004-06-09 16:12             ` Anton Blanchard
  0 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2004-06-09 15:56 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: manfred, akpm, linux-kernel

On Thu, 10 Jun 2004 01:44:29 +1000
Anton Blanchard <anton@samba.org> wrote:

>  
> > It would be a one liner change to allow process policy interleaving 
> > for orders > 0 in mempolicy. But I'm not sure how useful it is, since
> > the granuality would be really bad.
> 
> OK. Id like to take a quick look at order > 0 allocations during boot
> to see if its worth it. The ppc64 page size is small and we might be
> doing a significant number of order 1 allocations.

For what? 

> > Have you ever tried to switch to implement a vmalloc_interleave() for these
> > tables instead? My bet is that it will perform better.
> 
> Im warming to this idea. We would need a per arch override, since there
> is a trade off here between interleaving and TLB usage.

Actually just standard vmalloc is enough. The interleave policy in alloc_pages
will transparently interleave the order 0 pages allocated by vmalloc.

When I find some time I will try that on Opteron too.

> 
> We also have a problem in 2.6 on our bigger machines where our dcache
> hash and inode hash cache are limited to MAX_ORDER (16MB on ppc64). By
> using vmalloc would allow us to interleave the memory and allocate more
> than 16MB for those hashes.

IMHO 16MB hash table for a kernel structure is madness. A different data
structure is probably needed if it's really a problem
(is your dcache that big?). Or maybe just limit the dcache more aggressively
to keep the max number of entries smaller.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use numa policy API for boot time policy
  2004-06-09 15:56           ` Andi Kleen
@ 2004-06-09 16:12             ` Anton Blanchard
  0 siblings, 0 replies; 11+ messages in thread
From: Anton Blanchard @ 2004-06-09 16:12 UTC (permalink / raw)
  To: Andi Kleen; +Cc: manfred, akpm, linux-kernel, jrsantos, mbligh

 
> For what? 

No idea, I just want to convince myself that there arent any out there.

> IMHO 16MB hash table for a kernel structure is madness. A different data
> structure is probably needed if it's really a problem
> (is your dcache that big?). Or maybe just limit the dcache more aggressively
> to keep the max number of entries smaller.

Yep, specSFS (an NFS benchmark) shows this up quite badly. I think Jose
and Martin were looking at strategies for keeping the dcache under
control.

This was on a machine with only 64GB of RAM, if we had an NFS server
with more memory then its reasonable to want more memory dedicated to
dentries. At that point we either need to increase the hash or look at
using another structure.

Anton

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2004-06-09 16:13 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-06-05  1:43 [PATCH] Use numa policy API for boot time policy Andi Kleen
2004-06-05  1:56 ` Manfred Spraul
2004-06-05  2:18   ` Andi Kleen
2004-06-05  2:32     ` Anton Blanchard
2004-06-05 10:22       ` Andi Kleen
2004-06-05 10:48         ` Andi Kleen
2004-06-09 15:44         ` Anton Blanchard
2004-06-09 15:56           ` Andi Kleen
2004-06-09 16:12             ` Anton Blanchard
2004-06-05 10:20     ` Manfred Spraul
2004-06-05 10:33       ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox