public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [Patch] Allocate sparse vmemmap block above 4G
@ 2007-11-07  7:07 Zou Nan hai
  2007-11-07 17:12 ` Dave Hansen
  0 siblings, 1 reply; 7+ messages in thread
From: Zou Nan hai @ 2007-11-07  7:07 UTC (permalink / raw)
  To: LKML; +Cc: Suresh.b.siddha

Try to allocate sparse vmemmap block above 4G on x64 system.

On some single node x64 system with huge amount of physical memory e.g >
64G. the memmap size maybe very big. 

If the memmap is allocated from low pages, it may occupies too much
memory below 4G. 
then swiotlb could fail to reserve bounce buffer under 4G which will
lead to boot failure.

This patch will first try to allocate memmap memory above 4G in sparse
vmemmap code. 
If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
This patch is against 2.6.24-rc1-git14

Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>


diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c	2007-11-06 15:16:12.000000000 +0800
+++ b/arch/x86/mm/init_64.c	2007-11-06 15:55:50.000000000 +0800
@@ -448,6 +448,13 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
+void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+        unsigned long align)
+{
+        return __alloc_bootmem_core(pgdat->bdata, size,
+                        align, (4UL*1024*1024*1024), 0, 1);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Memory is added always to NORMAL zone. This means you will never get
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h	2007-11-06 16:06:31.000000000 +0800
+++ b/include/linux/bootmem.h	2007-11-06 15:50:36.000000000 +0800
@@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
 				  unsigned long limit,
 				  int strict_goal);
 
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
+                unsigned long size,
+                unsigned long align);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c	2007-11-06 16:06:31.000000000 +0800
+++ b/mm/bootmem.c	2007-11-06 15:49:20.000000000 +0800
@@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
 	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
 				    ARCH_LOW_ADDRESS_LIMIT, 0);
 }
+
+__attribute__((weak)) __meminit
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+        unsigned long align)
+{
+        return NULL;
+}
+
diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
--- a/mm/sparse-vmemmap.c	2007-11-06 15:16:12.000000000 +0800
+++ b/mm/sparse-vmemmap.c	2007-11-06 16:08:52.000000000 +0800
@@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
 		if (page)
 			return page_address(page);
 		return NULL;
-	} else
+	} else {
+		void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
+		if (p)
+			return p;
 		return __alloc_bootmem_node(NODE_DATA(node), size, size,
 				__pa(MAX_DMA_ADDRESS));
+	}
 }
 
 void __meminit vmemmap_verify(pte_t *pte, int node,



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Patch] Allocate sparse vmemmap block above 4G
  2007-11-07  7:07 [Patch] Allocate sparse vmemmap block above 4G Zou Nan hai
@ 2007-11-07 17:12 ` Dave Hansen
  0 siblings, 0 replies; 7+ messages in thread
From: Dave Hansen @ 2007-11-07 17:12 UTC (permalink / raw)
  To: Zou Nan hai; +Cc: LKML, Suresh.b.siddha

On Wed, 2007-11-07 at 15:07 +0800, Zou Nan hai wrote:
> Try to allocate sparse vmemmap block above 4G on x64 system.
> 
> On some single node x64 system with huge amount of physical memory e.g
> 64G. the memmap size maybe very big. 

Could we just change the default bootmem behavior to allocate top-down?

-- Dave


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Patch] Allocate sparse vmemmap block above 4G
@ 2007-11-08  0:52 Zou Nan hai
  2007-11-08 14:07 ` Mel Gorman
  0 siblings, 1 reply; 7+ messages in thread
From: Zou Nan hai @ 2007-11-08  0:52 UTC (permalink / raw)
  To: LKML
  Cc: Linus Torvalds, Greg KH, Dave Jones, Martin Ebourne,
	Suresh Siddha, Andi Kleen, Andrew Morton, Christoph Lameter,
	Mel Gorman, Andy Whitcroft

Resend the patch for more people to review

On some single node x64 system with huge amount of physical memory e.g >
64G. the memmap size maybe very big. 

If the memmap is allocated from low pages, it may occupies too much
memory below 4G. 
then swiotlb could fail to reserve bounce buffer under 4G which will
lead to boot failure.

This patch will first try to allocate memmap memory above 4G in sparse
vmemmap code. 
If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
This patch is against 2.6.24-rc1-git14

Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>

diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c	2007-11-06 15:16:12.000000000 +0800
+++ b/arch/x86/mm/init_64.c	2007-11-06 15:55:50.000000000 +0800
@@ -448,6 +448,13 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
+void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+        unsigned long align)
+{
+        return __alloc_bootmem_core(pgdat->bdata, size,
+                        align, (4UL*1024*1024*1024), 0, 1);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Memory is added always to NORMAL zone. This means you will never get
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h	2007-11-06 16:06:31.000000000 +0800
+++ b/include/linux/bootmem.h	2007-11-06 15:50:36.000000000 +0800
@@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
 				  unsigned long limit,
 				  int strict_goal);
 
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
+                unsigned long size,
+                unsigned long align);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c	2007-11-06 16:06:31.000000000 +0800
+++ b/mm/bootmem.c	2007-11-06 15:49:20.000000000 +0800
@@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
 	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
 				    ARCH_LOW_ADDRESS_LIMIT, 0);
 }
+
+__attribute__((weak)) __meminit
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+        unsigned long align)
+{
+        return NULL;
+}
+
diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
--- a/mm/sparse-vmemmap.c	2007-11-06 15:16:12.000000000 +0800
+++ b/mm/sparse-vmemmap.c	2007-11-06 16:08:52.000000000 +0800
@@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
 		if (page)
 			return page_address(page);
 		return NULL;
-	} else
+	} else {
+		void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
+		if (p)
+			return p;
 		return __alloc_bootmem_node(NODE_DATA(node), size, size,
 				__pa(MAX_DMA_ADDRESS));
+	}
 }
 
 void __meminit vmemmap_verify(pte_t *pte, int node,





^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Patch] Allocate sparse vmemmap block above 4G
  2007-11-08  0:52 Zou Nan hai
@ 2007-11-08 14:07 ` Mel Gorman
  2007-11-09  1:28   ` Zou, Nanhai
  0 siblings, 1 reply; 7+ messages in thread
From: Mel Gorman @ 2007-11-08 14:07 UTC (permalink / raw)
  To: Zou Nan hai
  Cc: LKML, Linus Torvalds, Greg KH, Dave Jones, Martin Ebourne,
	Suresh Siddha, Andi Kleen, Andrew Morton, Christoph Lameter,
	Andy Whitcroft

On (08/11/07 08:52), Zou Nan hai didst pronounce:
> Resend the patch for more people to review
> 
> On some single node x64 system with huge amount of physical memory e.g >
> 64G. the memmap size maybe very big. 
> 
> If the memmap is allocated from low pages, it may occupies too much
> memory below 4G. 
> then swiotlb could fail to reserve bounce buffer under 4G which will
> lead to boot failure.
> 
> This patch will first try to allocate memmap memory above 4G in sparse
> vmemmap code. 
> If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
> This patch is against 2.6.24-rc1-git14
> 

You never state that you depend on the strict_goal patch either here or in
a leader mail. The usual way of getting peoples attention is to have three
mails. In your case it would be

[PATCH 0/2] Allocate vmemmap from highmem where possible
[PATCH 1/2] Strictly place bootmem allocations when requested
[PATCH 2/2] Allocate sparse vmemmap block above 4G on x86_64

Maybe you have gone through all this already and I'm coming so late that I
missed it all. If that is the case, sorry for the noise.

> Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> 
> diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> --- a/arch/x86/mm/init_64.c	2007-11-06 15:16:12.000000000 +0800
> +++ b/arch/x86/mm/init_64.c	2007-11-06 15:55:50.000000000 +0800
> @@ -448,6 +448,13 @@ void online_page(struct page *page)
>  	num_physpages++;
>  }
>  
> +void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
> +        unsigned long align)
> +{

Wrong markup there I believe. The __meminit markup is for functions
that are needed at runtime when memory is hot-added or hot-removed.
Bootmem functions do not qualify. __init is sufficient.

The name is confusing as well. I don't know what a high node is, but I
think you mean alloc_bootmem_highmem  or alloc_bootmem_highmem_zone.
That in *itself* is confusing on x86_64 because the memory above 4GiB is
ZONE_NORMAL, not ZONE_HIGHMEM. Calling it alloc_bootmem_nondma() makes
it worse.

I think you need to rework this to have an arch-specific function
like arch_alloc_vmemmap_block() that by default allocates with
____alloc_bootmem_node() and otherwise uses an arch-specific function.
In this case, it would know to call __alloc_bootmem_core() with the proper
addressing limits.

> +        return __alloc_bootmem_core(pgdat->bdata, size,
> +                        align, (4UL*1024*1024*1024), 0, 1);
> +}

More magic values, both the 4GiB address here and the magic "1" at the
end are problems.

> +
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  /*
>   * Memory is added always to NORMAL zone. This means you will never get
> diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
> --- a/include/linux/bootmem.h	2007-11-06 16:06:31.000000000 +0800
> +++ b/include/linux/bootmem.h	2007-11-06 15:50:36.000000000 +0800
> @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
>  				  unsigned long limit,
>  				  int strict_goal);
>  
> +extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
> +                unsigned long size,
> +                unsigned long align);
> +

Confusing. Your declaration here makes it look like a bootmem API
function but it is an arch-specific function that only exists for
x86-64. I know it gets overridden later when a weak symbol but the more
common approach is to have Kconfig define something like
ARCH_HIGHMEM_VMEMMAP and

#ifdef CONFIG_ARCH_HIGHMEM_VMEMMAP
extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
                unsigned long size,
                unsigned long align);
#else
static inline void *alloc_bootmem_high_node(pg_data_t *pgdat,
		unsigned long size,
		unsigned long align) {
	return NULL;
}
#endif /* CONFIG_ARCH_HIGHMEM_VMEMMAP

It's be similar if you have an arch-specific callback for vmalloc block
allocations instead of extending the bootmem API.

>
>  #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
>  extern void reserve_bootmem(unsigned long addr, unsigned long size);
>  #define alloc_bootmem(x) \
> diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
> --- a/mm/bootmem.c	2007-11-06 16:06:31.000000000 +0800
> +++ b/mm/bootmem.c	2007-11-06 15:49:20.000000000 +0800
> @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
>  	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
>  				    ARCH_LOW_ADDRESS_LIMIT, 0);
>  }
> +
> +__attribute__((weak)) __meminit

__meminit vs _init again

> +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
> +        unsigned long align)
> +{
> +        return NULL;
> +}
> +
> diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> --- a/mm/sparse-vmemmap.c	2007-11-06 15:16:12.000000000 +0800
> +++ b/mm/sparse-vmemmap.c	2007-11-06 16:08:52.000000000 +0800
> @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
>  		if (page)
>  			return page_address(page);
>  		return NULL;
> -	} else
> +	} else {
> +		void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
> +		if (p)
> +			return p;
>  		return __alloc_bootmem_node(NODE_DATA(node), size, size,
>  				__pa(MAX_DMA_ADDRESS));

If you had an arch-specific function, I guess this would turn into

	} else {
		return arch_vmemmap_alloc_block(...);
	}

> +	}
>  }
>  
>  void __meminit vmemmap_verify(pte_t *pte, int node,
> 
> 
> 
> 

-- 
-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [Patch] Allocate sparse vmemmap block above 4G
  2007-11-08 14:07 ` Mel Gorman
@ 2007-11-09  1:28   ` Zou, Nanhai
  2007-11-09  1:54     ` Christoph Lameter
  2007-11-09 14:42     ` Mel Gorman
  0 siblings, 2 replies; 7+ messages in thread
From: Zou, Nanhai @ 2007-11-09  1:28 UTC (permalink / raw)
  To: Mel Gorman
  Cc: LKML, Linus Torvalds, Greg KH, Dave Jones, Martin Ebourne,
	Siddha, Suresh B, Andi Kleen, Andrew Morton, Christoph Lameter,
	Andy Whitcroft


> -----Original Message-----
> From: Mel Gorman [mailto:mel@skynet.ie]
> Sent: 2007年11月8日 22:07
> To: Zou, Nanhai
> Cc: LKML; Linus Torvalds; Greg KH; Dave Jones; Martin Ebourne; Siddha, Suresh
> B; Andi Kleen; Andrew Morton; Christoph Lameter; Andy Whitcroft
> Subject: Re: [Patch] Allocate sparse vmemmap block above 4G
> 
> On (08/11/07 08:52), Zou Nan hai didst pronounce:
> > Resend the patch for more people to review
> >
> > On some single node x64 system with huge amount of physical memory e.g >
> > 64G. the memmap size maybe very big.
> >
> > If the memmap is allocated from low pages, it may occupies too much
> > memory below 4G.
> > then swiotlb could fail to reserve bounce buffer under 4G which will
> > lead to boot failure.
> >
> > This patch will first try to allocate memmap memory above 4G in sparse
> > vmemmap code.
> > If it failed, it will allocate memmap above MAX_DMA_ADDRESS.
> > This patch is against 2.6.24-rc1-git14
> >
> 
> You never state that you depend on the strict_goal patch either here or in
> a leader mail. The usual way of getting peoples attention is to have three
> mails. In your case it would be
> 
> [PATCH 0/2] Allocate vmemmap from highmem where possible
> [PATCH 1/2] Strictly place bootmem allocations when requested
> [PATCH 2/2] Allocate sparse vmemmap block above 4G on x86_64
> 
> Maybe you have gone through all this already and I'm coming so late that I
> missed it all. If that is the case, sorry for the noise.
Hi Mel,
 Thanks for reviewing.

> 
> > Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
> > Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> >
> > diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> > --- a/arch/x86/mm/init_64.c	2007-11-06 15:16:12.000000000 +0800
> > +++ b/arch/x86/mm/init_64.c	2007-11-06 15:55:50.000000000 +0800
> > @@ -448,6 +448,13 @@ void online_page(struct page *page)
> >  	num_physpages++;
> >  }
> >
> > +void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long
> size,
> > +        unsigned long align)
> > +{
> 
> Wrong markup there I believe. The __meminit markup is for functions
> that are needed at runtime when memory is hot-added or hot-removed.
> Bootmem functions do not qualify. __init is sufficient.
> 
__meminit here is to avoid section link warning here.
this function is called by vmemmap_alloc_block which is a __meminit function,
so if I mark it as __meminit, there were be warning about section mismatch, 
although this function will not be called during memory hotplug time.

> The name is confusing as well. I don't know what a high node is, but I
> think you mean alloc_bootmem_highmem  or alloc_bootmem_highmem_zone.
> That in *itself* is confusing on x86_64 because the memory above 4GiB is
> ZONE_NORMAL, not ZONE_HIGHMEM. Calling it alloc_bootmem_nondma() makes
> it worse.
> 
> I think you need to rework this to have an arch-specific function
> like arch_alloc_vmemmap_block() that by default allocates with
> ____alloc_bootmem_node() and otherwise uses an arch-specific function.
> In this case, it would know to call __alloc_bootmem_core() with the proper
> addressing limits.
> 
> > +        return __alloc_bootmem_core(pgdat->bdata, size,
> > +                        align, (4UL*1024*1024*1024), 0, 1);
> > +}
> 
> More magic values, both the 4GiB address here and the magic "1" at the
> end are problems.
> 
Yes, the 4UL*1024*1024*1024 could be a define here.

However I think define constant for a boolean parameter is overkilling. Look at kernel code, 
e.g.
the force parameter in get_user_pages
and 
the sync parameter in try_to_wake_up 
we don't do things like
#define TRY_TO_WAKEUP_SYNC 1
#define TRY_TO_WAKEUP_NOSYNC 0 

There are other examples in kernel code. I believe it is a common practice not to define constant for a Boolean parameter.
How do you think?

> > +
> >  #ifdef CONFIG_MEMORY_HOTPLUG
> >  /*
> >   * Memory is added always to NORMAL zone. This means you will never get
> > diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
> > --- a/include/linux/bootmem.h	2007-11-06 16:06:31.000000000 +0800
> > +++ b/include/linux/bootmem.h	2007-11-06 15:50:36.000000000 +0800
> > @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
> >  				  unsigned long limit,
> >  				  int strict_goal);
> >
> > +extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
> > +                unsigned long size,
> > +                unsigned long align);
> > +
> 
> Confusing. Your declaration here makes it look like a bootmem API
> function but it is an arch-specific function that only exists for
> x86-64. I know it gets overridden later when a weak symbol but the more
> common approach is to have Kconfig define something like
> ARCH_HIGHMEM_VMEMMAP and
> 
> #ifdef CONFIG_ARCH_HIGHMEM_VMEMMAP
> extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
>                 unsigned long size,
>                 unsigned long align);
> #else
> static inline void *alloc_bootmem_high_node(pg_data_t *pgdat,
> 		unsigned long size,
> 		unsigned long align) {
I was told by Andrew that 
> 	return NULL;
> }
> #endif /* CONFIG_ARCH_HIGHMEM_VMEMMAP
> 
> It's be similar if you have an arch-specific callback for vmalloc block
> allocations instead of extending the bootmem API.
> 
I was told by Andrew that ARCH_HAS_XXX is not preferred half a year before in
http://lkml.org/lkml/2007/5/17/287
So I reworked that patch to the preferred __attribute__ ((weak)))
.....


> >
> >  #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
> >  extern void reserve_bootmem(unsigned long addr, unsigned long size);
> >  #define alloc_bootmem(x) \
> > diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
> > --- a/mm/bootmem.c	2007-11-06 16:06:31.000000000 +0800
> > +++ b/mm/bootmem.c	2007-11-06 15:49:20.000000000 +0800
> > @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
> >  	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
> >  				    ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> > +
> > +__attribute__((weak)) __meminit
> 
> __meminit vs _init again
> 
> > +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
> > +        unsigned long align)
> > +{
> > +        return NULL;
> > +}
> > +
> > diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > --- a/mm/sparse-vmemmap.c	2007-11-06 15:16:12.000000000 +0800
> > +++ b/mm/sparse-vmemmap.c	2007-11-06 16:08:52.000000000 +0800
> > @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
> >  		if (page)
> >  			return page_address(page);
> >  		return NULL;
> > -	} else
> > +	} else {
> > +		void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
> > +		if (p)
> > +			return p;
> >  		return __alloc_bootmem_node(NODE_DATA(node), size, size,
> >  				__pa(MAX_DMA_ADDRESS));
> 
> If you had an arch-specific function, I guess this would turn into
> 
> 	} else {
> 		return arch_vmemmap_alloc_block(...);
> 	}
> 
> > +	}
> >  }
> >
> >  void __meminit vmemmap_verify(pte_t *pte, int node,
> >
> >
> >
> >
> 
> --
> --
> Mel Gorman
> Part-time Phd Student                          Linux Technology Center
> University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [Patch] Allocate sparse vmemmap block above 4G
  2007-11-09  1:28   ` Zou, Nanhai
@ 2007-11-09  1:54     ` Christoph Lameter
  2007-11-09 14:42     ` Mel Gorman
  1 sibling, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2007-11-09  1:54 UTC (permalink / raw)
  To: Zou, Nanhai
  Cc: Mel Gorman, LKML, Linus Torvalds, Greg KH, Dave Jones,
	Martin Ebourne, Siddha, Suresh B, Andi Kleen, Andrew Morton,
	Andy Whitcroft

On Fri, 9 Nov 2007, Zou, Nanhai wrote:

> > More magic values, both the 4GiB address here and the magic "1" at the
> > end are problems.
> > 
> Yes, the 4UL*1024*1024*1024 could be a define here.

The 4GB boundary here is MAX_DMA32_ADDRESS I guess? We are only having 
this problem because of the two DMA zones on x86_64. I thought Andi was 
getting rid of the first one at 16MB. If he would do so then ZONE_DMA 
could be used instead of DMA32 and everything will be fine.

For now you may want to put

#ifdef CONFIG_ZONE_DMA32

around this code since it depends on DMA32.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Patch] Allocate sparse vmemmap block above 4G
  2007-11-09  1:28   ` Zou, Nanhai
  2007-11-09  1:54     ` Christoph Lameter
@ 2007-11-09 14:42     ` Mel Gorman
  1 sibling, 0 replies; 7+ messages in thread
From: Mel Gorman @ 2007-11-09 14:42 UTC (permalink / raw)
  To: Zou, Nanhai
  Cc: LKML, Linus Torvalds, Greg KH, Dave Jones, Martin Ebourne,
	Siddha, Suresh B, Andi Kleen, Andrew Morton, Christoph Lameter,
	Andy Whitcroft

On (09/11/07 09:28), Zou, Nanhai didst pronounce:

> > Wrong markup there I believe. The __meminit markup is for functions
> > that are needed at runtime when memory is hot-added or hot-removed.
> > Bootmem functions do not qualify. __init is sufficient.
> > 
> __meminit here is to avoid section link warning here.
> this function is called by vmemmap_alloc_block which is a __meminit function,
> so if I mark it as __meminit, there were be warning about section mismatch, 
> although this function will not be called during memory hotplug time.
> 

Ok, that is fair enough and good enough reason to mark it __meminit.
It's a pity in one sense because a dead function remains in the loaded
kernel image but I can't think of a nice way around it right now.

> > The name is confusing as well. I don't know what a high node is, but I
> > think you mean alloc_bootmem_highmem  or alloc_bootmem_highmem_zone.
> > That in *itself* is confusing on x86_64 because the memory above 4GiB is
> > ZONE_NORMAL, not ZONE_HIGHMEM. Calling it alloc_bootmem_nondma() makes
> > it worse.
> > 
> > I think you need to rework this to have an arch-specific function
> > like arch_alloc_vmemmap_block() that by default allocates with
> > ____alloc_bootmem_node() and otherwise uses an arch-specific function.
> > In this case, it would know to call __alloc_bootmem_core() with the proper
> > addressing limits.
> > 
> > > +        return __alloc_bootmem_core(pgdat->bdata, size,
> > > +                        align, (4UL*1024*1024*1024), 0, 1);
> > > +}
> > 
> > More magic values, both the 4GiB address here and the magic "1" at the
> > end are problems.
> > 
> Yes, the 4UL*1024*1024*1024 could be a define here.
> 
> However I think define constant for a boolean parameter is overkilling. Look at kernel code, 
> e.g.
> the force parameter in get_user_pages
> and 
> the sync parameter in try_to_wake_up 
> we don't do things like
> #define TRY_TO_WAKEUP_SYNC 1
> #define TRY_TO_WAKEUP_NOSYNC 0 
> 
> There are other examples in kernel code. I believe it is a common practice not to define constant for a Boolean parameter.
> How do you think?
> 

I guess it's a question of personal taste. I am not a fan of magic numbers
because it's not clear from the call-site what "1" means. It could be a
value of 1 or a boolean true requiring that I double check the function
being called. I would have used either a #define or defined

__something() - internal to a C file
something()   - non-strict version
something_strict() - the strict version

so it's obvious from the call-site what the intention is. However, I
imagine there are people puking at that notion too.

Fix up the 4GB magic number at least so it reflects that it is DMA32 you
are talking about.

> > > +
> > >  #ifdef CONFIG_MEMORY_HOTPLUG
> > >  /*
> > >   * Memory is added always to NORMAL zone. This means you will never get
> > > diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
> > > --- a/include/linux/bootmem.h	2007-11-06 16:06:31.000000000 +0800
> > > +++ b/include/linux/bootmem.h	2007-11-06 15:50:36.000000000 +0800
> > > @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
> > >  				  unsigned long limit,
> > >  				  int strict_goal);
> > >
> > > +extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
> > > +                unsigned long size,
> > > +                unsigned long align);
> > > +
> > 
> > Confusing. Your declaration here makes it look like a bootmem API
> > function but it is an arch-specific function that only exists for
> > x86-64. I know it gets overridden later when a weak symbol but the more
> > common approach is to have Kconfig define something like
> > ARCH_HIGHMEM_VMEMMAP and
> > 
> > #ifdef CONFIG_ARCH_HIGHMEM_VMEMMAP
> > extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
> >                 unsigned long size,
> >                 unsigned long align);
> > #else
> > static inline void *alloc_bootmem_high_node(pg_data_t *pgdat,
> > 		unsigned long size,
> > 		unsigned long align) {
> I was told by Andrew that 
> > 	return NULL;
> > }
> > #endif /* CONFIG_ARCH_HIGHMEM_VMEMMAP
> > 
> > It's be similar if you have an arch-specific callback for vmalloc block
> > allocations instead of extending the bootmem API.
> > 
> I was told by Andrew that ARCH_HAS_XXX is not preferred half a year before in
> http://lkml.org/lkml/2007/5/17/287
> So I reworked that patch to the preferred __attribute__ ((weak)))
> .....
> 

Ok, my bad. I wasn't aware of that.

> 
> > >
> > >  #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
> > >  extern void reserve_bootmem(unsigned long addr, unsigned long size);
> > >  #define alloc_bootmem(x) \
> > > diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
> > > --- a/mm/bootmem.c	2007-11-06 16:06:31.000000000 +0800
> > > +++ b/mm/bootmem.c	2007-11-06 15:49:20.000000000 +0800
> > > @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
> > >  	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
> > >  				    ARCH_LOW_ADDRESS_LIMIT, 0);
> > >  }
> > > +
> > > +__attribute__((weak)) __meminit
> > 
> > __meminit vs _init again
> > 
> > > +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
> > > +        unsigned long align)
> > > +{
> > > +        return NULL;
> > > +}
> > > +
> > > diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > > --- a/mm/sparse-vmemmap.c	2007-11-06 15:16:12.000000000 +0800
> > > +++ b/mm/sparse-vmemmap.c	2007-11-06 16:08:52.000000000 +0800
> > > @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
> > >  		if (page)
> > >  			return page_address(page);
> > >  		return NULL;
> > > -	} else
> > > +	} else {
> > > +		void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
> > > +		if (p)
> > > +			return p;
> > >  		return __alloc_bootmem_node(NODE_DATA(node), size, size,
> > >  				__pa(MAX_DMA_ADDRESS));
> > 
> > If you had an arch-specific function, I guess this would turn into
> > 
> > 	} else {
> > 		return arch_vmemmap_alloc_block(...);
> > 	}
> > 
> > > +	}
> > >  }
> > >
> > >  void __meminit vmemmap_verify(pte_t *pte, int node,
> > >
> > >
> > >
> > >
> > 
> > --
> > --
> > Mel Gorman
> > Part-time Phd Student                          Linux Technology Center
> > University of Limerick                         IBM Dublin Software Lab
> 

-- 
-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-11-09 14:43 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-07  7:07 [Patch] Allocate sparse vmemmap block above 4G Zou Nan hai
2007-11-07 17:12 ` Dave Hansen
  -- strict thread matches above, loose matches on Subject: below --
2007-11-08  0:52 Zou Nan hai
2007-11-08 14:07 ` Mel Gorman
2007-11-09  1:28   ` Zou, Nanhai
2007-11-09  1:54     ` Christoph Lameter
2007-11-09 14:42     ` Mel Gorman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox