Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Andrew Hastings <abh@cray.com>
To: Andi Kleen <andi@firstfloor.org>
Cc: linux-kernel@vger.kernel.org, pj@sgi.com, linux-mm@kvack.org,
	nickpiggin@yahoo.com.au
Subject: Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER
Date: Wed, 09 Apr 2008 11:05:17 -0500	[thread overview]
Message-ID: <47FCE93D.4090509@cray.com> (raw)
In-Reply-To: <20080317015826.110AA1B41E0@basil.firstfloor.org>

Andi Kleen wrote:
> This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
> not practical to enlarge MAX_ORDER to 1GB. 
> 
> Instead the 1GB pages are only allocated at boot using the bootmem
> allocator using the hugepages=... option.
> 
> These 1G bootmem pages are never freed. In theory it would be possible
> to implement that with some complications, but since it would be a one-way
> street (> MAX_ORDER pages cannot be allocated later) I decided not to currently.
> 
> The > MAX_ORDER code is not ifdef'ed per architecture. It is not very big
> and the ifdef ugliness seemed not to be worth it.

This looks like an off-by-one error here and in the code below -- it 
should be ">= MAX_ORDER" not "> MAX_ORDER".  Cf alloc_pages() in gfp.h:

         if (unlikely(order >= MAX_ORDER))
                 return NULL;

> Known problems: /proc/meminfo and "free" do not display the memory 
> allocated for gb pages in "Total". This is a little confusing for the
> user.
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  mm/hugetlb.c |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 62 insertions(+), 2 deletions(-)
> 
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -14,6 +14,7 @@
>  #include <linux/mempolicy.h>
>  #include <linux/cpuset.h>
>  #include <linux/mutex.h>
> +#include <linux/bootmem.h>
>  
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
> @@ -153,7 +154,7 @@ static void free_huge_page(struct page *
>  	INIT_LIST_HEAD(&page->lru);
>  
>  	spin_lock(&hugetlb_lock);
> -	if (h->surplus_huge_pages_node[nid]) {
> +	if (h->surplus_huge_pages_node[nid] && h->order <= MAX_ORDER) {
>  		update_and_free_page(h, page);
>  		h->surplus_huge_pages--;
>  		h->surplus_huge_pages_node[nid]--;
> @@ -215,6 +216,9 @@ static struct page *alloc_fresh_huge_pag
>  {
>  	struct page *page;
>  
> +	if (h->order > MAX_ORDER)
> +		return NULL;
> +
>  	page = alloc_pages_node(nid,
>  		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
>  			huge_page_order(h));
> @@ -271,6 +275,9 @@ static struct page *alloc_buddy_huge_pag
>  	struct page *page;
>  	unsigned int nid;
>  
> +	if (h->order > MAX_ORDER)
> +		return NULL;
> +
>  	/*
>  	 * Assume we will successfully allocate the surplus page to
>  	 * prevent racing processes from causing the surplus to exceed
> @@ -422,6 +429,10 @@ return_unused_surplus_pages(struct hstat
>  	/* Uncommit the reservation */
>  	h->resv_huge_pages -= unused_resv_pages;
>  
> +	/* Cannot return gigantic pages currently */
> +	if (h->order > MAX_ORDER)
> +		return;
> +
>  	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
>  
>  	while (nr_pages) {
> @@ -499,6 +510,44 @@ static struct page *alloc_huge_page(stru
>  	return page;
>  }
>  
> +static __initdata LIST_HEAD(huge_boot_pages);
> +
> +struct huge_bm_page {
> +	struct list_head list;
> +	struct hstate *hstate;
> +};
> +
> +static int __init alloc_bm_huge_page(struct hstate *h)
> +{
> +	struct huge_bm_page *m;
> +	m = __alloc_bootmem_node_nopanic(NODE_DATA(h->hugetlb_next_nid),
> +					huge_page_size(h), huge_page_size(h),
> +					0);
> +	if (!m)
> +		return 0;
> +	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
> +	/* Put them into a private list first because mem_map is not up yet */
> +	list_add(&m->list, &huge_boot_pages);
> +	m->hstate = h;
> +	huge_next_node(h);
> +	return 1;
> +}
> +
> +/* Put bootmem huge pages into the standard lists after mem_map is up */
> +static int __init huge_init_bm(void)
> +{
> +	struct huge_bm_page *m;
> +	list_for_each_entry (m, &huge_boot_pages, list) {
> +		struct page *page = virt_to_page(m);
> +		struct hstate *h = m->hstate;
> +		__ClearPageReserved(page);
> +		prep_compound_page(page, h->order);
> +		huge_new_page(h, page);
> +	}
> +	return 0;
> +}
> +__initcall(huge_init_bm);
> +
>  static int __init hugetlb_init_hstate(struct hstate *h)
>  {
>  	unsigned long i;
> @@ -509,7 +558,10 @@ static int __init hugetlb_init_hstate(st
>  	h->hugetlb_next_nid = first_node(node_online_map);
>  
>  	for (i = 0; i < max_huge_pages[h - hstates]; ++i) {
> -		if (!alloc_fresh_huge_page(h))
> +		if (h->order > MAX_ORDER) {
> +			if (!alloc_bm_huge_page(h))
> +				break;
> +		} else if (!alloc_fresh_huge_page(h))
>  			break;
>  	}
>  	max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i;
> @@ -581,6 +633,9 @@ static void do_try_to_free_low(struct hs
>  {
>  	int i;
>  
> +	if (h->order > MAX_ORDER)
> +		return;
> +
>  	for (i = 0; i < MAX_NUMNODES; ++i) {
>  		struct page *page, *next;
>  		struct list_head *freel = &h->hugepage_freelists[i];
> @@ -618,6 +673,11 @@ set_max_huge_pages(struct hstate *h, uns
>  
>  	*err = 0;
>  
> +	if (h->order > MAX_ORDER) {
> +		*err = -EINVAL;
> +		return max_huge_pages[h - hstates];
> +	}
> +
>  	/*
>  	 * Increase the pool size
>  	 * First take pages out of surplus state.  Then make up the

-Andrew Hastings
  Cray Inc.

WARNING: multiple messages have this Message-ID (diff)

From: Andrew Hastings <abh@cray.com>
To: Andi Kleen <andi@firstfloor.org>
Cc: linux-kernel@vger.kernel.org, pj@sgi.com, linux-mm@kvack.org,
	nickpiggin@yahoo.com.au
Subject: Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER
Date: Wed, 09 Apr 2008 11:05:17 -0500	[thread overview]
Message-ID: <47FCE93D.4090509@cray.com> (raw)
In-Reply-To: <20080317015826.110AA1B41E0@basil.firstfloor.org>

Andi Kleen wrote:
> This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
> not practical to enlarge MAX_ORDER to 1GB. 
> 
> Instead the 1GB pages are only allocated at boot using the bootmem
> allocator using the hugepages=... option.
> 
> These 1G bootmem pages are never freed. In theory it would be possible
> to implement that with some complications, but since it would be a one-way
> street (> MAX_ORDER pages cannot be allocated later) I decided not to currently.
> 
> The > MAX_ORDER code is not ifdef'ed per architecture. It is not very big
> and the ifdef ugliness seemed not to be worth it.

This looks like an off-by-one error here and in the code below -- it 
should be ">= MAX_ORDER" not "> MAX_ORDER".  Cf alloc_pages() in gfp.h:

         if (unlikely(order >= MAX_ORDER))
                 return NULL;

> Known problems: /proc/meminfo and "free" do not display the memory 
> allocated for gb pages in "Total". This is a little confusing for the
> user.
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  mm/hugetlb.c |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 62 insertions(+), 2 deletions(-)
> 
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -14,6 +14,7 @@
>  #include <linux/mempolicy.h>
>  #include <linux/cpuset.h>
>  #include <linux/mutex.h>
> +#include <linux/bootmem.h>
>  
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
> @@ -153,7 +154,7 @@ static void free_huge_page(struct page *
>  	INIT_LIST_HEAD(&page->lru);
>  
>  	spin_lock(&hugetlb_lock);
> -	if (h->surplus_huge_pages_node[nid]) {
> +	if (h->surplus_huge_pages_node[nid] && h->order <= MAX_ORDER) {
>  		update_and_free_page(h, page);
>  		h->surplus_huge_pages--;
>  		h->surplus_huge_pages_node[nid]--;
> @@ -215,6 +216,9 @@ static struct page *alloc_fresh_huge_pag
>  {
>  	struct page *page;
>  
> +	if (h->order > MAX_ORDER)
> +		return NULL;
> +
>  	page = alloc_pages_node(nid,
>  		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
>  			huge_page_order(h));
> @@ -271,6 +275,9 @@ static struct page *alloc_buddy_huge_pag
>  	struct page *page;
>  	unsigned int nid;
>  
> +	if (h->order > MAX_ORDER)
> +		return NULL;
> +
>  	/*
>  	 * Assume we will successfully allocate the surplus page to
>  	 * prevent racing processes from causing the surplus to exceed
> @@ -422,6 +429,10 @@ return_unused_surplus_pages(struct hstat
>  	/* Uncommit the reservation */
>  	h->resv_huge_pages -= unused_resv_pages;
>  
> +	/* Cannot return gigantic pages currently */
> +	if (h->order > MAX_ORDER)
> +		return;
> +
>  	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
>  
>  	while (nr_pages) {
> @@ -499,6 +510,44 @@ static struct page *alloc_huge_page(stru
>  	return page;
>  }
>  
> +static __initdata LIST_HEAD(huge_boot_pages);
> +
> +struct huge_bm_page {
> +	struct list_head list;
> +	struct hstate *hstate;
> +};
> +
> +static int __init alloc_bm_huge_page(struct hstate *h)
> +{
> +	struct huge_bm_page *m;
> +	m = __alloc_bootmem_node_nopanic(NODE_DATA(h->hugetlb_next_nid),
> +					huge_page_size(h), huge_page_size(h),
> +					0);
> +	if (!m)
> +		return 0;
> +	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
> +	/* Put them into a private list first because mem_map is not up yet */
> +	list_add(&m->list, &huge_boot_pages);
> +	m->hstate = h;
> +	huge_next_node(h);
> +	return 1;
> +}
> +
> +/* Put bootmem huge pages into the standard lists after mem_map is up */
> +static int __init huge_init_bm(void)
> +{
> +	struct huge_bm_page *m;
> +	list_for_each_entry (m, &huge_boot_pages, list) {
> +		struct page *page = virt_to_page(m);
> +		struct hstate *h = m->hstate;
> +		__ClearPageReserved(page);
> +		prep_compound_page(page, h->order);
> +		huge_new_page(h, page);
> +	}
> +	return 0;
> +}
> +__initcall(huge_init_bm);
> +
>  static int __init hugetlb_init_hstate(struct hstate *h)
>  {
>  	unsigned long i;
> @@ -509,7 +558,10 @@ static int __init hugetlb_init_hstate(st
>  	h->hugetlb_next_nid = first_node(node_online_map);
>  
>  	for (i = 0; i < max_huge_pages[h - hstates]; ++i) {
> -		if (!alloc_fresh_huge_page(h))
> +		if (h->order > MAX_ORDER) {
> +			if (!alloc_bm_huge_page(h))
> +				break;
> +		} else if (!alloc_fresh_huge_page(h))
>  			break;
>  	}
>  	max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i;
> @@ -581,6 +633,9 @@ static void do_try_to_free_low(struct hs
>  {
>  	int i;
>  
> +	if (h->order > MAX_ORDER)
> +		return;
> +
>  	for (i = 0; i < MAX_NUMNODES; ++i) {
>  		struct page *page, *next;
>  		struct list_head *freel = &h->hugepage_freelists[i];
> @@ -618,6 +673,11 @@ set_max_huge_pages(struct hstate *h, uns
>  
>  	*err = 0;
>  
> +	if (h->order > MAX_ORDER) {
> +		*err = -EINVAL;
> +		return max_huge_pages[h - hstates];
> +	}
> +
>  	/*
>  	 * Increase the pool size
>  	 * First take pages out of surplus state.  Then make up the

-Andrew Hastings
  Cray Inc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2008-04-09 19:02 UTC|newest]

Thread overview: 150+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-03-17  1:58 [PATCH] [0/18] GB pages hugetlb support Andi Kleen
2008-03-17  1:58 ` Andi Kleen
2008-03-17  1:58 ` [PATCH] [1/18] Convert hugetlb.c over to pass global state around in a structure Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17 20:15   ` Adam Litke
2008-03-17 20:15     ` Adam Litke
2008-03-18 12:05   ` Mel Gorman
2008-03-18 12:05     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [2/18] Add basic support for more than one hstate in hugetlbfs Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17 20:22   ` Adam Litke
2008-03-17 20:22     ` Adam Litke
2008-03-17 20:44     ` Andi Kleen
2008-03-17 20:44       ` Andi Kleen
2008-03-18 12:23   ` Mel Gorman
2008-03-18 12:23     ` Mel Gorman
2008-03-23 10:38   ` KOSAKI Motohiro
2008-03-23 10:38     ` KOSAKI Motohiro
2008-03-23 11:28     ` Andi Kleen
2008-03-23 11:28       ` Andi Kleen
2008-03-23 11:30       ` KOSAKI Motohiro
2008-03-23 11:30         ` KOSAKI Motohiro
2008-03-17  1:58 ` [PATCH] [3/18] Convert /proc output code over to report multiple hstates Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 12:28   ` Mel Gorman
2008-03-18 12:28     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [4/18] Add basic support for more than one hstate in hugetlbfs Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  8:09   ` Paul Jackson
2008-03-17  8:09     ` Paul Jackson
2008-03-17  8:15     ` Andi Kleen
2008-03-17  8:15       ` Andi Kleen
2008-03-17 20:28   ` Adam Litke
2008-03-17 20:28     ` Adam Litke
2008-03-18 14:11   ` Mel Gorman
2008-03-18 14:11     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [5/18] Expand the hugetlbfs sysctls to handle arrays for all hstates Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 14:34   ` Mel Gorman
2008-03-18 14:34     ` Mel Gorman
2008-03-18 16:49     ` Andi Kleen
2008-03-18 16:49       ` Andi Kleen
2008-03-18 17:01       ` Mel Gorman
2008-03-18 17:01         ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [6/18] Add support to have individual hstates for each hugetlbfs mount Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 14:10   ` Adam Litke
2008-03-18 14:10     ` Adam Litke
2008-03-18 15:02   ` Mel Gorman
2008-03-18 15:02     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [7/18] Abstract out the NUMA node round robin code into a separate function Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 15:42   ` Mel Gorman
2008-03-18 15:42     ` Mel Gorman
2008-03-18 15:47     ` Andi Kleen
2008-03-18 15:47       ` Andi Kleen
2008-03-18 16:04       ` Mel Gorman
2008-03-18 16:04         ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [8/18] Add a __alloc_bootmem_node_nopanic Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 15:54   ` Mel Gorman
2008-03-18 15:54     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [9/18] Export prep_compound_page to the hugetlb allocator Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  1:58 ` [PATCH] [10/18] Factor out new huge page preparation code into separate function Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17 20:31   ` Adam Litke
2008-03-17 20:31     ` Adam Litke
2008-03-18 16:02   ` Mel Gorman
2008-03-18 16:02     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [11/18] Fix alignment bug in bootmem allocator Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  2:19   ` Yinghai Lu
2008-03-17  2:19     ` Yinghai Lu
2008-03-17  7:02     ` Andi Kleen
2008-03-17  7:02       ` Andi Kleen
2008-03-17  7:17       ` Yinghai Lu
2008-03-17  7:17         ` Yinghai Lu
2008-03-17  7:31         ` Yinghai Lu
2008-03-17  7:31           ` Yinghai Lu
2008-03-17  7:41           ` Andi Kleen
2008-03-17  7:41             ` Andi Kleen
2008-03-17  7:53             ` Yinghai Lu
2008-03-17  7:53               ` Yinghai Lu
2008-03-17  8:10               ` Yinghai Lu
2008-03-17  8:17                 ` Andi Kleen
2008-03-17  8:17                   ` Andi Kleen
2008-03-17  8:56               ` Andi Kleen
2008-03-17  8:56                 ` Andi Kleen
2008-03-17 18:52                 ` Yinghai Lu
2008-03-17 18:52                   ` Yinghai Lu
2008-03-17 21:27                   ` Yinghai Lu
2008-03-18  2:06                     ` Yinghai Lu
2008-03-18  2:06                       ` Yinghai Lu
2008-03-18 16:18   ` Mel Gorman
2008-03-18 16:18     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 16:27   ` Mel Gorman
2008-03-18 16:27     ` Mel Gorman
2008-04-09 16:05   ` Andrew Hastings [this message]
2008-04-09 16:05     ` Andrew Hastings
2008-04-09 17:56     ` Andi Kleen
2008-04-09 17:56       ` Andi Kleen
2008-03-17  1:58 ` [PATCH] [13/18] Add support to allocate hugepages of different size with hugepages= Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 16:32   ` Mel Gorman
2008-03-18 16:32     ` Mel Gorman
2008-03-18 16:45     ` Andi Kleen
2008-03-18 16:45       ` Andi Kleen
2008-03-18 16:46       ` Mel Gorman
2008-03-18 16:46         ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [14/18] Clean up hugetlb boot time printk Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-18 16:37   ` Mel Gorman
2008-03-18 16:37     ` Mel Gorman
2008-03-17  1:58 ` [PATCH] [15/18] Add support to x86-64 to allocate and lookup GB pages in hugetlb Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  1:58 ` [PATCH] [16/18] Add huge pud support to hugetlbfs Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  1:58 ` [PATCH] [17/18] Add huge pud support to mm/memory.c Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  1:58 ` [PATCH] [18/18] Implement hugepagesz= option for x86-64 Andi Kleen
2008-03-17  1:58   ` Andi Kleen
2008-03-17  9:29   ` Paul Jackson
2008-03-17  9:29     ` Paul Jackson
2008-03-17  9:59     ` Andi Kleen
2008-03-17  9:59       ` Andi Kleen
2008-03-17 10:02       ` Paul Jackson
2008-03-17 10:02         ` Paul Jackson
2008-03-17  3:11 ` [PATCH] [0/18] GB pages hugetlb support Paul Jackson
2008-03-17  3:11   ` Paul Jackson
2008-03-17  7:00   ` Andi Kleen
2008-03-17  7:00     ` Andi Kleen
2008-03-17  7:00     ` Paul Jackson
2008-03-17  7:00       ` Paul Jackson
2008-03-17  7:29       ` Andi Kleen
2008-03-17  7:29         ` Andi Kleen
2008-03-17  5:35 ` Paul Jackson
2008-03-17  5:35   ` Paul Jackson
2008-03-17  6:58   ` Andi Kleen
2008-03-17  6:58     ` Andi Kleen
2008-03-17  9:26 ` Paul Jackson
2008-03-17  9:26   ` Paul Jackson
2008-03-17 15:05 ` Adam Litke
2008-03-17 15:05   ` Adam Litke
2008-03-17 15:33   ` Andi Kleen
2008-03-17 15:33     ` Andi Kleen
2008-03-17 15:59     ` Adam Litke
2008-03-17 15:59       ` Adam Litke

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=47FCE93D.4090509@cray.com \
    --to=abh@cray.com \
    --cc=andi@firstfloor.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=nickpiggin@yahoo.com.au \
    --cc=pj@sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.