From: Andrew Hastings <abh@cray.com>
To: Andi Kleen <andi@firstfloor.org>
Cc: linux-kernel@vger.kernel.org, pj@sgi.com, linux-mm@kvack.org,
nickpiggin@yahoo.com.au
Subject: Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER
Date: Wed, 09 Apr 2008 11:05:17 -0500 [thread overview]
Message-ID: <47FCE93D.4090509@cray.com> (raw)
In-Reply-To: <20080317015826.110AA1B41E0@basil.firstfloor.org>
Andi Kleen wrote:
> This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
> not practical to enlarge MAX_ORDER to 1GB.
>
> Instead the 1GB pages are only allocated at boot using the bootmem
> allocator using the hugepages=... option.
>
> These 1G bootmem pages are never freed. In theory it would be possible
> to implement that with some complications, but since it would be a one-way
> street (> MAX_ORDER pages cannot be allocated later) I decided not to currently.
>
> The > MAX_ORDER code is not ifdef'ed per architecture. It is not very big
> and the ifdef ugliness seemed not to be worth it.
This looks like an off-by-one error here and in the code below -- it
should be ">= MAX_ORDER" not "> MAX_ORDER". Cf alloc_pages() in gfp.h:
if (unlikely(order >= MAX_ORDER))
return NULL;
> Known problems: /proc/meminfo and "free" do not display the memory
> allocated for gb pages in "Total". This is a little confusing for the
> user.
>
> Signed-off-by: Andi Kleen <ak@suse.de>
>
> ---
> mm/hugetlb.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 62 insertions(+), 2 deletions(-)
>
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -14,6 +14,7 @@
> #include <linux/mempolicy.h>
> #include <linux/cpuset.h>
> #include <linux/mutex.h>
> +#include <linux/bootmem.h>
>
> #include <asm/page.h>
> #include <asm/pgtable.h>
> @@ -153,7 +154,7 @@ static void free_huge_page(struct page *
> INIT_LIST_HEAD(&page->lru);
>
> spin_lock(&hugetlb_lock);
> - if (h->surplus_huge_pages_node[nid]) {
> + if (h->surplus_huge_pages_node[nid] && h->order <= MAX_ORDER) {
> update_and_free_page(h, page);
> h->surplus_huge_pages--;
> h->surplus_huge_pages_node[nid]--;
> @@ -215,6 +216,9 @@ static struct page *alloc_fresh_huge_pag
> {
> struct page *page;
>
> + if (h->order > MAX_ORDER)
> + return NULL;
> +
> page = alloc_pages_node(nid,
> htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
> huge_page_order(h));
> @@ -271,6 +275,9 @@ static struct page *alloc_buddy_huge_pag
> struct page *page;
> unsigned int nid;
>
> + if (h->order > MAX_ORDER)
> + return NULL;
> +
> /*
> * Assume we will successfully allocate the surplus page to
> * prevent racing processes from causing the surplus to exceed
> @@ -422,6 +429,10 @@ return_unused_surplus_pages(struct hstat
> /* Uncommit the reservation */
> h->resv_huge_pages -= unused_resv_pages;
>
> + /* Cannot return gigantic pages currently */
> + if (h->order > MAX_ORDER)
> + return;
> +
> nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
>
> while (nr_pages) {
> @@ -499,6 +510,44 @@ static struct page *alloc_huge_page(stru
> return page;
> }
>
> +static __initdata LIST_HEAD(huge_boot_pages);
> +
> +struct huge_bm_page {
> + struct list_head list;
> + struct hstate *hstate;
> +};
> +
> +static int __init alloc_bm_huge_page(struct hstate *h)
> +{
> + struct huge_bm_page *m;
> + m = __alloc_bootmem_node_nopanic(NODE_DATA(h->hugetlb_next_nid),
> + huge_page_size(h), huge_page_size(h),
> + 0);
> + if (!m)
> + return 0;
> + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
> + /* Put them into a private list first because mem_map is not up yet */
> + list_add(&m->list, &huge_boot_pages);
> + m->hstate = h;
> + huge_next_node(h);
> + return 1;
> +}
> +
> +/* Put bootmem huge pages into the standard lists after mem_map is up */
> +static int __init huge_init_bm(void)
> +{
> + struct huge_bm_page *m;
> + list_for_each_entry (m, &huge_boot_pages, list) {
> + struct page *page = virt_to_page(m);
> + struct hstate *h = m->hstate;
> + __ClearPageReserved(page);
> + prep_compound_page(page, h->order);
> + huge_new_page(h, page);
> + }
> + return 0;
> +}
> +__initcall(huge_init_bm);
> +
> static int __init hugetlb_init_hstate(struct hstate *h)
> {
> unsigned long i;
> @@ -509,7 +558,10 @@ static int __init hugetlb_init_hstate(st
> h->hugetlb_next_nid = first_node(node_online_map);
>
> for (i = 0; i < max_huge_pages[h - hstates]; ++i) {
> - if (!alloc_fresh_huge_page(h))
> + if (h->order > MAX_ORDER) {
> + if (!alloc_bm_huge_page(h))
> + break;
> + } else if (!alloc_fresh_huge_page(h))
> break;
> }
> max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i;
> @@ -581,6 +633,9 @@ static void do_try_to_free_low(struct hs
> {
> int i;
>
> + if (h->order > MAX_ORDER)
> + return;
> +
> for (i = 0; i < MAX_NUMNODES; ++i) {
> struct page *page, *next;
> struct list_head *freel = &h->hugepage_freelists[i];
> @@ -618,6 +673,11 @@ set_max_huge_pages(struct hstate *h, uns
>
> *err = 0;
>
> + if (h->order > MAX_ORDER) {
> + *err = -EINVAL;
> + return max_huge_pages[h - hstates];
> + }
> +
> /*
> * Increase the pool size
> * First take pages out of surplus state. Then make up the
-Andrew Hastings
Cray Inc.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2008-04-09 16:05 UTC|newest]
Thread overview: 76+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-03-17 1:58 [PATCH] [0/18] GB pages hugetlb support Andi Kleen
2008-03-17 1:58 ` [PATCH] [1/18] Convert hugeltlb.c over to pass global state around in a structure Andi Kleen
2008-03-17 20:15 ` Adam Litke
2008-03-18 12:05 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [2/18] Add basic support for more than one hstate in hugetlbfs Andi Kleen
2008-03-17 20:22 ` Adam Litke
2008-03-17 20:44 ` Andi Kleen
2008-03-18 12:23 ` Mel Gorman
2008-03-23 10:38 ` KOSAKI Motohiro
2008-03-23 11:28 ` Andi Kleen
2008-03-23 11:30 ` KOSAKI Motohiro
2008-03-17 1:58 ` [PATCH] [3/18] Convert /proc output code over to report multiple hstates Andi Kleen
2008-03-18 12:28 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [4/18] Add basic support for more than one hstate in hugetlbfs Andi Kleen
2008-03-17 8:09 ` Paul Jackson
2008-03-17 8:15 ` Andi Kleen
2008-03-17 20:28 ` Adam Litke
2008-03-18 14:11 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [5/18] Expand the hugetlbfs sysctls to handle arrays for all hstates Andi Kleen
2008-03-18 14:34 ` Mel Gorman
2008-03-18 16:49 ` Andi Kleen
2008-03-18 17:01 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [6/18] Add support to have individual hstates for each hugetlbfs mount Andi Kleen
2008-03-18 14:10 ` Adam Litke
2008-03-18 15:02 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [7/18] Abstract out the NUMA node round robin code into a separate function Andi Kleen
2008-03-18 15:42 ` Mel Gorman
2008-03-18 15:47 ` Andi Kleen
2008-03-18 16:04 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [8/18] Add a __alloc_bootmem_node_nopanic Andi Kleen
2008-03-18 15:54 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [9/18] Export prep_compound_page to the hugetlb allocator Andi Kleen
2008-03-17 1:58 ` [PATCH] [10/18] Factor out new huge page preparation code into separate function Andi Kleen
2008-03-17 20:31 ` Adam Litke
2008-03-18 16:02 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [11/18] Fix alignment bug in bootmem allocator Andi Kleen
2008-03-17 2:19 ` Yinghai Lu
2008-03-17 7:02 ` Andi Kleen
2008-03-17 7:17 ` Yinghai Lu
2008-03-17 7:31 ` Yinghai Lu
2008-03-17 7:41 ` Andi Kleen
2008-03-17 7:53 ` Yinghai Lu
2008-03-17 8:10 ` Yinghai Lu
2008-03-17 8:17 ` Andi Kleen
2008-03-17 8:56 ` Andi Kleen
2008-03-17 18:52 ` Yinghai Lu
2008-03-17 21:27 ` Yinghai Lu
2008-03-18 2:06 ` Yinghai Lu
2008-03-18 16:18 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER Andi Kleen
2008-03-18 16:27 ` Mel Gorman
2008-04-09 16:05 ` Andrew Hastings [this message]
2008-04-09 17:56 ` Andi Kleen
2008-03-17 1:58 ` [PATCH] [13/18] Add support to allocate hugepages of different size with hugepages= Andi Kleen
2008-03-18 16:32 ` Mel Gorman
2008-03-18 16:45 ` Andi Kleen
2008-03-18 16:46 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [14/18] Clean up hugetlb boot time printk Andi Kleen
2008-03-18 16:37 ` Mel Gorman
2008-03-17 1:58 ` [PATCH] [15/18] Add support to x86-64 to allocate and lookup GB pages in hugetlb Andi Kleen
2008-03-17 1:58 ` [PATCH] [16/18] Add huge pud support to hugetlbfs Andi Kleen
2008-03-17 1:58 ` [PATCH] [17/18] Add huge pud support to mm/memory.c Andi Kleen
2008-03-17 1:58 ` [PATCH] [18/18] Implement hugepagesz= option for x86-64 Andi Kleen
2008-03-17 9:29 ` Paul Jackson
2008-03-17 9:59 ` Andi Kleen
2008-03-17 10:02 ` Paul Jackson
2008-03-17 3:11 ` [PATCH] [0/18] GB pages hugetlb support Paul Jackson
2008-03-17 7:00 ` Andi Kleen
2008-03-17 7:00 ` Paul Jackson
2008-03-17 7:29 ` Andi Kleen
2008-03-17 5:35 ` Paul Jackson
2008-03-17 6:58 ` Andi Kleen
2008-03-17 9:26 ` Paul Jackson
2008-03-17 15:05 ` Adam Litke
2008-03-17 15:33 ` Andi Kleen
2008-03-17 15:59 ` Adam Litke
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=47FCE93D.4090509@cray.com \
--to=abh@cray.com \
--cc=andi@firstfloor.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=nickpiggin@yahoo.com.au \
--cc=pj@sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).