From: Robin Holt <holt@sgi.com>
To: Michal Hocko <mhocko@suse.cz>, Cliff Wickman <cpw@sgi.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org,
wli@holomorphy.com
Subject: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
Date: Tue, 2 Apr 2013 21:43:44 -0500 [thread overview]
Message-ID: <20130403024344.GA4384@sgi.com> (raw)
In-Reply-To: <20130314085138.GA11636@dhcp22.suse.cz>
Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.
Large system sites would at times like to allocate a very large amount
of memory as 1GB pages. They would put this on the kernel boot line:
default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
start_kernel
kernel_init
do_pre_smp_initcalls
hugetlb_init
hugetlb_init_hstates
hugetlb_hstate_alloc_pages
Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems). This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems. A benchmark attempting to reserve a TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocation. 32TB would take 8 hours.
Signed-off-by: Robin Holt <holt@sgi.com>
To: Cliff Wickman <cpw@sgi.com>
To: Michal Hocko <mhocko@suse.cz>
Cc: lkml <linux-kernel@vger.kernel.org>
Cc: Linux mm <linux-mm@kvack.org>
Cc: x86 Maintainers <x86@kernel.org>
---
Changes since -v1
- Reworked to remove the special NO_ZERO flag and push that down further
in the call chain.
Note: I compiled this only with a .config which specified
CONFIG_NO_BOOTMEM (x86_64). I have not tried a config which uses a
bootmem allocator.
include/linux/bootmem.h | 8 +++++++-
mm/bootmem.c | 21 +++++++++++++++++----
mm/hugetlb.c | 2 +-
mm/nobootmem.c | 37 +++++++++++++++++++++++++++----------
mm/sparse.c | 2 +-
5 files changed, 53 insertions(+), 17 deletions(-)
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+ pg_data_t *pgdat,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal);
void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int zeroed);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..b2e4027 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long goal, unsigned long limit,
+ int zeroed)
{
void *ptr;
if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
+ if (zeroed)
+ return kzalloc(size, GFP_NOWAIT);
+ else
+ return kmalloc(size, GFP_NOWAIT);
again:
/* do not panic in alloc_bootmem_bdata() */
@@ -733,13 +737,22 @@ again:
return NULL;
}
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
}
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +761,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
{
void *ptr;
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
if (ptr)
return ptr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
while (nr_nodes) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(
+ addr = __alloc_bootmem_node_nopanic_notzeroed(
NODE_DATA(hstate_next_node_to_alloc(h,
&node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit, int zeroed)
{
void *ptr;
u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
return NULL;
ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
+ if (zeroed)
+ memset(ptr, 0, size);
memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
return ptr;
}
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit)
+ unsigned long limit,
+ int zeroed)
{
void *ptr;
again:
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, limit);
+ ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, limit, zeroed);
if (ptr)
return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, limit);
+ ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, limit, zeroed);
if (ptr)
return ptr;
@@ -314,13 +322,22 @@ again:
return NULL;
}
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
}
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
{
void *ptr;
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
if (ptr)
return ptr;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ SMP_CACHE_BYTES, goal, limit, 1);
if (!p && limit) {
limit = 0;
goto again;
--
1.8.1.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
WARNING: multiple messages have this Message-ID (diff)
From: Robin Holt <holt@sgi.com>
To: Michal Hocko <mhocko@suse.cz>, Cliff Wickman <cpw@sgi.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org,
wli@holomorphy.com
Subject: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2
Date: Tue, 2 Apr 2013 21:43:44 -0500 [thread overview]
Message-ID: <20130403024344.GA4384@sgi.com> (raw)
In-Reply-To: <20130314085138.GA11636@dhcp22.suse.cz>
Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.
Large system sites would at times like to allocate a very large amount
of memory as 1GB pages. They would put this on the kernel boot line:
default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
start_kernel
kernel_init
do_pre_smp_initcalls
hugetlb_init
hugetlb_init_hstates
hugetlb_hstate_alloc_pages
Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems). This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems. A benchmark attempting to reserve a TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocation. 32TB would take 8 hours.
Signed-off-by: Robin Holt <holt@sgi.com>
To: Cliff Wickman <cpw@sgi.com>
To: Michal Hocko <mhocko@suse.cz>
Cc: lkml <linux-kernel@vger.kernel.org>
Cc: Linux mm <linux-mm@kvack.org>
Cc: x86 Maintainers <x86@kernel.org>
---
Changes since -v1
- Reworked to remove the special NO_ZERO flag and push that down further
in the call chain.
Note: I compiled this only with a .config which specified
CONFIG_NO_BOOTMEM (x86_64). I have not tried a config which uses a
bootmem allocator.
include/linux/bootmem.h | 8 +++++++-
mm/bootmem.c | 21 +++++++++++++++++----
mm/hugetlb.c | 2 +-
mm/nobootmem.c | 37 +++++++++++++++++++++++++++----------
mm/sparse.c | 2 +-
5 files changed, 53 insertions(+), 17 deletions(-)
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+ pg_data_t *pgdat,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal);
void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int zeroed);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..b2e4027 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long goal, unsigned long limit,
+ int zeroed)
{
void *ptr;
if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
+ if (zeroed)
+ return kzalloc(size, GFP_NOWAIT);
+ else
+ return kmalloc(size, GFP_NOWAIT);
again:
/* do not panic in alloc_bootmem_bdata() */
@@ -733,13 +737,22 @@ again:
return NULL;
}
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
}
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +761,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
{
void *ptr;
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
if (ptr)
return ptr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
while (nr_nodes) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(
+ addr = __alloc_bootmem_node_nopanic_notzeroed(
NODE_DATA(hstate_next_node_to_alloc(h,
&node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit, int zeroed)
{
void *ptr;
u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
return NULL;
ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
+ if (zeroed)
+ memset(ptr, 0, size);
memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
return ptr;
}
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit)
+ unsigned long limit,
+ int zeroed)
{
void *ptr;
again:
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, limit);
+ ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, limit, zeroed);
if (ptr)
return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, limit);
+ ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, limit, zeroed);
if (ptr)
return ptr;
@@ -314,13 +322,22 @@ again:
return NULL;
}
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
}
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
{
void *ptr;
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
if (ptr)
return ptr;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ SMP_CACHE_BYTES, goal, limit, 1);
if (!p && limit) {
limit = 0;
goto again;
--
1.8.1.2
next prev parent reply other threads:[~2013-04-03 2:43 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-06 21:50 [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot Cliff Wickman
2013-03-06 21:50 ` Cliff Wickman
2013-03-10 5:55 ` Hillf Danton
2013-03-10 5:55 ` Hillf Danton
2013-03-11 12:32 ` Cliff Wickman
2013-03-11 12:32 ` Cliff Wickman
2013-03-14 8:51 ` Michal Hocko
2013-03-14 8:51 ` Michal Hocko
2013-04-03 2:43 ` Robin Holt [this message]
2013-04-03 2:43 ` [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2 Robin Holt
2013-04-03 14:00 ` Michal Hocko
2013-04-03 14:00 ` Michal Hocko
2013-04-03 17:21 ` Robin Holt
2013-04-03 17:21 ` Robin Holt
2013-04-04 8:17 ` Michal Hocko
2013-04-04 8:17 ` Michal Hocko
2013-04-03 14:02 ` Michal Hocko
2013-04-03 14:02 ` Michal Hocko
2013-04-03 17:00 ` Robin Holt
2013-04-03 17:00 ` Robin Holt
2013-04-04 8:08 ` Michal Hocko
2013-04-04 8:08 ` Michal Hocko
2013-04-04 0:17 ` [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot Simon Jeons
2013-04-04 0:17 ` Simon Jeons
2013-04-04 12:16 ` Cliff Wickman
2013-04-04 12:16 ` Cliff Wickman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130403024344.GA4384@sgi.com \
--to=holt@sgi.com \
--cc=cpw@sgi.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@suse.cz \
--cc=wli@holomorphy.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.