* [PATCH RFC] x86: add brk allocation for very, very early allocations
@ 2009-02-27 17:58 Jeremy Fitzhardinge
2009-02-27 18:43 ` H. Peter Anvin
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-27 17:58 UTC (permalink / raw)
To: Yinghai Lu, H. Peter Anvin, Ingo Molnar
Cc: the arch/x86 maintainers, Linux Kernel Mailing List
[
I'd like to add a mechanism like this so I can dynamically allocate some
Xen-related structures, rather than statically allocating them in the bss,
both so that Xen has less overhead when it isn't being used, and so I can
scale better to things like memory size.
I think this is more widely useful; it would supplant dmi_alloc_data[], for
example, and I'm sure there are other cases.
This is fundamentally the same as head_32.S's extension of the bss to build
the initial kernel mapping, but 64-bit doesn't currently do anything analogous
to this.
Unfortunately when I use this code as-is I'm getting crashes when the slab
allocator starts up. I think this is all correct, but I'm wondering if
there's something I'm overlooking which is broken in principle.
So, what am I missing?
Thanks,
J
]
Add a brk()-like allocator which effectively extends the bss
in order to allow very early code to do dynamic allocations.
This is better than using statically allocated arrays for
data in subsystems which may never get used.
The amount of space available depends on how much the initial
kernel mappings have covered, and so is fairly limited.
Not-Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 66801cb..e6b754b 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -99,6 +99,11 @@ extern struct boot_params boot_params;
*/
#define LOWMEMSIZE() (0x9f000)
+/* exceedingly early brk-like allocator */
+extern unsigned long _brk_start, _brk_end;
+void init_brk(unsigned long start);
+void *extend_brk(size_t size, size_t align);
+
#ifdef __i386__
void __init i386_start_kernel(void);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1..fa9ae31 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,6 +34,8 @@ void __init i386_start_kernel(void)
reserve_ebda_region();
+ init_brk((unsigned long)__va(init_pg_tables_end));
+
/*
* At this point everything still needed from the boot loader
* or BIOS or kernel text should be early reserved or marked not
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b2722..4b29802 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,6 +91,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");
+ init_brk((unsigned long)&_end);
+
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0d051b4..8899cfa 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
#endif
unsigned int boot_cpu_id __read_mostly;
+__initdata unsigned long _brk_start, _brk_end;
#ifdef CONFIG_X86_64
int default_cpu_present_to_apicid(int mps_cpu)
@@ -335,6 +336,26 @@ static void __init relocate_initrd(void)
}
#endif
+void __init init_brk(unsigned long brk)
+{
+ _brk_start = _brk_end = brk;
+}
+
+void * __init extend_brk(size_t size, size_t align)
+{
+ size_t mask = align - 1;
+ void *ret;
+
+ BUG_ON(align & mask);
+
+ _brk_end = (_brk_end + mask) & ~mask;
+
+ ret = (void *)_brk_end;
+ _brk_end += size;
+
+ return ret;
+}
+
static void __init reserve_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -727,11 +748,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
- init_mm.brk = (unsigned long) &_end;
-#endif
+ init_mm.brk = _brk_end;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
@@ -897,6 +914,9 @@ void __init setup_arch(char **cmdline_p)
acpi_numa_init();
#endif
+ if (_brk_end > _brk_start)
+ reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
initmem_init(0, max_pfn);
#ifdef CONFIG_ACPI_SLEEP
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd5df65..4222ed6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
@@ -95,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -707,7 +708,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, _brk_end))
return 0;
/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 1815b8a..a3cc9ff 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -975,6 +975,9 @@ asmlinkage void __init xen_start_kernel(void)
init_mm.pgd = pgd;
+ /* Set up very early brk allocator after Xen pagetables */
+ init_brk(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE);
+
/* keep using Xen gdt for now; no urgent need to change it */
pv_info.kernel_rpl = 1;
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 17:58 [PATCH RFC] x86: add brk allocation for very, very early allocations Jeremy Fitzhardinge
@ 2009-02-27 18:43 ` H. Peter Anvin
2009-02-27 19:17 ` H. Peter Anvin
2009-02-27 20:26 ` Yinghai Lu
2 siblings, 0 replies; 10+ messages in thread
From: H. Peter Anvin @ 2009-02-27 18:43 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
Jeremy Fitzhardinge wrote:
> I'd like to add a mechanism like this so I can dynamically allocate some
> Xen-related structures, rather than statically allocating them in the bss,
> both so that Xen has less overhead when it isn't being used, and so I can
> scale better to things like memory size.
>
> I think this is more widely useful; it would supplant dmi_alloc_data[], for
> example, and I'm sure there's other cases.
>
> This is fundimentally the same as head_32.S's extension of the bss to build
> the initial kernel mapping, but 64-bit doesn't currently do anything
> analogous
> to this.
>
> Unfortunately when I use this code as-is I'm getting crashes when the slab
> allocator starts up. I think this is all correct, but I'm wondering if
> there's something I'm overlooking which is broken in principle.
First of all, I like the concept. As far as getting crashes, I suspect
what you're finding is some use of this type of extended memory space
that just isn't documented. I would try this in a simulator, setting a
watchpoint on _end to see if you get any hits before the slab allocator
starts.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 17:58 [PATCH RFC] x86: add brk allocation for very, very early allocations Jeremy Fitzhardinge
2009-02-27 18:43 ` H. Peter Anvin
@ 2009-02-27 19:17 ` H. Peter Anvin
2009-02-27 19:26 ` Jeremy Fitzhardinge
2009-02-27 20:26 ` Yinghai Lu
2 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2009-02-27 19:17 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
Jeremy Fitzhardinge wrote:
>
> unsigned int boot_cpu_id __read_mostly;
> +__initdata unsigned long _brk_start, _brk_end;
>
Better yet, initialize _brk_start and _brk_end statically:
extern const char _end[];
__initdata unsigned long _brk_start = (unsigned long)&_end;
__initdata unsigned long _brk_end = (unsigned long)&_end;
That way it's available from the first instruction, and we can fully
remove the x86-32 init_pg_tables_end and replace it with _brk_end (thus
putting the initial page tables in the brk.)
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 19:17 ` H. Peter Anvin
@ 2009-02-27 19:26 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-27 19:26 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
H. Peter Anvin wrote:
> Jeremy Fitzhardinge wrote:
>
>> unsigned int boot_cpu_id __read_mostly;
>> +__initdata unsigned long _brk_start, _brk_end;
>>
>>
>
> Better yet, initialize _brk_start and _brk_end statically:
>
> extern const char _end[];
> __initdata unsigned long _brk_start = (unsigned long)&_end;
> __initdata unsigned long _brk_end = (unsigned long)&_end;
>
> That way it's available from the first instruction, and we can fully
> remove the x86-32 init_pg_tables_end and replace it with _brk_end (thus
> putting the initial page tables in the brk.)
Yes. It doesn't work for Xen (the domain builder puts the initrd and
Xen-built pagetable immediately after the kernel's bss), but it's no
problem to move the brk in that case.
J
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 17:58 [PATCH RFC] x86: add brk allocation for very, very early allocations Jeremy Fitzhardinge
2009-02-27 18:43 ` H. Peter Anvin
2009-02-27 19:17 ` H. Peter Anvin
@ 2009-02-27 20:26 ` Yinghai Lu
2009-02-27 21:02 ` Jeremy Fitzhardinge
2009-02-27 21:05 ` H. Peter Anvin
2 siblings, 2 replies; 10+ messages in thread
From: Yinghai Lu @ 2009-02-27 20:26 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: H. Peter Anvin, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
Jeremy Fitzhardinge wrote:
> [
> I'd like to add a mechanism like this so I can dynamically allocate some
> Xen-related structures, rather than statically allocating them in the bss,
> both so that Xen has less overhead when it isn't being used, and so I can
> scale better to things like memory size.
>
> I think this is more widely useful; it would supplant dmi_alloc_data[], for
> example, and I'm sure there's other cases.
>
> This is fundimentally the same as head_32.S's extension of the bss to build
> the initial kernel mapping, but 64-bit doesn't currently do anything
> analogous
> to this.
>
> Unfortunately when I use this code as-is I'm getting crashes when the slab
> allocator starts up. I think this is all correct, but I'm wondering if
> there's something I'm overlooking which is broken in principle.
>
> So, what am I missing?
>
> Thanks,
> J
> ]
>
>
> Add a brk()-like allocator which effectively extends the bss
> in order to allow very early code to do dynamic allocations.
> This is better than using statically allocated arrays for
> data in subsystems which may never get used.
>
> The amount of space available depends on how much the initial
> kernel mappings have covered, and so is fairly limited.
>
> Not-Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>
> diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
> index 66801cb..e6b754b 100644
> --- a/arch/x86/include/asm/setup.h
> +++ b/arch/x86/include/asm/setup.h
> @@ -99,6 +99,11 @@ extern struct boot_params boot_params;
> */
> #define LOWMEMSIZE() (0x9f000)
>
> +/* exceedingly early brk-like allocator */
> +extern unsigned long _brk_start, _brk_end;
> +void init_brk(unsigned long start);
> +void *extend_brk(size_t size, size_t align);
> +
> #ifdef __i386__
>
> void __init i386_start_kernel(void);
> diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
> index ac108d1..fa9ae31 100644
> --- a/arch/x86/kernel/head32.c
> +++ b/arch/x86/kernel/head32.c
> @@ -34,6 +34,8 @@ void __init i386_start_kernel(void)
>
> reserve_ebda_region();
>
> + init_brk((unsigned long)__va(init_pg_tables_end));
> +
> /*
> * At this point everything still needed from the boot loader
> * or BIOS or kernel text should be early reserved or marked not
> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
> index f5b2722..4b29802 100644
> --- a/arch/x86/kernel/head64.c
> +++ b/arch/x86/kernel/head64.c
> @@ -91,6 +91,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
> if (console_loglevel == 10)
> early_printk("Kernel alive\n");
>
> + init_brk((unsigned long)&_end);
> +
> x86_64_start_reservations(real_mode_data);
> }
>
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 0d051b4..8899cfa 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -113,6 +113,7 @@
> #endif
>
> unsigned int boot_cpu_id __read_mostly;
> +__initdata unsigned long _brk_start, _brk_end;
>
> #ifdef CONFIG_X86_64
> int default_cpu_present_to_apicid(int mps_cpu)
> @@ -335,6 +336,26 @@ static void __init relocate_initrd(void)
> }
> #endif
>
> +void __init init_brk(unsigned long brk)
> +{
> + _brk_start = _brk_end = brk;
> +}
> +
> +void * __init extend_brk(size_t size, size_t align)
> +{
> + size_t mask = align - 1;
> + void *ret;
> +
> + BUG_ON(align & mask);
> +
> + _brk_end = (_brk_end + mask) & ~mask;
> +
> + ret = (void *)_brk_end;
> + _brk_end += size;
> +
> + return ret;
> +}
> +
> static void __init reserve_initrd(void)
> {
> u64 ramdisk_image = boot_params.hdr.ramdisk_image;
> @@ -727,11 +748,7 @@ void __init setup_arch(char **cmdline_p)
> init_mm.start_code = (unsigned long) _text;
> init_mm.end_code = (unsigned long) _etext;
> init_mm.end_data = (unsigned long) _edata;
> -#ifdef CONFIG_X86_32
> - init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
> -#else
> - init_mm.brk = (unsigned long) &_end;
> -#endif
> + init_mm.brk = _brk_end;
>
> code_resource.start = virt_to_phys(_text);
> code_resource.end = virt_to_phys(_etext)-1;
> @@ -897,6 +914,9 @@ void __init setup_arch(char **cmdline_p)
> acpi_numa_init();
> #endif
>
> + if (_brk_end > _brk_start)
> + reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
> +
> initmem_init(0, max_pfn);
It seems the brk range (_brk_start.._brk_end) is reserved somewhat late?
init_memory_mapping(0,...) could already have allocated memory for the direct mapping page tables, and that allocation could start from _end...
YH
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 20:26 ` Yinghai Lu
@ 2009-02-27 21:02 ` Jeremy Fitzhardinge
2009-02-27 21:05 ` H. Peter Anvin
1 sibling, 0 replies; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-27 21:02 UTC (permalink / raw)
To: Yinghai Lu
Cc: H. Peter Anvin, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
Yinghai Lu wrote:
> it seems reserve _brk_end is some late?
>
> init_memory_mapping(0,...) could get some for direct mapping page table. and it could start from _end...
>
Ah, thanks! I was confused by the use of reserve_early() in
initmem_init(), but they're OK because they allocate out of the e820 map
at that point (and init_memory_mapping() reserves everything it allocates).
I'll give it a whirl.
J
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 20:26 ` Yinghai Lu
2009-02-27 21:02 ` Jeremy Fitzhardinge
@ 2009-02-27 21:05 ` H. Peter Anvin
2009-02-27 21:19 ` Jeremy Fitzhardinge
1 sibling, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2009-02-27 21:05 UTC (permalink / raw)
To: Yinghai Lu
Cc: Jeremy Fitzhardinge, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
Yinghai Lu wrote:
>
> it seems reserve _brk_end is some late?
>
> init_memory_mapping(0,...) could get some for direct mapping page table. and it could start from _end...
>
That would make sense. init_memory_mapping() needs to know what not to
step on, and that would include the brk at this stage. For 64-bit mode,
Xen is the only user of !PSE, and so may be particularly vulnerable to
this issue (because of massively larger direct mapping tables.)
This reminds me... is your intent that the BRK is permanent (unless
explicitly freed on a page by page basis) or part of the init memory
that is flushed?
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 21:05 ` H. Peter Anvin
@ 2009-02-27 21:19 ` Jeremy Fitzhardinge
2009-02-27 21:28 ` H. Peter Anvin
0 siblings, 1 reply; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-27 21:19 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
H. Peter Anvin wrote:
> That would make sense. init_memory_mapping() needs to know what not to
> step on, and that would include the brk at this stage. For 64-bit mode,
> Xen is the only user of !PSE, and so may be particularly vulnerable to
> this issue (because of massively larger direct mapping tables.)
>
Yep.
> This reminds me... is your intent that the BRK is permanent (unless
> explicitly freed on a page by page basis) or part of the init memory
> that is flushed?
>
Permanent. I'm using it to allocate things which are the moral
equivalent of the kernel pagetables.
J
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 21:19 ` Jeremy Fitzhardinge
@ 2009-02-27 21:28 ` H. Peter Anvin
2009-02-27 21:41 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2009-02-27 21:28 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
Jeremy Fitzhardinge wrote:
>
>> This reminds me... is your intent that the BRK is permanent (unless
>> explicitly freed on a page by page basis) or part of the init memory
>> that is flushed?
>
> Permanent. I'm using it to allocate things which are the moral
> equivalent of the kernel pagetables.
>
Cool; just wanted it noted what the rules were. Even more so the reason
to fold as many possible uses as possible into this new mechanism.
-hpa
P.S. Could you send me the updates to the dom0 patches you said you had?
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH RFC] x86: add brk allocation for very, very early allocations
2009-02-27 21:28 ` H. Peter Anvin
@ 2009-02-27 21:41 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2009-02-27 21:41 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Yinghai Lu, Ingo Molnar, the arch/x86 maintainers,
Linux Kernel Mailing List
H. Peter Anvin wrote:
> Cool; just wanted it noted what the rules were. Even more so the reason
> to fold as many possible uses as possible into this new mechanism.
>
Yes, I just converted i386 pagetable construction and dmi_alloc().
Boots under qemu.
> P.S. Could you send me the updates to the dom0 patches you said you had?
>
Was just tidying them up for posting but got slightly sidetracked. (The
TSS/io-bitmap changes are a prelude to the dom0 patches though.)
J
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2009-02-27 21:41 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-02-27 17:58 [PATCH RFC] x86: add brk allocation for very, very early allocations Jeremy Fitzhardinge
2009-02-27 18:43 ` H. Peter Anvin
2009-02-27 19:17 ` H. Peter Anvin
2009-02-27 19:26 ` Jeremy Fitzhardinge
2009-02-27 20:26 ` Yinghai Lu
2009-02-27 21:02 ` Jeremy Fitzhardinge
2009-02-27 21:05 ` H. Peter Anvin
2009-02-27 21:19 ` Jeremy Fitzhardinge
2009-02-27 21:28 ` H. Peter Anvin
2009-02-27 21:41 ` Jeremy Fitzhardinge
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox