Jeremy Fitzhardinge wrote: > Yinghai Lu wrote: >> Jeremy Fitzhardinge wrote: >> >>> Yinghai Lu wrote: >>> >>>> could have more explanation about the 1M size. >>>> because initial_pg_tables will sit in it. please consider to add >>>> something like >>>> >>>> in head_32.S >>>> >>>> LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT >>>> >>>> #if PTRS_PER_PMD > 1 >>>> PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD >>>> #else >>>> PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) >>>> #endif >>>> ALLOCATOR_SLOP = 4 >>>> >>> OK, how does this look: >>> >>> The following changes since commit >>> 21e8ba72daf5d7f0af33968f873499c85f96ccef: >>> Jeremy Fitzhardinge (1): >>> x86: use brk allocation for DMI >>> >>> are available in the git repository at: >>> >>> git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git >>> push/x86/brk >>> >>> Jeremy Fitzhardinge (1): >>> x86: allow extend_brk users to reserve brk space >>> >>> Yinghai Lu (1): >>> x86-32: compute initial mapping size more accurately >>> >>> arch/x86/include/asm/page_32_types.h | 5 +++++ >>> arch/x86/include/asm/setup.h | 30 >>> ++++++++++++++++++++++++++++++ >>> arch/x86/kernel/head_32.S | 4 +++- >>> arch/x86/kernel/setup.c | 2 ++ >>> arch/x86/kernel/vmlinux_32.lds.S | 4 +++- >>> arch/x86/kernel/vmlinux_64.lds.S | 4 +++- >>> 6 files changed, 46 insertions(+), 3 deletions(-) >>> >>> git diff 21e8ba72daf5d7f0af33968f873499c85f96ccef..push/x86/brk >>> diff --git a/arch/x86/include/asm/page_32_types.h >>> b/arch/x86/include/asm/page_32_types.h >>> index f1e4a79..0f915ae 100644 >>> --- a/arch/x86/include/asm/page_32_types.h >>> +++ b/arch/x86/include/asm/page_32_types.h >>> @@ -39,6 +39,11 @@ >>> #define __VIRTUAL_MASK_SHIFT 32 >>> #endif /* CONFIG_X86_PAE */ >>> >>> +/* >>> + * Kernel image size is limited to 512 MB (see in >>> arch/x86/kernel/head_32.S) >>> + */ >>> +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) >>> + >>> #ifndef __ASSEMBLY__ >>> >>> /* >>> diff --git 
a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h >>> index 366d366..61b126b 100644 >>> --- a/arch/x86/include/asm/setup.h >>> +++ b/arch/x86/include/asm/setup.h >>> @@ -104,6 +104,29 @@ extern struct boot_params boot_params; >>> extern unsigned long _brk_end; >>> void *extend_brk(size_t size, size_t align); >>> >>> +/* >>> + * Reserve space in the brk section. The name must be unique within >>> + * the file, and somewhat descriptive. The size is in bytes. Must be >>> + * used at file scope. >>> + * >>> + * (This uses a temp function to wrap the asm so we can pass it the >>> + * size parameter; otherwise we wouldn't be able to. We can't use a >>> + * "section" attribute on a normal variable because it always ends up >>> + * being @progbits, which ends up allocating space in the vmlinux >>> + * executable.) >>> + */ >>> +#define RESERVE_BRK(name,sz) \ >>> + static void __section(.discard) __used \ >>> + __brk_reservation_fn_##name##__(void) { \ >>> + asm volatile ( \ >>> + ".pushsection .brk_reservation,\"aw\",@nobits;" \ >>> + "__brk_reservation_" #name "__:" \ >>> + " 1:.skip %c0;" \ >>> + " .size __brk_reservation_" #name "__, . - 1b;" \ >>> + " .popsection" \ >>> + : : "i" (sz)); \ >>> + } >>> + >>> #ifdef __i386__ >>> >>> void __init i386_start_kernel(void); >>> @@ -115,6 +138,13 @@ void __init x86_64_start_reservations(char >>> *real_mode_data); >>> >>> #endif /* __i386__ */ >>> #endif /* _SETUP */ >>> +#else >>> +#define RESERVE_BRK(name,sz) \ >>> + .pushsection .brk_reservation,"aw",@nobits; \ >>> +__brk_reservation_##name##__: \ >>> +1: .skip sz; \ >>> + .size __brk_reservation_##name##__,.-1b; \ >>> + .popsection >>> #endif /* __ASSEMBLY__ */ >>> #endif /* __KERNEL__ */ >>> >>> diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S >>> index d243437..80dc05e 100644 >>> --- a/arch/x86/kernel/head_32.S >>> +++ b/arch/x86/kernel/head_32.S >>> @@ -54,7 +54,7 @@ >>> * >>> * This should be a multiple of a page. 
>>> */ >>> -LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) >>> +LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT >>> >>> /* >>> * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate >>> @@ -75,6 +75,8 @@ ALLOCATOR_SLOP = 4 >>> >>> INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + >>> ALLOCATOR_SLOP)*PAGE_SIZE_asm >>> >> >> no user for INIT_MAP_BEYOND_END any more. >> > > There are several remaining references: > > : abulafia:pts/0; grep INIT_MAP_BEYOND_END arch/x86/kernel/head_32.S > INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + > ALLOCATOR_SLOP)*PAGE_SIZE_asm > * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. > * End condition: we must map up to and including INIT_MAP_BEYOND_END > leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp > * End condition: we must map up to and including INIT_MAP_BEYOND_END > leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp > > Are you saying they're redundant and should be removed? please check attached ... Impact: cleanup Don't use ram after _end blindly for pagetables. aka init pages is before _end put those pg table into .bss v2: keep initial page table up to 512M only. 
v4: put initial page tables just before _end Signed-off-by: Yinghai Lu --- arch/x86/include/asm/page_32_types.h | 5 +++ arch/x86/kernel/head32.c | 3 + arch/x86/kernel/head_32.S | 55 ++++++++++++++--------------------- arch/x86/kernel/vmlinux_32.lds.S | 11 ++++++- 4 files changed, 40 insertions(+), 34 deletions(-) Index: linux-2.6/arch/x86/kernel/head32.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/head32.c +++ linux-2.6/arch/x86/kernel/head32.c @@ -18,7 +18,8 @@ void __init i386_start_kernel(void) { reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), + "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ Index: linux-2.6/arch/x86/kernel/head_32.S =================================================================== --- linux-2.6.orig/arch/x86/kernel/head_32.S +++ linux-2.6/arch/x86/kernel/head_32.S @@ -38,42 +38,30 @@ #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id /* - * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. - * We need: - * - one bit for each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) + * This is how much memory for page table to and including _end + * we need mapped initially. * - enough space to map all low memory, which means - * (2^32/4096) / 1024 pages (worst case, non PAE) - * (2^32/4096) / 512 + 4 pages (worst case for PAE) - * - a few pages for allocator use before the kernel pagetable has - * been set up + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. 
+ * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and smaller than max_low_pfn, otherwise some page table entries are wasted */ -LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) - -/* - * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate - * pagetables from above the 16MB DMA limit, so we'll have to set - * up pagetables 16MB more (worst-case): - */ -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) -LOW_PAGES = LOW_PAGES + 0x1000000 -#endif +LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT #if PTRS_PER_PMD > 1 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD #else PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) #endif -BOOTBITMAP_SIZE = LOW_PAGES / 8 ALLOCATOR_SLOP = 4 -INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm +INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, @@ -166,10 +154,9 @@ num_subarch_entries = (. - subarch_entri /* * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond _end. The variable - * init_pg_tables_end is set up to point to the first "safe" location. + * tables, which are located immediately beyond _end. * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. + * and PAGE_OFFSET for up to _end * * Note that the stack is not yet set up! */ @@ -209,14 +196,14 @@ default_entry: loop 11b /* - * End condition: we must map up to and including INIT_MAP_BEYOND_END - * bytes beyond the end of our own page tables. + * End condition: we must map up to the end. 
*/ - leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp + movl $pa(_end), %ebp + addl PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b 1: - movl %edi,pa(init_pg_tables_end) + movl %edi, pa(init_pg_tables_end) shrl $12, %eax movl %eax, pa(max_pfn_mapped) @@ -242,14 +229,14 @@ page_pde_offset = (__PAGE_OFFSET >> 20); addl $0x1000,%eax loop 11b /* - * End condition: we must map up to and including INIT_MAP_BEYOND_END - * bytes beyond the end of our own page tables; the +0x007 is + * End condition: we must map up to end, the +0x007 is * the attribute bits */ - leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp + movl $pa(_end), %ebp + addl PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b - movl %edi,pa(init_pg_tables_end) + movl %edi, pa(init_pg_tables_end) shrl $12, %eax movl %eax, pa(max_pfn_mapped) @@ -636,6 +623,10 @@ swapper_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 + +.section ".bss.extra_page_aligned","wa" + .align PAGE_SIZE_asm + .fill INIT_MAP_SIZE,1,0 /* * This starts the data section. */ Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S =================================================================== --- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S +++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S @@ -189,10 +189,13 @@ SECTIONS *(.bss) . = ALIGN(4); __bss_stop = .; - _end = . ; + /* extra_page_aligned must be last one before end*/ /* This is where the kernel creates the early boot page tables */ . = ALIGN(PAGE_SIZE); pg0 = . ; + *(.bss.extra_page_aligned) + . = ALIGN(8); + _end = . 
; } /* Sections to be discarded */ @@ -205,6 +208,12 @@ SECTIONS DWARF_DEBUG } +/* + * Build-time check on the image size: + */ +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + #ifdef CONFIG_KEXEC /* Link time checks */ #include <asm/kexec.h> Index: linux-2.6/arch/x86/include/asm/page_32_types.h =================================================================== --- linux-2.6.orig/arch/x86/include/asm/page_32_types.h +++ linux-2.6/arch/x86/include/asm/page_32_types.h @@ -39,6 +39,11 @@ #define __VIRTUAL_MASK_SHIFT 32 #endif /* CONFIG_X86_PAE */ +/* + * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) + */ +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) + #ifndef __ASSEMBLY__ /*