From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: Yinghai Lu <yinghai@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
"H. Peter Anvin" <hpa@zytor.com>,
"Eric W. Biederman" <ebiederm@xmission.com>,
linux-kernel@vger.kernel.org
Subject: Re: [PATCH v5 03/13] x86, 64bit: Set extra ident mapping for whole kernel range
Date: Fri, 21 Dec 2012 17:28:40 -0500 [thread overview]
Message-ID: <20121221222840.GB1102@phenom.dumpdata.com> (raw)
In-Reply-To: <1354089042-10023-4-git-send-email-yinghai@kernel.org>
On Tue, Nov 27, 2012 at 11:50:32PM -0800, Yinghai Lu wrote:
> Currently, when the kernel is loaded above 1G, only [_text, _text+2M] is set
> up with an extra ident page table.
> That is not enough: some variables that could be used early are outside of
> that range, such as the BRK area for the early page table.
> We need to set up a mapping for [_text, _end] to include text/data/bss/brk...
>
> Also, the kernel is currently not allowed to be loaded above 512g, since it
> thinks that address is too big.
> We need to add one extra spare page for level3 to point that 512g range.
> Need to check _text range and set level4 pg with that spare level3 page,
> and set level3 with level2 page to cover [_text, _end] with extra mapping.
>
> At last, to handle crossing GB boundary, we need to add another
> level2 spare page. To handle crossing 512GB boundary, we need to
> add another level3 spare page to next 512G range.
>
> Tested with kexec-tools using local test code to force loading the kernel
> across the 1G, 5G, 512G, and 513G boundaries.
>
> We need this to put relocatable 64bit bzImage high above 1g.
>
> -v4: add crossing GB boundary handling.
> -v5: use spare pages from BRK, so could save pages when kernel is not
> loaded above 1GB.
>
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> arch/x86/kernel/head_64.S | 203 +++++++++++++++++++++++++++++++++++++++++----
> 1 files changed, 187 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 94bf9cc..338799a 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -20,6 +20,7 @@
> #include <asm/processor-flags.h>
> #include <asm/percpu.h>
> #include <asm/nops.h>
> +#include <asm/setup.h>
>
> #ifdef CONFIG_PARAVIRT
> #include <asm/asm-offsets.h>
> @@ -42,6 +43,13 @@ L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
> L4_START_KERNEL = pgd_index(__START_KERNEL_map)
> L3_START_KERNEL = pud_index(__START_KERNEL_map)
>
> +/* two for level3, and two for level2 */
> +SPARE_MAP_SIZE = (4 * PAGE_SIZE)
> +RESERVE_BRK(spare_map, SPARE_MAP_SIZE)
Perhaps 'spare_directory' ? Or 'spare_table' ?
> +
> +#define spare_page(x) (__brk_base + (x) * PAGE_SIZE)
> +#define add_one_spare_page addq $PAGE_SIZE, _brk_end(%rip)
> +
> .text
> __HEAD
> .code64
> @@ -78,12 +86,6 @@ startup_64:
> testl %eax, %eax
> jnz bad_address
>
> - /* Is the address too large? */
> - leaq _text(%rip), %rdx
> - movq $PGDIR_SIZE, %rax
> - cmpq %rax, %rdx
> - jae bad_address
> -
> /* Fixup the physical addresses in the page table
> */
> addq %rbp, init_level4_pgt + 0(%rip)
> @@ -97,25 +99,196 @@ startup_64:
>
> addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
>
> - /* Add an Identity mapping if I am above 1G */
> + /* Add an Identity mapping if _end is above 1G */
> + leaq _end(%rip), %r9
> + decq %r9
> + cmp $PUD_SIZE, %r9
> + jl ident_complete
> +
> + /* Clear spare pages */
> + leaq __brk_base(%rip), %rdi
> + xorq %rax, %rax
> + movq $(SPARE_MAP_SIZE/8), %rcx
> +1: decq %rcx
> + movq %rax, (%rdi)
> + leaq 8(%rdi), %rdi
> + jnz 1b
> +
> + /* get end */
> + andq $PMD_PAGE_MASK, %r9
> + /* round start to 1G if it is below 1G */
> leaq _text(%rip), %rdi
> andq $PMD_PAGE_MASK, %rdi
> + cmp $PUD_SIZE, %rdi
> + jg 1f
> + movq $PUD_SIZE, %rdi
> +1:
> + /* get 512G index */
> + movq %r9, %r8
> + shrq $PGDIR_SHIFT, %r8
> + andq $(PTRS_PER_PGD - 1), %r8
> + movq %rdi, %rax
> + shrq $PGDIR_SHIFT, %rax
> + andq $(PTRS_PER_PGD - 1), %rax
> +
> + /* cross two 512G ? */
> + cmp %r8, %rax
> + jne set_level3_other_512g
> +
> + /* all in first 512G ? */
> + cmp $0, %rax
> + je skip_level3_spare
> +
> + /* same 512G other than first 512g */
> + /*
> + * We need one level3, one or two level 2,
> + * so use first one for level3.
> + */
> + leaq (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + leaq init_level4_pgt(%rip), %rbx
> + movq %rdx, 0(%rbx, %rax, 8)
> + addq $L4_PAGE_OFFSET, %rax
> + movq %rdx, 0(%rbx, %rax, 8)
> + /* one level3 in BRK */
> + add_one_spare_page
> +
> + /* get 1G index */
> + movq %r9, %r8
> + shrq $PUD_SHIFT, %r8
> + andq $(PTRS_PER_PUD - 1), %r8
> + movq %rdi, %rax
> + shrq $PUD_SHIFT, %rax
> + andq $(PTRS_PER_PUD - 1), %rax
> +
> + /* same 1G ? */
> + cmp %r8, %rax
> + je set_level2_start_only_not_first_512g
> +
> + /* set level2 for end */
> + leaq spare_page(0)(%rip), %rbx
> + leaq (spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + movq %rdx, 0(%rbx, %r8, 8)
> + /* second one level2 in BRK */
> + add_one_spare_page
> +
> +set_level2_start_only_not_first_512g:
> + leaq spare_page(0)(%rip), %rbx
> + leaq (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + movq %rdx, 0(%rbx, %rax, 8)
> + /* first one level2 in BRK */
> + add_one_spare_page
> +
> + /* one spare level3 before level2*/
> + leaq spare_page(1)(%rip), %rbx
> + jmp set_level2_spare
> +
> +set_level3_other_512g:
> + /*
> + * We need one or two level3, and two level2,
> + * so use first two for level2.
> + */
> + /* for level2 last on first 512g */
> + leaq level3_ident_pgt(%rip), %rcx
> + /* start is in first 512G ? */
> + cmp $0, %rax
> + je set_level2_start_other_512g
>
> + /* Set level3 for _text */
> + leaq (spare_page(3) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + leaq init_level4_pgt(%rip), %rbx
> + movq %rdx, 0(%rbx, %rax, 8)
> + addq $L4_PAGE_OFFSET, %rax
> + movq %rdx, 0(%rbx, %rax, 8)
> + /* first one level3 in BRK */
> + add_one_spare_page
> +
> + /* for level2 last not on first 512G */
> + leaq spare_page(3)(%rip), %rcx
> +
> +set_level2_start_other_512g:
> + /* always need to set level2 */
> movq %rdi, %rax
> shrq $PUD_SHIFT, %rax
> andq $(PTRS_PER_PUD - 1), %rax
> - jz ident_complete
> + movq %rcx, %rbx /* %rcx : level3 spare or level3_ident_pgt */
> + leaq (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + movq %rdx, 0(%rbx, %rax, 8)
> + /* first one level2 in BRK */
> + add_one_spare_page
>
> - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> +set_level3_end_other_512g:
> + leaq (spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + leaq init_level4_pgt(%rip), %rbx
> + movq %rdx, 0(%rbx, %r8, 8)
> + addq $L4_PAGE_OFFSET, %r8
> + movq %rdx, 0(%rbx, %r8, 8)
> + /* second one level3 in BRK */
> + add_one_spare_page
> +
> + /* always need to set level2 */
> + movq %r9, %r8
> + shrq $PUD_SHIFT, %r8
> + andq $(PTRS_PER_PUD - 1), %r8
> + leaq spare_page(2)(%rip), %rbx
> + leaq (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + movq %rdx, 0(%rbx, %r8, 8)
> + /* second one level2 in BRK */
> + add_one_spare_page
> +
> + /* no spare level3 before level2 */
> + leaq spare_page(0)(%rip), %rbx
> + jmp set_level2_spare
> +
> +skip_level3_spare:
> + /* We have one or two level2 */
> + /* get 1G index */
> + movq %r9, %r8
> + shrq $PUD_SHIFT, %r8
> + andq $(PTRS_PER_PUD - 1), %r8
> + movq %rdi, %rax
> + shrq $PUD_SHIFT, %rax
> + andq $(PTRS_PER_PUD - 1), %rax
> +
> + /* same 1G ? */
> + cmp %r8, %rax
> + je set_level2_start_only_first_512g
> +
> + /* set level2 without level3 spare */
> + leaq level3_ident_pgt(%rip), %rbx
> + leaq (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> + movq %rdx, 0(%rbx, %r8, 8)
> + /* second one level2 in BRK */
> + add_one_spare_page
> +
> +set_level2_start_only_first_512g:
> + /* set level2 without level3 spare */
> leaq level3_ident_pgt(%rip), %rbx
> + leaq (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
> movq %rdx, 0(%rbx, %rax, 8)
> + /* first one level2 in BRK */
> + add_one_spare_page
>
> + /* no spare level3 */
> + leaq spare_page(0)(%rip), %rbx
> +
> +set_level2_spare:
> movq %rdi, %rax
> shrq $PMD_SHIFT, %rax
> andq $(PTRS_PER_PMD - 1), %rax
> leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
> - leaq level2_spare_pgt(%rip), %rbx
> - movq %rdx, 0(%rbx, %rax, 8)
> + /* %rbx is set before */
> + movq %r9, %r8
> + shrq $PMD_SHIFT, %r8
> + andq $(PTRS_PER_PMD - 1), %r8
> + cmp %r8, %rax
> + jl 1f
> + addq $PTRS_PER_PMD, %r8
> +1: movq %rdx, 0(%rbx, %rax, 8)
> + addq $PMD_SIZE, %rdx
> + incq %rax
> + cmp %r8, %rax
> + jle 1b
> +
> ident_complete:
>
> /*
> @@ -423,11 +596,9 @@ NEXT_PAGE(level2_kernel_pgt)
> * If you want to increase this then increase MODULES_VADDR
> * too.)
> */
> - PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
> - KERNEL_IMAGE_SIZE/PMD_SIZE)
> -
> -NEXT_PAGE(level2_spare_pgt)
> - .fill 512, 8, 0
> + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
> + /* hold the whole page */
> + .fill (PTRS_PER_PMD - (KERNEL_IMAGE_SIZE/PMD_SIZE)), 8, 0
>
> #undef PMDS
> #undef NEXT_PAGE
> --
> 1.7.7
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
next prev parent reply other threads:[~2012-12-21 22:29 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-11-28 7:50 [PATCH v5 00/13] x86, boot, 64bit: Add support for loading ramdisk and bzImage above 4G Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 01/13] x86, boot: move verify_cpu.S after 0x200 Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 02/13] x86, boot: Move lldt/ltr out of 64bit code section Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 03/13] x86, 64bit: Set extra ident mapping for whole kernel range Yinghai Lu
2012-12-21 22:28 ` Konrad Rzeszutek Wilk [this message]
2012-12-21 22:35 ` Yinghai Lu
2012-12-21 22:39 ` H. Peter Anvin
2012-12-21 22:51 ` Yinghai Lu
2012-12-21 22:54 ` H. Peter Anvin
2012-12-21 23:40 ` Konrad Rzeszutek Wilk
2012-11-28 7:50 ` [PATCH v5 04/13] x86: Merge early_reserve_initrd for 32bit and 64bit Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 05/13] x86: add get_ramdisk_image/size() Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 06/13] x86, boot: add get_cmd_line_ptr() Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 07/13] x86, boot: move checking of cmd_line_ptr out of common path Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 08/13] x86, boot: update cmd_line_ptr to unsigned long Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 09/13] x86: use io_remap to access real_mode_data Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 10/13] x86, boot: add fields to support load bzImage and ramdisk above 4G Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 11/13] x86: remove 1024G limitation for kexec buffer on 64bit Yinghai Lu
2012-11-28 7:50 ` [PATCH v5 12/13] x86, 64bit: Print init kernel lowmap correctly Yinghai Lu
2012-12-21 22:26 ` Konrad Rzeszutek Wilk
2012-12-21 22:44 ` Yinghai Lu
2012-12-21 23:39 ` Konrad Rzeszutek Wilk
2012-12-21 23:52 ` Yinghai Lu
2012-12-22 2:14 ` Konrad Rzeszutek Wilk
2012-11-28 7:50 ` [PATCH v5 13/13] x86, mm: Fix page table early allocation offset checking Yinghai Lu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20121221222840.GB1102@phenom.dumpdata.com \
--to=konrad.wilk@oracle.com \
--cc=ebiederm@xmission.com \
--cc=hpa@zytor.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=tglx@linutronix.de \
--cc=yinghai@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox