linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yinghai Lu <yinghai@kernel.org>
To: Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
	"H. Peter Anvin" <hpa@zytor.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, Yinghai Lu <yinghai@kernel.org>
Subject: [PATCH v6 06/27] x86, 64bit: Set extra ident mapping for whole kernel range
Date: Thu, 13 Dec 2012 14:02:00 -0800	[thread overview]
Message-ID: <1355436141-8668-7-git-send-email-yinghai@kernel.org> (raw)
In-Reply-To: <1355436141-8668-1-git-send-email-yinghai@kernel.org>

Currently, when the kernel is loaded above 1G, only [_text, _text+2M] is
set up with an extra ident page table.
That is not enough; some variables that could be used early are outside
that range, like the BRK area used for the early page table.
We need to set up the mapping for [_text, _end] to include text/data/bss/brk...

Also, the kernel is currently not allowed to be loaded above 512G, because
that address is considered too big.
We need to add one extra spare page for level3 to point to that 512G range.
We need to check the _text range and set the level4 pgd with that spare
level3 page, and set level3 with a level2 page to cover [_text, _end] with
the extra mapping.

Finally, to handle crossing a GB boundary, we need to add another
level2 spare page. To handle crossing a 512GB boundary, we need to
add another level3 spare page for the next 512G range.

Tested with kexec-tools using local test code to force loading the kernel
across the 1G, 5G, 512G, and 513G boundaries.

We need this to put a relocatable 64-bit bzImage high above 1G.

-v4: add crossing GB boundary handling.
-v5: use spare pages from BRK, so could save pages when kernel is not
	loaded above 1GB.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/x86/kernel/head_64.S |  203 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 187 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c..7d13874 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -20,6 +20,7 @@
 #include <asm/processor-flags.h>
 #include <asm/percpu.h>
 #include <asm/nops.h>
+#include <asm/setup.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
@@ -42,6 +43,13 @@ L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
 L4_START_KERNEL = pgd_index(__START_KERNEL_map)
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
+/* two for level3, and two for level2 */
+SPARE_MAP_SIZE = (4 * PAGE_SIZE)
+RESERVE_BRK(spare_map, SPARE_MAP_SIZE)
+
+#define spare_page(x)	(__brk_base + (x) * PAGE_SIZE)
+#define add_one_spare_page	addq $PAGE_SIZE, _brk_end(%rip)
+
 	.text
 	__HEAD
 	.code64
@@ -78,12 +86,6 @@ startup_64:
 	testl	%eax, %eax
 	jnz	bad_address
 
-	/* Is the address too large? */
-	leaq	_text(%rip), %rdx
-	movq	$PGDIR_SIZE, %rax
-	cmpq	%rax, %rdx
-	jae	bad_address
-
 	/* Fixup the physical addresses in the page table
 	 */
 	addq	%rbp, init_level4_pgt + 0(%rip)
@@ -97,25 +99,196 @@ startup_64:
 
 	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-	/* Add an Identity mapping if I am above 1G */
+	/* Add an Identity mapping if _end is above 1G */
+	leaq	_end(%rip), %r9
+	decq	%r9
+	cmp	$PUD_SIZE, %r9
+	jl	ident_complete
+
+	/* Clear spare pages */
+	leaq	__brk_base(%rip), %rdi
+	xorq	%rax, %rax
+	movq	$(SPARE_MAP_SIZE/8), %rcx
+1:	decq	%rcx
+	movq	%rax, (%rdi)
+	leaq	8(%rdi), %rdi
+	jnz	1b
+
+	/* get end */
+	andq	$PMD_PAGE_MASK, %r9
+	/* round start to 1G if it is below 1G */
 	leaq	_text(%rip), %rdi
 	andq	$PMD_PAGE_MASK, %rdi
+	cmp	$PUD_SIZE, %rdi
+	jg	1f
+	movq	$PUD_SIZE, %rdi
+1:
+	/* get 512G index */
+	movq	%r9, %r8
+	shrq	$PGDIR_SHIFT, %r8
+	andq	$(PTRS_PER_PGD - 1), %r8
+	movq	%rdi, %rax
+	shrq	$PGDIR_SHIFT, %rax
+	andq	$(PTRS_PER_PGD - 1), %rax
+
+	/* cross two 512G ? */
+	cmp	%r8, %rax
+	jne	set_level3_other_512g
+
+	/* all in first 512G ? */
+	cmp	$0, %rax
+	je	skip_level3_spare
+
+	/* same 512G other than first 512g */
+	/*
+	 * We need one level3, one or two level 2,
+	 * so use first one for level3.
+	 */
+	leaq    (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq    init_level4_pgt(%rip), %rbx
+	movq    %rdx, 0(%rbx, %rax, 8)
+	addq    $L4_PAGE_OFFSET, %rax
+	movq    %rdx, 0(%rbx, %rax, 8)
+	/* one level3 in BRK */
+	add_one_spare_page
+
+	/* get 1G index */
+	movq    %r9, %r8
+	shrq    $PUD_SHIFT, %r8
+	andq    $(PTRS_PER_PUD - 1), %r8
+	movq    %rdi, %rax
+	shrq    $PUD_SHIFT, %rax
+	andq    $(PTRS_PER_PUD - 1), %rax
+
+	/* same 1G ? */
+	cmp     %r8, %rax
+	je	set_level2_start_only_not_first_512g
+
+	/* set level2 for end */
+	leaq    spare_page(0)(%rip), %rbx
+	leaq    (spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq    %rdx, 0(%rbx, %r8, 8)
+	/* second one level2 in BRK */
+	add_one_spare_page
+
+set_level2_start_only_not_first_512g:
+	leaq    spare_page(0)(%rip), %rbx
+	leaq    (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq    %rdx, 0(%rbx, %rax, 8)
+	/* first one level2 in BRK */
+	add_one_spare_page
+
+	/* one spare level3 before level2*/
+	leaq    spare_page(1)(%rip), %rbx
+	jmp	set_level2_spare
+
+set_level3_other_512g:
+	/*
+	 * We need one or two level3, and two level2,
+	 * so use first two for level2.
+	 */
+	/* for level2 last on first 512g */
+	leaq	level3_ident_pgt(%rip), %rcx
+	/* start is in first 512G ? */
+	cmp	$0, %rax
+	je	set_level2_start_other_512g
+
+	/* Set level3 for _text */
+	leaq	(spare_page(3) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	init_level4_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+	addq	$L4_PAGE_OFFSET, %rax
+	movq	%rdx, 0(%rbx, %rax, 8)
+	/* first one level3 in BRK */
+	add_one_spare_page
+
+	/* for level2 last not on first 512G */
+	leaq	spare_page(3)(%rip), %rcx
 
+set_level2_start_other_512g:
+	/* always need to set level2 */
 	movq	%rdi, %rax
 	shrq	$PUD_SHIFT, %rax
 	andq	$(PTRS_PER_PUD - 1), %rax
-	jz	ident_complete
+	movq	%rcx, %rbx  /* %rcx : level3 spare or level3_ident_pgt */
+	leaq	(spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq	%rdx, 0(%rbx, %rax, 8)
+	/* first one level2 in BRK */
+	add_one_spare_page
+
+set_level3_end_other_512g:
+	leaq	(spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	init_level4_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %r8, 8)
+	addq	$L4_PAGE_OFFSET, %r8
+	movq	%rdx, 0(%rbx, %r8, 8)
+	/* second one level3 in BRK */
+	add_one_spare_page
+
+	/* always need to set level2 */
+	movq	%r9, %r8
+	shrq	$PUD_SHIFT, %r8
+	andq	$(PTRS_PER_PUD - 1), %r8
+	leaq	spare_page(2)(%rip), %rbx
+	leaq	(spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq	%rdx, 0(%rbx, %r8, 8)
+	/* second one level2 in BRK */
+	add_one_spare_page
+
+	/* no spare level3 before level2 */
+	leaq    spare_page(0)(%rip), %rbx
+	jmp	set_level2_spare
+
+skip_level3_spare:
+	/* We have one or two level2 */
+	/* get 1G index */
+	movq	%r9, %r8
+	shrq	$PUD_SHIFT, %r8
+	andq	$(PTRS_PER_PUD - 1), %r8
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+
+	/* same 1G ? */
+	cmp	%r8, %rax
+	je	set_level2_start_only_first_512g
 
-	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	/* set level2 without level3 spare */
 	leaq	level3_ident_pgt(%rip), %rbx
+	leaq	(spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq	%rdx, 0(%rbx, %r8, 8)
+	/* second one level2 in BRK */
+	add_one_spare_page
+
+set_level2_start_only_first_512g:
+	/*  set level2 without level3 spare */
+	leaq	level3_ident_pgt(%rip), %rbx
+	leaq	(spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
 	movq	%rdx, 0(%rbx, %rax, 8)
+	/* first one level2 in BRK */
+	add_one_spare_page
+
+	/* no spare level3 */
+	leaq    spare_page(0)(%rip), %rbx
 
+set_level2_spare:
 	movq	%rdi, %rax
 	shrq	$PMD_SHIFT, %rax
 	andq	$(PTRS_PER_PMD - 1), %rax
 	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-	leaq	level2_spare_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
+	/* %rbx is set before */
+	movq	%r9, %r8
+	shrq	$PMD_SHIFT, %r8
+	andq	$(PTRS_PER_PMD - 1), %r8
+	cmp	%r8, %rax
+	jl	1f
+	addq	$PTRS_PER_PMD, %r8
+1:	movq	%rdx, 0(%rbx, %rax, 8)
+	addq	$PMD_SIZE, %rdx
+	incq	%rax
+	cmp	%r8, %rax
+	jle	1b
+
 ident_complete:
 
 	/*
@@ -439,11 +612,9 @@ NEXT_PAGE(level2_kernel_pgt)
 	 *  If you want to increase this then increase MODULES_VADDR
 	 *  too.)
 	 */
-	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
-		KERNEL_IMAGE_SIZE/PMD_SIZE)
-
-NEXT_PAGE(level2_spare_pgt)
-	.fill   512, 8, 0
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
+	/* hold the whole page */
+	.fill (PTRS_PER_PMD - (KERNEL_IMAGE_SIZE/PMD_SIZE)), 8, 0
 
 #undef PMDS
 #undef NEXT_PAGE
-- 
1.7.10.4


  parent reply	other threads:[~2012-12-13 22:08 UTC|newest]

Thread overview: 66+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-12-13 22:01 [PATCH v6 00/27] x86, boot, 64bit: Add support for loading ramdisk and bzImage above 4G Yinghai Lu
2012-12-13 22:01 ` [PATCH v6 01/27] x86, mm: Fix page table early allocation offset checking Yinghai Lu
2012-12-14 10:53   ` Borislav Petkov
2012-12-19  3:30     ` Yinghai Lu
2012-12-19 17:16       ` Borislav Petkov
2012-12-13 22:01 ` [PATCH v6 02/27] x86, mm: make pgd next calculation consistent with pud/pmd Yinghai Lu
2012-12-14 14:34   ` Borislav Petkov
2012-12-19  3:37     ` Yinghai Lu
2012-12-19 20:48       ` Borislav Petkov
2012-12-19 21:55         ` Yinghai Lu
2012-12-13 22:01 ` [PATCH v6 03/27] x86, boot: move verify_cpu.S and no_longmode after 0x200 Yinghai Lu
2012-12-15 17:06   ` Borislav Petkov
2012-12-19  3:44     ` Yinghai Lu
2012-12-19 20:57       ` Borislav Petkov
2012-12-19 21:58         ` Yinghai Lu
2012-12-19 22:04           ` Borislav Petkov
2012-12-22  2:24           ` Konrad Rzeszutek Wilk
2012-12-13 22:01 ` [PATCH v6 04/27] x86, boot: Move lldt/ltr out of 64bit code section Yinghai Lu
2012-12-15 17:28   ` Borislav Petkov
2012-12-19  3:53     ` Yinghai Lu
2012-12-13 22:01 ` [PATCH v6 05/27] x86, 64bit: clear ident mapping when kernel is above 512G Yinghai Lu
2012-12-16 17:49   ` Borislav Petkov
2012-12-16 18:04     ` Yinghai Lu
2012-12-19  3:57     ` Yinghai Lu
2012-12-13 22:02 ` Yinghai Lu [this message]
2012-12-13 22:02 ` [PATCH v6 07/27] x86: Merge early_reserve_initrd for 32bit and 64bit Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 08/27] x86: add get_ramdisk_image/size() Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 09/27] x86, boot: add get_cmd_line_ptr() Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 10/27] x86, boot: move checking of cmd_line_ptr out of common path Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 11/27] x86, boot: update cmd_line_ptr to unsigned long Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 12/27] x86: use io_remap to access real_mode_data Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 13/27] x86: use rsi/rdi to pass realmode_data pointer Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 14/27] x86, kexec: remove 1024G limitation for kexec buffer on 64bit Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 15/27] x86, kexec: set ident mapping for kernel that is above max_pfn Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 16/27] x86, kexec: Merge ident_mapping_init and init_level4_page Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 17/27] x86, kexec: only set ident mapping for ram Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 18/27] x86, boot: add fields to support load bzImage and ramdisk above 4G Yinghai Lu
2012-12-13 22:54   ` H. Peter Anvin
2012-12-13 23:28     ` Yinghai Lu
2012-12-13 23:38       ` H. Peter Anvin
2012-12-13 22:02 ` [PATCH v6 19/27] x86, boot: update comments about entries for 64bit image Yinghai Lu
2012-12-13 23:27   ` H. Peter Anvin
2012-12-14  0:13     ` Yinghai Lu
2012-12-14  0:38       ` H. Peter Anvin
2012-12-14  0:44         ` Yinghai Lu
2012-12-14  0:51           ` H. Peter Anvin
2012-12-14  0:51           ` Yinghai Lu
2012-12-14  0:54             ` H. Peter Anvin
2012-12-14  1:00               ` Yinghai Lu
2012-12-14  1:04                 ` H. Peter Anvin
2012-12-14  2:15     ` Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 20/27] x86, 64bit: Print init kernel lowmap correctly Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 21/27] x86, boot: Not need to check setup_header version Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 22/27] mm: Add alloc_bootmem_low_pages_nopanic() Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 23/27] x86: Don't panic if can not alloc buffer for swiotlb Yinghai Lu
2012-12-22  2:21   ` Konrad Rzeszutek Wilk
2012-12-13 22:02 ` [PATCH v6 24/27] x86: Add swiotlb force off support Yinghai Lu
2012-12-22  2:18   ` Konrad Rzeszutek Wilk
2012-12-22  5:00     ` Yinghai Lu
2012-12-23  5:00       ` H. Peter Anvin
2012-12-13 22:02 ` [PATCH v6 25/27] x86, kdump: remove crashkernel range find limit for 64bit Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 26/27] x86: add Crash kernel low reservation Yinghai Lu
2012-12-13 22:02 ` [PATCH v6 27/27] x86: Merge early kernel reserve for 32bit and 64bit Yinghai Lu
2012-12-13 23:47 ` [PATCH v6 00/27] x86, boot, 64bit: Add support for loading ramdisk and bzImage above 4G H. Peter Anvin
2012-12-14  0:00   ` Yinghai Lu
2012-12-21 22:38   ` Konrad Rzeszutek Wilk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1355436141-8668-7-git-send-email-yinghai@kernel.org \
    --to=yinghai@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=ebiederm@xmission.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).