linux-efi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]   ` <1379602494-26684-12-git-send-email-bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org>
@ 2013-09-21 11:39     ` Borislav Petkov
  2013-09-23  5:47       ` Dave Young
       [not found]       ` <20130921113929.GB1587-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 2 replies; 44+ messages in thread
From: Borislav Petkov @ 2013-09-21 11:39 UTC (permalink / raw)
  To: X86 ML
  Cc: LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal, Dave Young,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Thu, Sep 19, 2013 at 04:54:54PM +0200, Borislav Petkov wrote:
> From: Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org>
> 
> We map the EFI regions needed for runtime services contiguously on
> virtual addresses starting from -4G down for a total max space of 64G.
> This way, we provide for stable runtime services addresses across
> kernels so that a kexec'd kernel can still use them.
> 
> This way, they're mapped in a separate pagetable so that we don't
> pollute the kernel namespace (you can see how the whole ioremapping and
> saving and restoring of PGDs is gone now).

Ok, this one was not so good, let's try again:

This time I saved 32-bit and am switching the pagetable only after
having built it properly. This boots fine again on baremetal and on OVMF
with Matt's handover flags fix from yesterday.

Also, I've uploaded the whole series to
git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git, branch
efi-experimental

("-experimental" doesn't trigger Fengguang's robot :-))

Good luck! :-)

---
>From 880fcee20209a122eda846e7f109776ed1c56de5 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org>
Date: Wed, 18 Sep 2013 17:35:42 +0200
Subject: [PATCH] EFI: Runtime services virtual mapping

We map the EFI regions needed for runtime services contiguously on
virtual addresses starting from -4G down for a total max space of 64G.
This way, we provide for stable runtime services addresses across
kernels so that a kexec'd kernel can still use them.

This way, they're mapped in a separate pagetable so that we don't
pollute the kernel namespace (you can see how the whole ioremapping and
saving and restoring of PGDs is gone now).

Signed-off-by: Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org>
---
 arch/x86/include/asm/efi.h           | 43 ++++++++++--------
 arch/x86/include/asm/pgtable_types.h |  3 +-
 arch/x86/platform/efi/efi.c          | 68 ++++++++++++-----------------
 arch/x86/platform/efi/efi_32.c       | 29 +++++++++++-
 arch/x86/platform/efi/efi_64.c       | 85 +++++++++++++++++++-----------------
 arch/x86/platform/efi/efi_stub_64.S  | 53 ++++++++++++++++++++++
 6 files changed, 181 insertions(+), 100 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 0062a0125041..9a99e0499e4b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -69,24 +69,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
 	efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
 		  (u64)(a4), (u64)(a5), (u64)(a6))
 
+#define _efi_call_virtX(x, f, ...)					\
+({									\
+	efi_status_t __s;						\
+									\
+	efi_sync_low_kernel_mappings();					\
+	preempt_disable();						\
+	__s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__);	\
+	preempt_enable();						\
+	__s;								\
+})
+
 #define efi_call_virt0(f)				\
-	efi_call0((efi.systab->runtime->f))
-#define efi_call_virt1(f, a1)					\
-	efi_call1((efi.systab->runtime->f), (u64)(a1))
-#define efi_call_virt2(f, a1, a2)					\
-	efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
-#define efi_call_virt3(f, a1, a2, a3)					\
-	efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3))
-#define efi_call_virt4(f, a1, a2, a3, a4)				\
-	efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3), (u64)(a4))
-#define efi_call_virt5(f, a1, a2, a3, a4, a5)				\
-	efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3), (u64)(a4), (u64)(a5))
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)			\
-	efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+	_efi_call_virtX(0, f)
+#define efi_call_virt1(f, a1)				\
+	_efi_call_virtX(1, f, (u64)(a1))
+#define efi_call_virt2(f, a1, a2)			\
+	_efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
+#define efi_call_virt3(f, a1, a2, a3)			\
+	_efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
+#define efi_call_virt4(f, a1, a2, a3, a4)		\
+	_efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
+#define efi_call_virt5(f, a1, a2, a3, a4, a5)		\
+	_efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)	\
+	_efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
 
 extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 				 u32 type, u64 attribute);
@@ -101,6 +108,8 @@ extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
 extern void efi_unmap_memmap(void);
 extern void efi_memory_uc(u64 addr, unsigned long size);
+extern void __init efi_map_region(efi_memory_desc_t *md);
+extern void efi_sync_low_kernel_mappings(void);
 
 #ifdef CONFIG_EFI
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0ecac257fb26..a83aa44bb1fb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
 extern phys_addr_t slow_virt_to_phys(void *__address);
-
+extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+				   unsigned numpages, unsigned long page_flags);
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 538c1e6b7b2c..90459f5f587c 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -12,6 +12,8 @@
  *	Bibo Mao <bibo.mao-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  *	Chandramouli Narayanan <mouli-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
  *	Huang Ying <ying.huang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
+ * Copyright (C) 2013 SuSE Labs
+ * 	Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org> - runtime services VA mapping
  *
  * Copied from efi_32.c to eliminate the duplicated code between EFI
  * 32/64 support code. --ying 2007-10-26
@@ -81,6 +83,17 @@ static efi_system_table_t efi_systab __initdata;
 unsigned long x86_efi_facility;
 
 /*
+ * Scratch space used for switching the pagetable in the EFI stub
+ */
+struct efi_scratch {
+	u64 r15;
+	u64 prev_cr3;
+	pgd_t *efi_pgt;
+	bool use_pgd;
+};
+extern struct efi_scratch efi_scratch;
+
+/*
  * Returns 1 if 'facility' is enabled, 0 otherwise.
  */
 int efi_enabled(int facility)
@@ -797,22 +810,6 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
 		set_memory_nx(addr, npages);
 }
 
-static void __init runtime_code_page_mkexec(void)
-{
-	efi_memory_desc_t *md;
-	void *p;
-
-	/* Make EFI runtime service code area executable */
-	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-		md = p;
-
-		if (md->type != EFI_RUNTIME_SERVICES_CODE)
-			continue;
-
-		efi_set_executable(md, true);
-	}
-}
-
 /*
  * We can't ioremap data in EFI boot services RAM, because we've already mapped
  * it as RAM.  So, look it up in the existing EFI memory map instead.  Only
@@ -862,10 +859,10 @@ void efi_memory_uc(u64 addr, unsigned long size)
 void __init efi_enter_virtual_mode(void)
 {
 	efi_memory_desc_t *md, *prev_md = NULL;
-	efi_status_t status;
+	void *p, *new_memmap = NULL;
 	unsigned long size;
-	u64 end, systab, start_pfn, end_pfn;
-	void *p, *va, *new_memmap = NULL;
+	efi_status_t status;
+	u64 end, systab;
 	int count = 0;
 
 	efi.systab = NULL;
@@ -874,7 +871,6 @@ void __init efi_enter_virtual_mode(void)
 	 * We don't do virtual mode, since we don't do runtime services, on
 	 * non-native EFI
 	 */
-
 	if (!efi_is_native()) {
 		efi_unmap_memmap();
 		return;
@@ -914,33 +910,18 @@ void __init efi_enter_virtual_mode(void)
 		    md->type != EFI_BOOT_SERVICES_DATA)
 			continue;
 
+		efi_map_region(md);
+
 		size = md->num_pages << PAGE_SHIFT;
 		end = md->phys_addr + size;
 
-		start_pfn = PFN_DOWN(md->phys_addr);
-		end_pfn = PFN_UP(end);
-		if (pfn_range_is_mapped(start_pfn, end_pfn)) {
-			va = __va(md->phys_addr);
-
-			if (!(md->attribute & EFI_MEMORY_WB))
-				efi_memory_uc((u64)(unsigned long)va, size);
-		} else
-			va = efi_ioremap(md->phys_addr, size,
-					 md->type, md->attribute);
-
-		md->virt_addr = (u64) (unsigned long) va;
-
-		if (!va) {
-			pr_err("ioremap of 0x%llX failed!\n",
-			       (unsigned long long)md->phys_addr);
-			continue;
-		}
-
 		systab = (u64) (unsigned long) efi_phys.systab;
 		if (md->phys_addr <= systab && systab < end) {
 			systab += md->virt_addr - md->phys_addr;
+
 			efi.systab = (efi_system_table_t *) (unsigned long) systab;
 		}
+
 		new_memmap = krealloc(new_memmap,
 				      (count + 1) * memmap.desc_size,
 				      GFP_KERNEL);
@@ -949,8 +930,15 @@ void __init efi_enter_virtual_mode(void)
 		count++;
 	}
 
+#ifdef CONFIG_X86_64
+	efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
+	efi_scratch.use_pgd = true;
+#endif
+
 	BUG_ON(!efi.systab);
 
+	efi_sync_low_kernel_mappings();
+
 	status = phys_efi_set_virtual_address_map(
 		memmap.desc_size * count,
 		memmap.desc_size,
@@ -983,8 +971,6 @@ void __init efi_enter_virtual_mode(void)
 	efi.query_variable_info = virt_efi_query_variable_info;
 	efi.update_capsule = virt_efi_update_capsule;
 	efi.query_capsule_caps = virt_efi_query_capsule_caps;
-	if (__supported_pte_mask & _PAGE_NX)
-		runtime_code_page_mkexec();
 
 	kfree(new_memmap);
 
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e446941dd7..661663b08eaf 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -37,9 +37,36 @@
  * claim EFI runtime service handler exclusively and to duplicate a memory in
  * low memory space say 0 - 3G.
  */
-
 static unsigned long efi_rt_eflags;
 
+void efi_sync_low_kernel_mappings(void) {}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+	u64 start_pfn, end_pfn, end;
+	unsigned long size;
+	void *va;
+
+	start_pfn = PFN_DOWN(md->phys_addr);
+	size	  = md->num_pages << PAGE_SHIFT;
+	end	  = md->phys_addr + size;
+	end_pfn   = PFN_UP(end);
+
+	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
+		va = __va(md->phys_addr);
+
+		if (!(md->attribute & EFI_MEMORY_WB))
+			efi_memory_uc((u64)(unsigned long)va, size);
+	} else
+		va = efi_ioremap(md->phys_addr, size,
+				 md->type, md->attribute);
+
+	md->virt_addr = (u64) (unsigned long) va;
+	if (!va)
+		pr_err("ioremap of 0x%llX failed!\n",
+		       (unsigned long long)md->phys_addr);
+}
+
 void efi_call_phys_prelog(void)
 {
 	struct desc_ptr gdt_descr;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 39a0e7f1f0a3..db5230dd350e 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -39,59 +39,64 @@
 #include <asm/cacheflush.h>
 #include <asm/fixmap.h>
 
-static pgd_t *save_pgd __initdata;
-static unsigned long efi_flags __initdata;
+void __init efi_call_phys_prelog(void) {}
+void __init efi_call_phys_epilog(void) {}
 
-static void __init early_code_mapping_set_exec(int executable)
+/*
+ * Add low kernel mappings for passing arguments to EFI functions.
+ */
+void efi_sync_low_kernel_mappings(void)
 {
-	efi_memory_desc_t *md;
-	void *p;
+	unsigned num_pgds;
+	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
 
-	if (!(__supported_pte_mask & _PAGE_NX))
-		return;
+	num_pgds = pgd_index(VMALLOC_START - 1) - pgd_index(PAGE_OFFSET);
 
-	/* Make EFI service code area executable */
-	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-		md = p;
-		if (md->type == EFI_RUNTIME_SERVICES_CODE ||
-		    md->type == EFI_BOOT_SERVICES_CODE)
-			efi_set_executable(md, executable);
-	}
+	memcpy(pgd + pgd_index(PAGE_OFFSET),
+		init_mm.pgd + pgd_index(PAGE_OFFSET),
+		sizeof(pgd_t) * num_pgds);
 }
 
-void __init efi_call_phys_prelog(void)
+/*
+ * We allocate runtime services regions top-down, starting from -4G, i.e.
+ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
+ */
+static u64 efi_va	 = -4 * (1UL << 30);
+#define EFI_VA_END	 (-68 * (1UL << 30))
+
+static void __init __map_region(efi_memory_desc_t *md, u64 va)
 {
-	unsigned long vaddress;
-	int pgd;
-	int n_pgds;
+	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+	unsigned long pf = 0, size;
+	u64 end;
 
-	early_code_mapping_set_exec(1);
-	local_irq_save(efi_flags);
+	if (!(md->attribute & EFI_MEMORY_WB))
+		pf |= _PAGE_PCD;
 
-	n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
-	save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
+	size = md->num_pages << PAGE_SHIFT;
+	end  = va + size;
 
-	for (pgd = 0; pgd < n_pgds; pgd++) {
-		save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
-		vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
-		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
-	}
-	__flush_tlb_all();
+	if(kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
+		pr_warning("Error mapping PA 0x%llx -> VA 0x%llx!\n",
+			   md->phys_addr, va);
 }
 
-void __init efi_call_phys_epilog(void)
+void __init efi_map_region(efi_memory_desc_t *md)
 {
-	/*
-	 * After the lock is released, the original page table is restored.
-	 */
-	int pgd;
-	int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
-	for (pgd = 0; pgd < n_pgds; pgd++)
-		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
-	kfree(save_pgd);
-	__flush_tlb_all();
-	local_irq_restore(efi_flags);
-	early_code_mapping_set_exec(0);
+	unsigned long size = md->num_pages << PAGE_SHIFT;
+
+	efi_va -= size;
+	if (efi_va < EFI_VA_END) {
+		pr_warning(FW_WARN "VA address range overflow!\n");
+		return;
+	}
+
+	/* Do the 1:1 map */
+	__map_region(md, md->phys_addr);
+
+	/* Do the VA map */
+	__map_region(md, efi_va);
+	md->virt_addr = efi_va;
 }
 
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146..2bb9714bf713 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -34,10 +34,47 @@
 	mov %rsi, %cr0;			\
 	mov (%rsp), %rsp
 
+	/* stolen from gcc */
+	.macro FLUSH_TLB_ALL
+	movq %r15, efi_scratch(%rip)
+	movq %r14, efi_scratch+8(%rip)
+	movq %cr4, %r15
+	movq %r15, %r14
+	andb $0x7f, %r14b
+	movq %r14, %cr4
+	movq %r15, %cr4
+	movq efi_scratch+8(%rip), %r14
+	movq efi_scratch(%rip), %r15
+	.endm
+
+	.macro SWITCH_PGT
+	cmpb $0, efi_scratch+24(%rip)
+	je 1f
+	movq %r15, efi_scratch(%rip)		# r15
+	# save previous CR3
+	movq %cr3, %r15
+	movq %r15, efi_scratch+8(%rip)		# prev_cr3
+	movq efi_scratch+16(%rip), %r15		# EFI pgt
+	movq %r15, %cr3
+	1:
+	.endm
+
+	.macro RESTORE_PGT
+	cmpb $0, efi_scratch+24(%rip)
+	je 2f
+	movq efi_scratch+8(%rip), %r15
+	movq %r15, %cr3
+	movq efi_scratch(%rip), %r15
+	FLUSH_TLB_ALL
+	2:
+	.endm
+
 ENTRY(efi_call0)
 	SAVE_XMM
 	subq $32, %rsp
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -47,7 +84,9 @@ ENTRY(efi_call1)
 	SAVE_XMM
 	subq $32, %rsp
 	mov  %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -57,7 +96,9 @@ ENTRY(efi_call2)
 	SAVE_XMM
 	subq $32, %rsp
 	mov  %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -68,7 +109,9 @@ ENTRY(efi_call3)
 	subq $32, %rsp
 	mov  %rcx, %r8
 	mov  %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -80,7 +123,9 @@ ENTRY(efi_call4)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -93,7 +138,9 @@ ENTRY(efi_call5)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $48, %rsp
 	RESTORE_XMM
 	ret
@@ -109,8 +156,14 @@ ENTRY(efi_call6)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $48, %rsp
 	RESTORE_XMM
 	ret
 ENDPROC(efi_call6)
+
+	.data
+ENTRY(efi_scratch)
+	.fill 3,8,0
-- 
1.8.4

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]       ` <20130921113929.GB1587-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-09-22 12:35         ` Dave Young
       [not found]           ` <20130922123515.GA7476-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2013-09-23  8:45         ` Borislav Petkov
  2013-09-25  9:24         ` Borislav Petkov
  2 siblings, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-22 12:35 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/21/13 at 01:39pm, Borislav Petkov wrote:
> On Thu, Sep 19, 2013 at 04:54:54PM +0200, Borislav Petkov wrote:
> > From: Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org>
> > 
> > We map the EFI regions needed for runtime services contiguously on
> > virtual addresses starting from -4G down for a total max space of 64G.
> > This way, we provide for stable runtime services addresses across
> > kernels so that a kexec'd kernel can still use them.
> > 
> > This way, they're mapped in a separate pagetable so that we don't
> > pollute the kernel namespace (you can see how the whole ioremapping and
> > saving and restoring of PGDs is gone now).
> 
> Ok, this one was not so good, let's try again:
> 
> This time I saved 32-bit and am switching the pagetable only after
> having built it properly. This boots fine again on baremetal and on OVMF
> with Matt's handover flags fix from yesterday.
> 
> Also, I've uploaded the whole series to
> git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git, branch
> efi-experimental

I tested your new patch, it works both with efi stub and grub boot in 1st kernel.

But it paniced in kexec boot with my kexec related patcheset, the patchset
contains 3 patch:
1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
2. export physical addr fw_vendor, runtime, tables to /sys/firmware/efi/systab
3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams; Also do not
   call SetVirtualAddressMao in case kexecboot.

The panic happens at the last line of efi_init:
        /* clean DUMMY object */
        efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
                         EFI_VARIABLE_NON_VOLATILE |
                         EFI_VARIABLE_BOOTSERVICE_ACCESS |
                         EFI_VARIABLE_RUNTIME_ACCESS,
                         0, NULL);

Below is the dmesg:
[    0.003359] pid_max: default: 32768 minimum: 301
[    0.004792] BUG: unable to handle kernel paging request at fffffffefde97e70
[    0.006666] IP: [<ffffffff8103a1db>] virt_efi_set_variable+0x40/0x54
[    0.006666] PGD 36981067 PUD 35828063 PMD 0 
[    0.006666] Oops: 0000 [#1] SMP 
[    0.006666] Modules linked in:
[    0.006666] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.12.0-rc1+ #10
[    0.006666] task: ffffffff81985490 ti: ffffffff81964000 task.ti: ffffffff81964000
[    0.006666] RIP: 0010:[<ffffffff8103a1db>]  [<ffffffff8103a1db>] virt_efi_set_variable+0x40/0x54
[    0.006666] RSP: 0000:ffffffff81965ee0  EFLAGS: 00010246
[    0.006666] RAX: fffffffefde97e18 RBX: ffffffff81999300 RCX: 0000000000000007
[    0.006666] RDX: ffffffff81965f20 RSI: ffffffff81999300 RDI: ffff88000009cc88
[    0.006666] RBP: ffffffff81965f08 R08: 0000000000000000 R09: 0000000000000000
[    0.006666] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff81965f20
[    0.006666] R13: 0000000000000000 R14: 0000000000000007 R15: 0000000000000000
[    0.006666] FS:  0000000000000000(0000) GS:ffff880035c00000(0000) knlGS:0000000000000000
[    0.006666] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.006666] CR2: fffffffefde97e70 CR3: 0000000036980000 CR4: 00000000000006b0
[    0.006666] Stack:
[    0.006666]  0000000000000000 ffffffffff475630 ffff88003582c000 000000000000000f
[    0.006666]  000000000000000f ffffffff81965f60 ffffffff81a98a7c ffffffff81b1a900
[    0.006666]  47ddbe4b4424ac57 a9929ff050ed979e 0000000000000000 a02eda84fa6789b9
[    0.006666] Call Trace:
[    0.006666]  [<ffffffff81a98a7c>] efi_enter_virtual_mode+0x30a/0x32c
[    0.006666]  [<ffffffff81a83ccf>] start_kernel+0x349/0x3da
[    0.006666]  [<ffffffff81a83794>] ? repair_env_string+0x58/0x58
[    0.006666]  [<ffffffff81a83120>] ? early_idt_handlers+0x120/0x120
[    0.006666]  [<ffffffff81a83498>] x86_64_start_reservations+0x2a/0x2c
[    0.006666]  [<ffffffff81a83590>] x86_64_start_kernel+0xf6/0xff
[    0.006666] Code: 55 49 89 cd 4c 89 45 d8 e8 db 05 00 00 48 8b 05 2c 6a a2 00 4c 8b 45 d8 44 89 f1 4c 89 e2 48 89 de 48 8b 40 58 4d 89 c1 4d 89 e8 <48> 8b 78 58 e8 7c 0a 00 00 41 5e 5b 41 5c 41 5d 41 5e 5d c3 55 
[    0.006666] RIP  [<ffffffff8103a1db>] virt_efi_set_variable+0x40/0x54
[    0.006666]  RSP <ffffffff81965ee0>
[    0.006666] CR2: fffffffefde97e70
[    0.006666] ---[ end trace e9fbc5020b26135e ]---
[    0.006666] Kernel panic - not syncing: Attempted to kill the idle task!
[    0.006666] Rebooting in 10 seconds..


> 
> ("-experimental" doesn't trigger Fengguang's robot :-))
> 
> Good luck! :-)
> 
> ---
> From 880fcee20209a122eda846e7f109776ed1c56de5 Mon Sep 17 00:00:00 2001
> From: Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org>
> Date: Wed, 18 Sep 2013 17:35:42 +0200
> Subject: [PATCH] EFI: Runtime services virtual mapping
> 
> We map the EFI regions needed for runtime services contiguously on
> virtual addresses starting from -4G down for a total max space of 64G.
> This way, we provide for stable runtime services addresses across
> kernels so that a kexec'd kernel can still use them.
> 
> This way, they're mapped in a separate pagetable so that we don't
> pollute the kernel namespace (you can see how the whole ioremapping and
> saving and restoring of PGDs is gone now).
> 
> Signed-off-by: Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org>
> ---
>  arch/x86/include/asm/efi.h           | 43 ++++++++++--------
>  arch/x86/include/asm/pgtable_types.h |  3 +-
>  arch/x86/platform/efi/efi.c          | 68 ++++++++++++-----------------
>  arch/x86/platform/efi/efi_32.c       | 29 +++++++++++-
>  arch/x86/platform/efi/efi_64.c       | 85 +++++++++++++++++++-----------------
>  arch/x86/platform/efi/efi_stub_64.S  | 53 ++++++++++++++++++++++
>  6 files changed, 181 insertions(+), 100 deletions(-)
> 
> diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
> index 0062a0125041..9a99e0499e4b 100644
> --- a/arch/x86/include/asm/efi.h
> +++ b/arch/x86/include/asm/efi.h
> @@ -69,24 +69,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
>  	efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
>  		  (u64)(a4), (u64)(a5), (u64)(a6))
>  
> +#define _efi_call_virtX(x, f, ...)					\
> +({									\
> +	efi_status_t __s;						\
> +									\
> +	efi_sync_low_kernel_mappings();					\
> +	preempt_disable();						\
> +	__s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__);	\
> +	preempt_enable();						\
> +	__s;								\
> +})
> +
>  #define efi_call_virt0(f)				\
> -	efi_call0((efi.systab->runtime->f))
> -#define efi_call_virt1(f, a1)					\
> -	efi_call1((efi.systab->runtime->f), (u64)(a1))
> -#define efi_call_virt2(f, a1, a2)					\
> -	efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
> -#define efi_call_virt3(f, a1, a2, a3)					\
> -	efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3))
> -#define efi_call_virt4(f, a1, a2, a3, a4)				\
> -	efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3), (u64)(a4))
> -#define efi_call_virt5(f, a1, a2, a3, a4, a5)				\
> -	efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3), (u64)(a4), (u64)(a5))
> -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)			\
> -	efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
> +	_efi_call_virtX(0, f)
> +#define efi_call_virt1(f, a1)				\
> +	_efi_call_virtX(1, f, (u64)(a1))
> +#define efi_call_virt2(f, a1, a2)			\
> +	_efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
> +#define efi_call_virt3(f, a1, a2, a3)			\
> +	_efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
> +#define efi_call_virt4(f, a1, a2, a3, a4)		\
> +	_efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
> +#define efi_call_virt5(f, a1, a2, a3, a4, a5)		\
> +	_efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
> +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)	\
> +	_efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
>  
>  extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
>  				 u32 type, u64 attribute);
> @@ -101,6 +108,8 @@ extern void efi_call_phys_prelog(void);
>  extern void efi_call_phys_epilog(void);
>  extern void efi_unmap_memmap(void);
>  extern void efi_memory_uc(u64 addr, unsigned long size);
> +extern void __init efi_map_region(efi_memory_desc_t *md);
> +extern void efi_sync_low_kernel_mappings(void);
>  
>  #ifdef CONFIG_EFI
>  
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 0ecac257fb26..a83aa44bb1fb 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
>  extern phys_addr_t slow_virt_to_phys(void *__address);
> -
> +extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
> +				   unsigned numpages, unsigned long page_flags);
>  #endif	/* !__ASSEMBLY__ */
>  
>  #endif /* _ASM_X86_PGTABLE_DEFS_H */
> diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
> index 538c1e6b7b2c..90459f5f587c 100644
> --- a/arch/x86/platform/efi/efi.c
> +++ b/arch/x86/platform/efi/efi.c
> @@ -12,6 +12,8 @@
>   *	Bibo Mao <bibo.mao-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>   *	Chandramouli Narayanan <mouli-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
>   *	Huang Ying <ying.huang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> + * Copyright (C) 2013 SuSE Labs
> + * 	Borislav Petkov <bp-l3A5Bk7waGM@public.gmane.org> - runtime services VA mapping
>   *
>   * Copied from efi_32.c to eliminate the duplicated code between EFI
>   * 32/64 support code. --ying 2007-10-26
> @@ -81,6 +83,17 @@ static efi_system_table_t efi_systab __initdata;
>  unsigned long x86_efi_facility;
>  
>  /*
> + * Scratch space used for switching the pagetable in the EFI stub
> + */
> +struct efi_scratch {
> +	u64 r15;
> +	u64 prev_cr3;
> +	pgd_t *efi_pgt;
> +	bool use_pgd;
> +};
> +extern struct efi_scratch efi_scratch;
> +
> +/*
>   * Returns 1 if 'facility' is enabled, 0 otherwise.
>   */
>  int efi_enabled(int facility)
> @@ -797,22 +810,6 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
>  		set_memory_nx(addr, npages);
>  }
>  
> -static void __init runtime_code_page_mkexec(void)
> -{
> -	efi_memory_desc_t *md;
> -	void *p;
> -
> -	/* Make EFI runtime service code area executable */
> -	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
> -		md = p;
> -
> -		if (md->type != EFI_RUNTIME_SERVICES_CODE)
> -			continue;
> -
> -		efi_set_executable(md, true);
> -	}
> -}
> -
>  /*
>   * We can't ioremap data in EFI boot services RAM, because we've already mapped
>   * it as RAM.  So, look it up in the existing EFI memory map instead.  Only
> @@ -862,10 +859,10 @@ void efi_memory_uc(u64 addr, unsigned long size)
>  void __init efi_enter_virtual_mode(void)
>  {
>  	efi_memory_desc_t *md, *prev_md = NULL;
> -	efi_status_t status;
> +	void *p, *new_memmap = NULL;
>  	unsigned long size;
> -	u64 end, systab, start_pfn, end_pfn;
> -	void *p, *va, *new_memmap = NULL;
> +	efi_status_t status;
> +	u64 end, systab;
>  	int count = 0;
>  
>  	efi.systab = NULL;
> @@ -874,7 +871,6 @@ void __init efi_enter_virtual_mode(void)
>  	 * We don't do virtual mode, since we don't do runtime services, on
>  	 * non-native EFI
>  	 */
> -
>  	if (!efi_is_native()) {
>  		efi_unmap_memmap();
>  		return;
> @@ -914,33 +910,18 @@ void __init efi_enter_virtual_mode(void)
>  		    md->type != EFI_BOOT_SERVICES_DATA)
>  			continue;
>  
> +		efi_map_region(md);
> +
>  		size = md->num_pages << PAGE_SHIFT;
>  		end = md->phys_addr + size;
>  
> -		start_pfn = PFN_DOWN(md->phys_addr);
> -		end_pfn = PFN_UP(end);
> -		if (pfn_range_is_mapped(start_pfn, end_pfn)) {
> -			va = __va(md->phys_addr);
> -
> -			if (!(md->attribute & EFI_MEMORY_WB))
> -				efi_memory_uc((u64)(unsigned long)va, size);
> -		} else
> -			va = efi_ioremap(md->phys_addr, size,
> -					 md->type, md->attribute);
> -
> -		md->virt_addr = (u64) (unsigned long) va;
> -
> -		if (!va) {
> -			pr_err("ioremap of 0x%llX failed!\n",
> -			       (unsigned long long)md->phys_addr);
> -			continue;
> -		}
> -
>  		systab = (u64) (unsigned long) efi_phys.systab;
>  		if (md->phys_addr <= systab && systab < end) {
>  			systab += md->virt_addr - md->phys_addr;
> +
>  			efi.systab = (efi_system_table_t *) (unsigned long) systab;
>  		}
> +
>  		new_memmap = krealloc(new_memmap,
>  				      (count + 1) * memmap.desc_size,
>  				      GFP_KERNEL);
> @@ -949,8 +930,15 @@ void __init efi_enter_virtual_mode(void)
>  		count++;
>  	}
>  
> +#ifdef CONFIG_X86_64
> +	efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
> +	efi_scratch.use_pgd = true;
> +#endif
> +
>  	BUG_ON(!efi.systab);
>  
> +	efi_sync_low_kernel_mappings();
> +
>  	status = phys_efi_set_virtual_address_map(
>  		memmap.desc_size * count,
>  		memmap.desc_size,
> @@ -983,8 +971,6 @@ void __init efi_enter_virtual_mode(void)
>  	efi.query_variable_info = virt_efi_query_variable_info;
>  	efi.update_capsule = virt_efi_update_capsule;
>  	efi.query_capsule_caps = virt_efi_query_capsule_caps;
> -	if (__supported_pte_mask & _PAGE_NX)
> -		runtime_code_page_mkexec();
>  
>  	kfree(new_memmap);
>  
> diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
> index 40e446941dd7..661663b08eaf 100644
> --- a/arch/x86/platform/efi/efi_32.c
> +++ b/arch/x86/platform/efi/efi_32.c
> @@ -37,9 +37,36 @@
>   * claim EFI runtime service handler exclusively and to duplicate a memory in
>   * low memory space say 0 - 3G.
>   */
> -
>  static unsigned long efi_rt_eflags;
>  
> +void efi_sync_low_kernel_mappings(void) {}
> +
> +void __init efi_map_region(efi_memory_desc_t *md)
> +{
> +	u64 start_pfn, end_pfn, end;
> +	unsigned long size;
> +	void *va;
> +
> +	start_pfn = PFN_DOWN(md->phys_addr);
> +	size	  = md->num_pages << PAGE_SHIFT;
> +	end	  = md->phys_addr + size;
> +	end_pfn   = PFN_UP(end);
> +
> +	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
> +		va = __va(md->phys_addr);
> +
> +		if (!(md->attribute & EFI_MEMORY_WB))
> +			efi_memory_uc((u64)(unsigned long)va, size);
> +	} else
> +		va = efi_ioremap(md->phys_addr, size,
> +				 md->type, md->attribute);
> +
> +	md->virt_addr = (u64) (unsigned long) va;
> +	if (!va)
> +		pr_err("ioremap of 0x%llX failed!\n",
> +		       (unsigned long long)md->phys_addr);
> +}
> +
>  void efi_call_phys_prelog(void)
>  {
>  	struct desc_ptr gdt_descr;
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index 39a0e7f1f0a3..db5230dd350e 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -39,59 +39,64 @@
>  #include <asm/cacheflush.h>
>  #include <asm/fixmap.h>
>  
> -static pgd_t *save_pgd __initdata;
> -static unsigned long efi_flags __initdata;
> +void __init efi_call_phys_prelog(void) {}
> +void __init efi_call_phys_epilog(void) {}
>  
> -static void __init early_code_mapping_set_exec(int executable)
> +/*
> + * Add low kernel mappings for passing arguments to EFI functions.
> + */
> +void efi_sync_low_kernel_mappings(void)
>  {
> -	efi_memory_desc_t *md;
> -	void *p;
> +	unsigned num_pgds;
> +	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
>  
> -	if (!(__supported_pte_mask & _PAGE_NX))
> -		return;
> +	num_pgds = pgd_index(VMALLOC_START - 1) - pgd_index(PAGE_OFFSET);
>  
> -	/* Make EFI service code area executable */
> -	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
> -		md = p;
> -		if (md->type == EFI_RUNTIME_SERVICES_CODE ||
> -		    md->type == EFI_BOOT_SERVICES_CODE)
> -			efi_set_executable(md, executable);
> -	}
> +	memcpy(pgd + pgd_index(PAGE_OFFSET),
> +		init_mm.pgd + pgd_index(PAGE_OFFSET),
> +		sizeof(pgd_t) * num_pgds);
>  }
>  
> -void __init efi_call_phys_prelog(void)
> +/*
> + * We allocate runtime services regions top-down, starting from -4G, i.e.
> + * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
> + */
> +static u64 efi_va	 = -4 * (1UL << 30);
> +#define EFI_VA_END	 (-68 * (1UL << 30))
> +
> +static void __init __map_region(efi_memory_desc_t *md, u64 va)
>  {
> -	unsigned long vaddress;
> -	int pgd;
> -	int n_pgds;
> +	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
> +	unsigned long pf = 0, size;
> +	u64 end;
>  
> -	early_code_mapping_set_exec(1);
> -	local_irq_save(efi_flags);
> +	if (!(md->attribute & EFI_MEMORY_WB))
> +		pf |= _PAGE_PCD;
>  
> -	n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
> -	save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
> +	size = md->num_pages << PAGE_SHIFT;
> +	end  = va + size;
>  
> -	for (pgd = 0; pgd < n_pgds; pgd++) {
> -		save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
> -		vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
> -		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
> -	}
> -	__flush_tlb_all();
> +	if(kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
> +		pr_warning("Error mapping PA 0x%llx -> VA 0x%llx!\n",
> +			   md->phys_addr, va);
>  }
>  
> -void __init efi_call_phys_epilog(void)
> +void __init efi_map_region(efi_memory_desc_t *md)
>  {
> -	/*
> -	 * After the lock is released, the original page table is restored.
> -	 */
> -	int pgd;
> -	int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
> -	for (pgd = 0; pgd < n_pgds; pgd++)
> -		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
> -	kfree(save_pgd);
> -	__flush_tlb_all();
> -	local_irq_restore(efi_flags);
> -	early_code_mapping_set_exec(0);
> +	unsigned long size = md->num_pages << PAGE_SHIFT;
> +
> +	efi_va -= size;
> +	if (efi_va < EFI_VA_END) {
> +		pr_warning(FW_WARN "VA address range overflow!\n");
> +		return;
> +	}
> +
> +	/* Do the 1:1 map */
> +	__map_region(md, md->phys_addr);
> +
> +	/* Do the VA map */
> +	__map_region(md, efi_va);
> +	md->virt_addr = efi_va;
>  }
>  
>  void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
> diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
> index 4c07ccab8146..2bb9714bf713 100644
> --- a/arch/x86/platform/efi/efi_stub_64.S
> +++ b/arch/x86/platform/efi/efi_stub_64.S
> @@ -34,10 +34,47 @@
>  	mov %rsi, %cr0;			\
>  	mov (%rsp), %rsp
>  
> +	/* stolen from gcc */
> +	.macro FLUSH_TLB_ALL
> +	movq %r15, efi_scratch(%rip)
> +	movq %r14, efi_scratch+8(%rip)
> +	movq %cr4, %r15
> +	movq %r15, %r14
> +	andb $0x7f, %r14b
> +	movq %r14, %cr4
> +	movq %r15, %cr4
> +	movq efi_scratch+8(%rip), %r14
> +	movq efi_scratch(%rip), %r15
> +	.endm
> +
> +	.macro SWITCH_PGT
> +	cmpb $0, efi_scratch+24(%rip)
> +	je 1f
> +	movq %r15, efi_scratch(%rip)		# r15
> +	# save previous CR3
> +	movq %cr3, %r15
> +	movq %r15, efi_scratch+8(%rip)		# prev_cr3
> +	movq efi_scratch+16(%rip), %r15		# EFI pgt
> +	movq %r15, %cr3
> +	1:
> +	.endm
> +
> +	.macro RESTORE_PGT
> +	cmpb $0, efi_scratch+24(%rip)
> +	je 2f
> +	movq efi_scratch+8(%rip), %r15
> +	movq %r15, %cr3
> +	movq efi_scratch(%rip), %r15
> +	FLUSH_TLB_ALL
> +	2:
> +	.endm
> +
>  ENTRY(efi_call0)
>  	SAVE_XMM
>  	subq $32, %rsp
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -47,7 +84,9 @@ ENTRY(efi_call1)
>  	SAVE_XMM
>  	subq $32, %rsp
>  	mov  %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -57,7 +96,9 @@ ENTRY(efi_call2)
>  	SAVE_XMM
>  	subq $32, %rsp
>  	mov  %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -68,7 +109,9 @@ ENTRY(efi_call3)
>  	subq $32, %rsp
>  	mov  %rcx, %r8
>  	mov  %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -80,7 +123,9 @@ ENTRY(efi_call4)
>  	mov %r8, %r9
>  	mov %rcx, %r8
>  	mov %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -93,7 +138,9 @@ ENTRY(efi_call5)
>  	mov %r8, %r9
>  	mov %rcx, %r8
>  	mov %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $48, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -109,8 +156,14 @@ ENTRY(efi_call6)
>  	mov %r8, %r9
>  	mov %rcx, %r8
>  	mov %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $48, %rsp
>  	RESTORE_XMM
>  	ret
>  ENDPROC(efi_call6)
> +
> +	.data
> +ENTRY(efi_scratch)
> +	.fill 3,8,0
> -- 
> 1.8.4
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]           ` <20130922123515.GA7476-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-22 13:37             ` Borislav Petkov
       [not found]               ` <20130922133722.GC28718-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-22 13:37 UTC (permalink / raw)
  To: Dave Young
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> I tested your new patch, it works both with efi stub and grub boot in
> 1st kernel.

Good, thanks!

> But it paniced in kexec boot with my kexec related patcheset, the patchset

That's the second kernel, right?

> contains 3 patch:
> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> 2. export physical addr fw_vendor, runtime, tables to /sys/firmware/efi/systab
> 3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams; Also do not
>    call SetVirtualAddressMao in case kexecboot.
> 
> The panic happens at the last line of efi_init:
>         /* clean DUMMY object */
>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
>                          EFI_VARIABLE_NON_VOLATILE |
>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
>                          EFI_VARIABLE_RUNTIME_ACCESS,
>                          0, NULL);
> 
> Below is the dmesg:
> [    0.003359] pid_max: default: 32768 minimum: 301
> [    0.004792] BUG: unable to handle kernel paging request at fffffffefde97e70
> [    0.006666] IP: [<ffffffff8103a1db>] virt_efi_set_variable+0x40/0x54
> [    0.006666] PGD 36981067 PUD 35828063 PMD 0

Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is 0.

Ok, can you upload your patches somewhere and tell me exactly how to
reproduce this so that I can take a look too?

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]               ` <20130922133722.GC28718-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-09-22 14:00                 ` Dave Young
       [not found]                   ` <20130922140038.GB7476-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2013-09-22 15:27                 ` H. Peter Anvin
  1 sibling, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-22 14:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/22/13 at 03:37pm, Borislav Petkov wrote:
> On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> > I tested your new patch, it works both with efi stub and grub boot in
> > 1st kernel.
> 
> Good, thanks!
> 
> > But it paniced in kexec boot with my kexec related patcheset, the patchset
> 
> That's the second kernel, right?

Yes, it's 2nd kernel.

> 
> > contains 3 patch:
> > 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> > 2. export physical addr fw_vendor, runtime, tables to /sys/firmware/efi/systab
> > 3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams; Also do not
> >    call SetVirtualAddressMao in case kexecboot.
> > 
> > The panic happens at the last line of efi_init:
> >         /* clean DUMMY object */
> >         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> >                          EFI_VARIABLE_NON_VOLATILE |
> >                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> >                          EFI_VARIABLE_RUNTIME_ACCESS,
> >                          0, NULL);
> > 
> > Below is the dmesg:
> > [    0.003359] pid_max: default: 32768 minimum: 301
> > [    0.004792] BUG: unable to handle kernel paging request at fffffffefde97e70
> > [    0.006666] IP: [<ffffffff8103a1db>] virt_efi_set_variable+0x40/0x54
> > [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> 
> Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is 0.
> 
> Ok, can you upload your patches somewhere and tell me exactly how to
> reproduce this so that I can take a look too?

Ok, will put somewhere after a little cleanup today.

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                   ` <20130922140038.GB7476-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-22 14:31                     ` Dave Young
  0 siblings, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-22 14:31 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/22/13 at 10:00pm, Dave Young wrote:
> On 09/22/13 at 03:37pm, Borislav Petkov wrote:
> > On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> > > I tested your new patch, it works both with efi stub and grub boot in
> > > 1st kernel.
> > 
> > Good, thanks!
> > 
> > > But it paniced in kexec boot with my kexec related patcheset, the patchset
> > 
> > That's the second kernel, right?
> 
> Yes, it's 2nd kernel.
> 
> > 
> > > contains 3 patch:
> > > 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> > > 2. export physical addr fw_vendor, runtime, tables to /sys/firmware/efi/systab
> > > 3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams; Also do not
> > >    call SetVirtualAddressMao in case kexecboot.
> > > 
> > > The panic happens at the last line of efi_init:
> > >         /* clean DUMMY object */
> > >         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> > >                          EFI_VARIABLE_NON_VOLATILE |
> > >                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> > >                          EFI_VARIABLE_RUNTIME_ACCESS,
> > >                          0, NULL);
> > > 
> > > Below is the dmesg:
> > > [    0.003359] pid_max: default: 32768 minimum: 301
> > > [    0.004792] BUG: unable to handle kernel paging request at fffffffefde97e70
> > > [    0.006666] IP: [<ffffffff8103a1db>] virt_efi_set_variable+0x40/0x54
> > > [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> > 
> > Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is 0.
> > 
> > Ok, can you upload your patches somewhere and tell me exactly how to
> > reproduce this so that I can take a look too?
> 
> Ok, will put somewhere after a little cleanup today.

Here it is:
https://people.redhat.com/ruyang/kexec-efi/for-bp/

userspace patches are also necessary, they are under kexec-tools-patches/

Just test with below steps:
kexec -l /boot/vmlinuz-3.12.0-rc1+ --reuse-cmdline --append "kexecboot=1"
kexec -e

> 
> > 
> > Thanks.
> > 
> > -- 
> > Regards/Gruss,
> >     Boris.
> > 
> > Sent from a fat crate under my desk. Formatting is fine.
> > --

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]               ` <20130922133722.GC28718-fF5Pk5pvG8Y@public.gmane.org>
  2013-09-22 14:00                 ` Dave Young
@ 2013-09-22 15:27                 ` H. Peter Anvin
       [not found]                   ` <1ba7eca6-419c-4181-9927-9ba0927a6abf-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
  1 sibling, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-09-22 15:27 UTC (permalink / raw)
  To: Borislav Petkov, Dave Young
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	James Bottomley, Vivek Goyal, linux-efi-u79uwXL29TY76Z2rM5mHXA

The address that faults is interesting in that it is indeed just below -4G.  The question at hand is probably what information you are using to build the EFI mappings in the secondary kernel and what could make it not match the primary.

Assuming it isn't as simple as the mappings never get built at all.


Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
>On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
>> I tested your new patch, it works both with efi stub and grub boot in
>> 1st kernel.
>
>Good, thanks!
>
>> But it paniced in kexec boot with my kexec related patcheset, the
>patchset
>
>That's the second kernel, right?
>
>> contains 3 patch:
>> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
>> 2. export physical addr fw_vendor, runtime, tables to
>/sys/firmware/efi/systab
>> 3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams;
>Also do not
>>    call SetVirtualAddressMao in case kexecboot.
>> 
>> The panic happens at the last line of efi_init:
>>         /* clean DUMMY object */
>>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
>>                          EFI_VARIABLE_NON_VOLATILE |
>>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
>>                          EFI_VARIABLE_RUNTIME_ACCESS,
>>                          0, NULL);
>> 
>> Below is the dmesg:
>> [    0.003359] pid_max: default: 32768 minimum: 301
>> [    0.004792] BUG: unable to handle kernel paging request at
>fffffffefde97e70
>> [    0.006666] IP: [<ffffffff8103a1db>]
>virt_efi_set_variable+0x40/0x54
>> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
>
>Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is 0.
>
>Ok, can you upload your patches somewhere and tell me exactly how to
>reproduce this so that I can take a look too?
>
>Thanks.

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                   ` <1ba7eca6-419c-4181-9927-9ba0927a6abf-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
@ 2013-09-22 16:38                     ` Borislav Petkov
  2013-09-23  5:45                     ` Dave Young
  2013-09-24  2:52                     ` Dave Young
  2 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2013-09-22 16:38 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Sun, Sep 22, 2013 at 08:27:34AM -0700, H. Peter Anvin wrote:a
> The address that faults is interesting in that it is indeed just below
> -4G. The question at hand is probably what information you are using
> to build the EFI mappings in the secondary kernel and what could make
> it not match the primary.

Yep, so obviously we're not building the pagetable in the second kernel
the same way as the first or we're missing some pieces.

Btw, for debugging situations like this one, one could use
arch/x86/mm/dump_pagetables.c successfully by sticking in the right CR3
value into *start.

:-)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                   ` <1ba7eca6-419c-4181-9927-9ba0927a6abf-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
  2013-09-22 16:38                     ` Borislav Petkov
@ 2013-09-23  5:45                     ` Dave Young
  2013-09-24  2:52                     ` Dave Young
  2 siblings, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-23  5:45 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/22/13 at 08:27am, H. Peter Anvin wrote:
> The address that faults is interesting in that it is indeed just below -4G.  The question at hand is probably what information you are using to build the EFI mappings in the secondary kernel and what could make it not match the primary.
> 
> Assuming it isn't as simple as the mappings never get built at all.

At least the efi_info is same between two kernels.
Will print some debug info to see if I can find something.

> 
> 
> Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
> >On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> >> I tested your new patch, it works both with efi stub and grub boot in
> >> 1st kernel.
> >
> >Good, thanks!
> >
> >> But it paniced in kexec boot with my kexec related patcheset, the
> >patchset
> >
> >That's the second kernel, right?
> >
> >> contains 3 patch:
> >> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> >> 2. export physical addr fw_vendor, runtime, tables to
> >/sys/firmware/efi/systab
> >> 3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams;
> >Also do not
> >>    call SetVirtualAddressMao in case kexecboot.
> >> 
> >> The panic happens at the last line of efi_init:
> >>         /* clean DUMMY object */
> >>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> >>                          EFI_VARIABLE_NON_VOLATILE |
> >>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> >>                          EFI_VARIABLE_RUNTIME_ACCESS,
> >>                          0, NULL);
> >> 
> >> Below is the dmesg:
> >> [    0.003359] pid_max: default: 32768 minimum: 301
> >> [    0.004792] BUG: unable to handle kernel paging request at
> >fffffffefde97e70
> >> [    0.006666] IP: [<ffffffff8103a1db>]
> >virt_efi_set_variable+0x40/0x54
> >> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> >
> >Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is 0.
> >
> >Ok, can you upload your patches somewhere and tell me exactly how to
> >reproduce this so that I can take a look too?
> >
> >Thanks.
> 
> -- 
> Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
  2013-09-21 11:39     ` [PATCH -v2] " Borislav Petkov
@ 2013-09-23  5:47       ` Dave Young
       [not found]         ` <20130923054741.GC7007-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
       [not found]       ` <20130921113929.GB1587-fF5Pk5pvG8Y@public.gmane.org>
  1 sibling, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-23  5:47 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal, linux-efi

On 09/21/13 at 01:39pm, Borislav Petkov wrote:
> On Thu, Sep 19, 2013 at 04:54:54PM +0200, Borislav Petkov wrote:
> > From: Borislav Petkov <bp@suse.de>
> > 
> > We map the EFI regions needed for runtime services contiguously on
> > virtual addresses starting from -4G down for a total max space of 64G.
> > This way, we provide for stable runtime services addresses across
> > kernels so that a kexec'd kernel can still use them.
> > 
> > This way, they're mapped in a separate pagetable so that we don't
> > pollute the kernel namespace (you can see how the whole ioremapping and
> > saving and restoring of PGDs is gone now).
> 
> Ok, this one was not so good, let's try again:
> 
> This time I saved 32-bit and am switching the pagetable only after
> having built it properly. This boots fine again on baremetal and on OVMF
> with Matt's handover flags fix from yesterday.
> 
> Also, I've uploaded the whole series to
> git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git, branch
> efi-experimental
> 
> ("-experimental" doesn't trigger Fengguang's robot :-))
> 
> Good luck! :-)
> 
> ---
> From 880fcee20209a122eda846e7f109776ed1c56de5 Mon Sep 17 00:00:00 2001
> From: Borislav Petkov <bp@suse.de>
> Date: Wed, 18 Sep 2013 17:35:42 +0200
> Subject: [PATCH] EFI: Runtime services virtual mapping
> 
> We map the EFI regions needed for runtime services contiguously on
> virtual addresses starting from -4G down for a total max space of 64G.
> This way, we provide for stable runtime services addresses across
> kernels so that a kexec'd kernel can still use them.
> 
> This way, they're mapped in a separate pagetable so that we don't
> pollute the kernel namespace (you can see how the whole ioremapping and
> saving and restoring of PGDs is gone now).
> 
> Signed-off-by: Borislav Petkov <bp@suse.de>
> ---
>  arch/x86/include/asm/efi.h           | 43 ++++++++++--------
>  arch/x86/include/asm/pgtable_types.h |  3 +-
>  arch/x86/platform/efi/efi.c          | 68 ++++++++++++-----------------
>  arch/x86/platform/efi/efi_32.c       | 29 +++++++++++-
>  arch/x86/platform/efi/efi_64.c       | 85 +++++++++++++++++++-----------------
>  arch/x86/platform/efi/efi_stub_64.S  | 53 ++++++++++++++++++++++
>  6 files changed, 181 insertions(+), 100 deletions(-)
> 
> diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
> index 0062a0125041..9a99e0499e4b 100644
> --- a/arch/x86/include/asm/efi.h
> +++ b/arch/x86/include/asm/efi.h
> @@ -69,24 +69,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
>  	efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
>  		  (u64)(a4), (u64)(a5), (u64)(a6))
>  
> +#define _efi_call_virtX(x, f, ...)					\
> +({									\
> +	efi_status_t __s;						\
> +									\
> +	efi_sync_low_kernel_mappings();					\
> +	preempt_disable();						\
> +	__s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__);	\
> +	preempt_enable();						\
> +	__s;								\
> +})
> +
>  #define efi_call_virt0(f)				\
> -	efi_call0((efi.systab->runtime->f))
> -#define efi_call_virt1(f, a1)					\
> -	efi_call1((efi.systab->runtime->f), (u64)(a1))
> -#define efi_call_virt2(f, a1, a2)					\
> -	efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
> -#define efi_call_virt3(f, a1, a2, a3)					\
> -	efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3))
> -#define efi_call_virt4(f, a1, a2, a3, a4)				\
> -	efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3), (u64)(a4))
> -#define efi_call_virt5(f, a1, a2, a3, a4, a5)				\
> -	efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3), (u64)(a4), (u64)(a5))
> -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)			\
> -	efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
> -		  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
> +	_efi_call_virtX(0, f)
> +#define efi_call_virt1(f, a1)				\
> +	_efi_call_virtX(1, f, (u64)(a1))
> +#define efi_call_virt2(f, a1, a2)			\
> +	_efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
> +#define efi_call_virt3(f, a1, a2, a3)			\
> +	_efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
> +#define efi_call_virt4(f, a1, a2, a3, a4)		\
> +	_efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
> +#define efi_call_virt5(f, a1, a2, a3, a4, a5)		\
> +	_efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
> +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)	\
> +	_efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
>  
>  extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
>  				 u32 type, u64 attribute);
> @@ -101,6 +108,8 @@ extern void efi_call_phys_prelog(void);
>  extern void efi_call_phys_epilog(void);
>  extern void efi_unmap_memmap(void);
>  extern void efi_memory_uc(u64 addr, unsigned long size);
> +extern void __init efi_map_region(efi_memory_desc_t *md);
> +extern void efi_sync_low_kernel_mappings(void);
>  
>  #ifdef CONFIG_EFI
>  
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 0ecac257fb26..a83aa44bb1fb 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
>  extern phys_addr_t slow_virt_to_phys(void *__address);
> -
> +extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
> +				   unsigned numpages, unsigned long page_flags);
>  #endif	/* !__ASSEMBLY__ */
>  
>  #endif /* _ASM_X86_PGTABLE_DEFS_H */
> diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
> index 538c1e6b7b2c..90459f5f587c 100644
> --- a/arch/x86/platform/efi/efi.c
> +++ b/arch/x86/platform/efi/efi.c
> @@ -12,6 +12,8 @@
>   *	Bibo Mao <bibo.mao@intel.com>
>   *	Chandramouli Narayanan <mouli@linux.intel.com>
>   *	Huang Ying <ying.huang@intel.com>
> + * Copyright (C) 2013 SuSE Labs
> + * 	Borislav Petkov <bp@suse.de> - runtime services VA mapping
>   *
>   * Copied from efi_32.c to eliminate the duplicated code between EFI
>   * 32/64 support code. --ying 2007-10-26
> @@ -81,6 +83,17 @@ static efi_system_table_t efi_systab __initdata;
>  unsigned long x86_efi_facility;
>  
>  /*
> + * Scratch space used for switching the pagetable in the EFI stub
> + */
> +struct efi_scratch {
> +	u64 r15;
> +	u64 prev_cr3;
> +	pgd_t *efi_pgt;
> +	bool use_pgd;
> +};
> +extern struct efi_scratch efi_scratch;
> +
> +/*
>   * Returns 1 if 'facility' is enabled, 0 otherwise.
>   */
>  int efi_enabled(int facility)
> @@ -797,22 +810,6 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
>  		set_memory_nx(addr, npages);
>  }
>  
> -static void __init runtime_code_page_mkexec(void)
> -{
> -	efi_memory_desc_t *md;
> -	void *p;
> -
> -	/* Make EFI runtime service code area executable */
> -	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
> -		md = p;
> -
> -		if (md->type != EFI_RUNTIME_SERVICES_CODE)
> -			continue;
> -
> -		efi_set_executable(md, true);
> -	}
> -}
> -
>  /*
>   * We can't ioremap data in EFI boot services RAM, because we've already mapped
>   * it as RAM.  So, look it up in the existing EFI memory map instead.  Only
> @@ -862,10 +859,10 @@ void efi_memory_uc(u64 addr, unsigned long size)
>  void __init efi_enter_virtual_mode(void)
>  {
>  	efi_memory_desc_t *md, *prev_md = NULL;
> -	efi_status_t status;
> +	void *p, *new_memmap = NULL;
>  	unsigned long size;
> -	u64 end, systab, start_pfn, end_pfn;
> -	void *p, *va, *new_memmap = NULL;
> +	efi_status_t status;
> +	u64 end, systab;
>  	int count = 0;
>  
>  	efi.systab = NULL;
> @@ -874,7 +871,6 @@ void __init efi_enter_virtual_mode(void)
>  	 * We don't do virtual mode, since we don't do runtime services, on
>  	 * non-native EFI
>  	 */
> -
>  	if (!efi_is_native()) {
>  		efi_unmap_memmap();
>  		return;
> @@ -914,33 +910,18 @@ void __init efi_enter_virtual_mode(void)
>  		    md->type != EFI_BOOT_SERVICES_DATA)
>  			continue;
>  
> +		efi_map_region(md);
> +
>  		size = md->num_pages << PAGE_SHIFT;
>  		end = md->phys_addr + size;
>  
> -		start_pfn = PFN_DOWN(md->phys_addr);
> -		end_pfn = PFN_UP(end);
> -		if (pfn_range_is_mapped(start_pfn, end_pfn)) {
> -			va = __va(md->phys_addr);
> -
> -			if (!(md->attribute & EFI_MEMORY_WB))
> -				efi_memory_uc((u64)(unsigned long)va, size);
> -		} else
> -			va = efi_ioremap(md->phys_addr, size,
> -					 md->type, md->attribute);
> -
> -		md->virt_addr = (u64) (unsigned long) va;
> -
> -		if (!va) {
> -			pr_err("ioremap of 0x%llX failed!\n",
> -			       (unsigned long long)md->phys_addr);
> -			continue;
> -		}
> -
>  		systab = (u64) (unsigned long) efi_phys.systab;
>  		if (md->phys_addr <= systab && systab < end) {
>  			systab += md->virt_addr - md->phys_addr;
> +
>  			efi.systab = (efi_system_table_t *) (unsigned long) systab;
>  		}
> +
>  		new_memmap = krealloc(new_memmap,
>  				      (count + 1) * memmap.desc_size,
>  				      GFP_KERNEL);
> @@ -949,8 +930,15 @@ void __init efi_enter_virtual_mode(void)
>  		count++;
>  	}
>  
> +#ifdef CONFIG_X86_64
> +	efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
> +	efi_scratch.use_pgd = true;
> +#endif
> +
>  	BUG_ON(!efi.systab);
>  
> +	efi_sync_low_kernel_mappings();
> +
>  	status = phys_efi_set_virtual_address_map(
>  		memmap.desc_size * count,
>  		memmap.desc_size,
> @@ -983,8 +971,6 @@ void __init efi_enter_virtual_mode(void)
>  	efi.query_variable_info = virt_efi_query_variable_info;
>  	efi.update_capsule = virt_efi_update_capsule;
>  	efi.query_capsule_caps = virt_efi_query_capsule_caps;
> -	if (__supported_pte_mask & _PAGE_NX)
> -		runtime_code_page_mkexec();
>  
>  	kfree(new_memmap);
>  
> diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
> index 40e446941dd7..661663b08eaf 100644
> --- a/arch/x86/platform/efi/efi_32.c
> +++ b/arch/x86/platform/efi/efi_32.c
> @@ -37,9 +37,36 @@
>   * claim EFI runtime service handler exclusively and to duplicate a memory in
>   * low memory space say 0 - 3G.
>   */
> -
>  static unsigned long efi_rt_eflags;
>  
> +void efi_sync_low_kernel_mappings(void) {}
> +
> +void __init efi_map_region(efi_memory_desc_t *md)
> +{
> +	u64 start_pfn, end_pfn, end;
> +	unsigned long size;
> +	void *va;
> +
> +	start_pfn = PFN_DOWN(md->phys_addr);
> +	size	  = md->num_pages << PAGE_SHIFT;
> +	end	  = md->phys_addr + size;
> +	end_pfn   = PFN_UP(end);
> +
> +	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
> +		va = __va(md->phys_addr);
> +
> +		if (!(md->attribute & EFI_MEMORY_WB))
> +			efi_memory_uc((u64)(unsigned long)va, size);
> +	} else
> +		va = efi_ioremap(md->phys_addr, size,
> +				 md->type, md->attribute);
> +
> +	md->virt_addr = (u64) (unsigned long) va;
> +	if (!va)
> +		pr_err("ioremap of 0x%llX failed!\n",
> +		       (unsigned long long)md->phys_addr);
> +}
> +
>  void efi_call_phys_prelog(void)
>  {
>  	struct desc_ptr gdt_descr;
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index 39a0e7f1f0a3..db5230dd350e 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -39,59 +39,64 @@
>  #include <asm/cacheflush.h>
>  #include <asm/fixmap.h>
>  
> -static pgd_t *save_pgd __initdata;
> -static unsigned long efi_flags __initdata;
> +void __init efi_call_phys_prelog(void) {}
> +void __init efi_call_phys_epilog(void) {}
>  
> -static void __init early_code_mapping_set_exec(int executable)
> +/*
> + * Add low kernel mappings for passing arguments to EFI functions.
> + */
> +void efi_sync_low_kernel_mappings(void)
>  {
> -	efi_memory_desc_t *md;
> -	void *p;
> +	unsigned num_pgds;
> +	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
>  
> -	if (!(__supported_pte_mask & _PAGE_NX))
> -		return;
> +	num_pgds = pgd_index(VMALLOC_START - 1) - pgd_index(PAGE_OFFSET);
>  
> -	/* Make EFI service code area executable */
> -	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
> -		md = p;
> -		if (md->type == EFI_RUNTIME_SERVICES_CODE ||
> -		    md->type == EFI_BOOT_SERVICES_CODE)
> -			efi_set_executable(md, executable);
> -	}
> +	memcpy(pgd + pgd_index(PAGE_OFFSET),
> +		init_mm.pgd + pgd_index(PAGE_OFFSET),
> +		sizeof(pgd_t) * num_pgds);
>  }
>  
> -void __init efi_call_phys_prelog(void)
> +/*
> + * We allocate runtime services regions top-down, starting from -4G, i.e.
> + * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
> + */
> +static u64 efi_va	 = -4 * (1UL << 30);
> +#define EFI_VA_END	 (-68 * (1UL << 30))
> +
> +static void __init __map_region(efi_memory_desc_t *md, u64 va)
>  {
> -	unsigned long vaddress;
> -	int pgd;
> -	int n_pgds;
> +	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
> +	unsigned long pf = 0, size;
> +	u64 end;
>  
> -	early_code_mapping_set_exec(1);
> -	local_irq_save(efi_flags);
> +	if (!(md->attribute & EFI_MEMORY_WB))
> +		pf |= _PAGE_PCD;
>  
> -	n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
> -	save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
> +	size = md->num_pages << PAGE_SHIFT;
> +	end  = va + size;
>  
> -	for (pgd = 0; pgd < n_pgds; pgd++) {
> -		save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
> -		vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
> -		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
> -	}
> -	__flush_tlb_all();
> +	if(kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
> +		pr_warning("Error mapping PA 0x%llx -> VA 0x%llx!\n",
> +			   md->phys_addr, va);
>  }
>  
> -void __init efi_call_phys_epilog(void)
> +void __init efi_map_region(efi_memory_desc_t *md)
>  {
> -	/*
> -	 * After the lock is released, the original page table is restored.
> -	 */
> -	int pgd;
> -	int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
> -	for (pgd = 0; pgd < n_pgds; pgd++)
> -		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
> -	kfree(save_pgd);
> -	__flush_tlb_all();
> -	local_irq_restore(efi_flags);
> -	early_code_mapping_set_exec(0);
> +	unsigned long size = md->num_pages << PAGE_SHIFT;
> +
> +	efi_va -= size;
> +	if (efi_va < EFI_VA_END) {
> +		pr_warning(FW_WARN "VA address range overflow!\n");
> +		return;
> +	}
> +
> +	/* Do the 1:1 map */
> +	__map_region(md, md->phys_addr);
> +
> +	/* Do the VA map */
> +	__map_region(md, efi_va);


Could you add comment for above code? It's hard to understand the
twice mapping if one did not follow the old thread.


> +	md->virt_addr = efi_va;
>  }
>  
>  void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
> diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
> index 4c07ccab8146..2bb9714bf713 100644
> --- a/arch/x86/platform/efi/efi_stub_64.S
> +++ b/arch/x86/platform/efi/efi_stub_64.S
> @@ -34,10 +34,47 @@
>  	mov %rsi, %cr0;			\
>  	mov (%rsp), %rsp
>  
> +	/* stolen from gcc */
> +	.macro FLUSH_TLB_ALL
> +	movq %r15, efi_scratch(%rip)
> +	movq %r14, efi_scratch+8(%rip)
> +	movq %cr4, %r15
> +	movq %r15, %r14
> +	andb $0x7f, %r14b
> +	movq %r14, %cr4
> +	movq %r15, %cr4
> +	movq efi_scratch+8(%rip), %r14
> +	movq efi_scratch(%rip), %r15
> +	.endm
> +
> +	.macro SWITCH_PGT
> +	cmpb $0, efi_scratch+24(%rip)
> +	je 1f
> +	movq %r15, efi_scratch(%rip)		# r15
> +	# save previous CR3
> +	movq %cr3, %r15
> +	movq %r15, efi_scratch+8(%rip)		# prev_cr3
> +	movq efi_scratch+16(%rip), %r15		# EFI pgt
> +	movq %r15, %cr3
> +	1:
> +	.endm
> +
> +	.macro RESTORE_PGT
> +	cmpb $0, efi_scratch+24(%rip)
> +	je 2f
> +	movq efi_scratch+8(%rip), %r15
> +	movq %r15, %cr3
> +	movq efi_scratch(%rip), %r15
> +	FLUSH_TLB_ALL
> +	2:
> +	.endm
> +
>  ENTRY(efi_call0)
>  	SAVE_XMM
>  	subq $32, %rsp
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -47,7 +84,9 @@ ENTRY(efi_call1)
>  	SAVE_XMM
>  	subq $32, %rsp
>  	mov  %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -57,7 +96,9 @@ ENTRY(efi_call2)
>  	SAVE_XMM
>  	subq $32, %rsp
>  	mov  %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -68,7 +109,9 @@ ENTRY(efi_call3)
>  	subq $32, %rsp
>  	mov  %rcx, %r8
>  	mov  %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -80,7 +123,9 @@ ENTRY(efi_call4)
>  	mov %r8, %r9
>  	mov %rcx, %r8
>  	mov %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $32, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -93,7 +138,9 @@ ENTRY(efi_call5)
>  	mov %r8, %r9
>  	mov %rcx, %r8
>  	mov %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $48, %rsp
>  	RESTORE_XMM
>  	ret
> @@ -109,8 +156,14 @@ ENTRY(efi_call6)
>  	mov %r8, %r9
>  	mov %rcx, %r8
>  	mov %rsi, %rcx
> +	SWITCH_PGT
>  	call *%rdi
> +	RESTORE_PGT
>  	addq $48, %rsp
>  	RESTORE_XMM
>  	ret
>  ENDPROC(efi_call6)
> +
> +	.data
> +ENTRY(efi_scratch)
> +	.fill 3,8,0
> -- 
> 1.8.4
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]         ` <20130923054741.GC7007-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-23  6:29           ` Borislav Petkov
       [not found]             ` <20130923062947.GA2527-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-23  6:29 UTC (permalink / raw)
  To: Dave Young
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Mon, Sep 23, 2013 at 01:47:41PM +0800, Dave Young wrote:
> > +	unsigned long size = md->num_pages << PAGE_SHIFT;
> > +
> > +	efi_va -= size;
> > +	if (efi_va < EFI_VA_END) {
> > +		pr_warning(FW_WARN "VA address range overflow!\n");
> > +		return;
> > +	}
> > +
> > +	/* Do the 1:1 map */
> > +	__map_region(md, md->phys_addr);
> > +
> > +	/* Do the VA map */
> > +	__map_region(md, efi_va);
> 
> 
> Could you add comment for above code? It's hard to understand the
> twice mapping if one did not follow the old thread.

Does that suffice:

/*
 * Make sure the 1:1 mappings are present as a catch-all for b0rked firmware
 * which doesn't update all internal pointers after switching to virtual mode
 * and would otherwise crap on us.
 */

?

Btw, when you reply to a mail, please remove that quoted portion of it
which you're not replying to - I had to scroll a bunch of screens down
and I almost missed your reply. :)

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]             ` <20130923062947.GA2527-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-09-23  7:08               ` Dave Young
  0 siblings, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-23  7:08 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/23/13 at 08:29am, Borislav Petkov wrote:
> On Mon, Sep 23, 2013 at 01:47:41PM +0800, Dave Young wrote:
> > > +	unsigned long size = md->num_pages << PAGE_SHIFT;
> > > +
> > > +	efi_va -= size;
> > > +	if (efi_va < EFI_VA_END) {
> > > +		pr_warning(FW_WARN "VA address range overflow!\n");
> > > +		return;
> > > +	}
> > > +
> > > +	/* Do the 1:1 map */
> > > +	__map_region(md, md->phys_addr);
> > > +
> > > +	/* Do the VA map */
> > > +	__map_region(md, efi_va);
> > 
> > 
> > Could you add comment for above code? It's hard to understand the
> > twice mapping if one did not follow the old thread.
> 
> Does that suffice:
> 
> /*
>  * Make sure the 1:1 mappings are present as a catch-all for b0rked firmware
>  * which doesn't update all internal pointers after switching to virtual mode
>  * and would otherwise crap on us.
>  */
> 
> ?

Yes, looks good. Thanks

> 
> Btw, when you reply to a mail, please remove that quoted portion of it
> which you're not replying to - I had to scroll a bunch of screens down
> and I almost missed your reply. :)

Will do. I did notice the problem after I enter 'y' in mutt, sorry about it.

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]       ` <20130921113929.GB1587-fF5Pk5pvG8Y@public.gmane.org>
  2013-09-22 12:35         ` Dave Young
@ 2013-09-23  8:45         ` Borislav Petkov
  2013-09-25  9:24         ` Borislav Petkov
  2 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2013-09-23  8:45 UTC (permalink / raw)
  To: X86 ML
  Cc: LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal, Dave Young,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Sat, Sep 21, 2013 at 01:39:29PM +0200, Borislav Petkov wrote:
> -void __init efi_call_phys_prelog(void)
> +/*
> + * We allocate runtime services regions top-down, starting from -4G, i.e.
> + * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
> + */
> +static u64 efi_va	 = -4 * (1UL << 30);
> +#define EFI_VA_END	 (-68 * (1UL << 30))

Note to self: add this range to Documentation/x86/x86_64/mm.txt

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                   ` <1ba7eca6-419c-4181-9927-9ba0927a6abf-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
  2013-09-22 16:38                     ` Borislav Petkov
  2013-09-23  5:45                     ` Dave Young
@ 2013-09-24  2:52                     ` Dave Young
       [not found]                       ` <20130924025209.GA5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2 siblings, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-24  2:52 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/22/13 at 08:27am, H. Peter Anvin wrote:
> The address that faults is interesting in that it is indeed just below -4G.  The question at hand is probably what information you are using to build the EFI mappings in the secondary kernel and what could make it not match the primary.
> 
> Assuming it isn't as simple as the mappings never get built at all.

Here is my debug output, diff efi-mapping-1st-kernel efi-mapping-2nd-kernel:
Obviously, the high address mapping is not same:

--- efi-mapping-1.txt	2013-09-24 10:46:09.977746047 +0800
+++ efi-mapping-2.txt	2013-09-24 10:46:33.871421806 +0800
@@ -1,30 +1,30 @@
 efi mapping PA 0x800000 -> VA 0x800000
 efi mapping PA 0x800000 -> VA 0xffffffff00000000
 efi mapping PA 0x7c000000 -> VA 0x7c000000
-efi mapping PA 0x7c000000 -> VA 0xfffffffefffe0000
+efi mapping PA 0x7c000000 -> VA 0xffffffff00000000
 efi mapping PA 0x7d5e2000 -> VA 0x7d5e2000
-efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffdf000
+efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffff000
 efi mapping PA 0x7d77d000 -> VA 0x7d77d000
-efi mapping PA 0x7d77d000 -> VA 0xfffffffefffde000
+efi mapping PA 0x7d77d000 -> VA 0xfffffffeffffe000
 efi mapping PA 0x7d864000 -> VA 0x7d864000
-efi mapping PA 0x7d864000 -> VA 0xfffffffeff8d4000
+efi mapping PA 0x7d864000 -> VA 0xfffffffeff8f4000
 efi mapping PA 0x7df6e000 -> VA 0x7df6e000
-efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ae000
+efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ce000
 efi mapping PA 0x7e194000 -> VA 0x7e194000
-efi mapping PA 0x7e194000 -> VA 0xfffffffeff6ac000
+efi mapping PA 0x7e194000 -> VA 0xfffffffeff6cc000
 efi mapping PA 0x7e196000 -> VA 0x7e196000
-efi mapping PA 0x7e196000 -> VA 0xfffffffeff696000
+efi mapping PA 0x7e196000 -> VA 0xfffffffeff6b6000
 efi mapping PA 0x7e1ac000 -> VA 0x7e1ac000
-efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff681000
+efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff6a1000
 efi mapping PA 0x7e1c1000 -> VA 0x7e1c1000
-efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe041000
+efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe061000
 efi mapping PA 0x7f802000 -> VA 0x7f802000
-efi mapping PA 0x7f802000 -> VA 0xfffffffefdec2000
+efi mapping PA 0x7f802000 -> VA 0xfffffffefdee2000
 efi mapping PA 0x7f981000 -> VA 0x7f981000
-efi mapping PA 0x7f981000 -> VA 0xfffffffefde92000
+efi mapping PA 0x7f981000 -> VA 0xfffffffefdeb2000
 efi mapping PA 0x7f9b1000 -> VA 0x7f9b1000
-efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde6e000
+efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde8e000
 efi mapping PA 0x7f9e5000 -> VA 0x7f9e5000
-efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd873000
+efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd893000
 efi mapping PA 0x7ffe0000 -> VA 0x7ffe0000
-efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd853000
+efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd873000

> 
> 
> Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
> >On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> >> I tested your new patch, it works both with efi stub and grub boot in
> >> 1st kernel.
> >
> >Good, thanks!
> >
> >> But it paniced in kexec boot with my kexec related patcheset, the
> >patchset
> >
> >That's the second kernel, right?
> >
> >> contains 3 patch:
> >> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> >> 2. export physical addr fw_vendor, runtime, tables to
> >/sys/firmware/efi/systab
> >> 3. if kexecboot != 0, use fw_vendor, runtime, tables from bootparams;
> >Also do not
> >>    call SetVirtualAddressMao in case kexecboot.
> >> 
> >> The panic happens at the last line of efi_init:
> >>         /* clean DUMMY object */
> >>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> >>                          EFI_VARIABLE_NON_VOLATILE |
> >>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> >>                          EFI_VARIABLE_RUNTIME_ACCESS,
> >>                          0, NULL);
> >> 
> >> Below is the dmesg:
> >> [    0.003359] pid_max: default: 32768 minimum: 301
> >> [    0.004792] BUG: unable to handle kernel paging request at
> >fffffffefde97e70
> >> [    0.006666] IP: [<ffffffff8103a1db>]
> >virt_efi_set_variable+0x40/0x54
> >> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> >
> >Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is 0.
> >
> >Ok, can you upload your patches somewhere and tell me exactly how to
> >reproduce this so that I can take a look too?
> >
> >Thanks.
> 
> -- 
> Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                       ` <20130924025209.GA5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-24  3:06                         ` H. Peter Anvin
       [not found]                           ` <2d27a1bc-eabf-4d45-8303-27ae58511b11-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-09-24  3:06 UTC (permalink / raw)
  To: Dave Young
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

Okay... I see two problems.

1. It looks like we subtract the region size after, rather than before, assigning an address.

2. The second region is assigned the same address in the secondary kernel as in the first, implying the size of the first region was somehow set to zero.

Dave Young <dyoung-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>On 09/22/13 at 08:27am, H. Peter Anvin wrote:
>> The address that faults is interesting in that it is indeed just
>below -4G.  The question at hand is probably what information you are
>using to build the EFI mappings in the secondary kernel and what could
>make it not match the primary.
>> 
>> Assuming it isn't as simple as the mappings never get built at all.
>
>Here is my debug output, diff efi-mapping-1st-kernel
>efi-mapping-2nd-kernel:
>Obviously, the high address mapping is not same:
>
>--- efi-mapping-1.txt	2013-09-24 10:46:09.977746047 +0800
>+++ efi-mapping-2.txt	2013-09-24 10:46:33.871421806 +0800
>@@ -1,30 +1,30 @@
> efi mapping PA 0x800000 -> VA 0x800000
> efi mapping PA 0x800000 -> VA 0xffffffff00000000
> efi mapping PA 0x7c000000 -> VA 0x7c000000
>-efi mapping PA 0x7c000000 -> VA 0xfffffffefffe0000
>+efi mapping PA 0x7c000000 -> VA 0xffffffff00000000
> efi mapping PA 0x7d5e2000 -> VA 0x7d5e2000
>-efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffdf000
>+efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffff000
> efi mapping PA 0x7d77d000 -> VA 0x7d77d000
>-efi mapping PA 0x7d77d000 -> VA 0xfffffffefffde000
>+efi mapping PA 0x7d77d000 -> VA 0xfffffffeffffe000
> efi mapping PA 0x7d864000 -> VA 0x7d864000
>-efi mapping PA 0x7d864000 -> VA 0xfffffffeff8d4000
>+efi mapping PA 0x7d864000 -> VA 0xfffffffeff8f4000
> efi mapping PA 0x7df6e000 -> VA 0x7df6e000
>-efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ae000
>+efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ce000
> efi mapping PA 0x7e194000 -> VA 0x7e194000
>-efi mapping PA 0x7e194000 -> VA 0xfffffffeff6ac000
>+efi mapping PA 0x7e194000 -> VA 0xfffffffeff6cc000
> efi mapping PA 0x7e196000 -> VA 0x7e196000
>-efi mapping PA 0x7e196000 -> VA 0xfffffffeff696000
>+efi mapping PA 0x7e196000 -> VA 0xfffffffeff6b6000
> efi mapping PA 0x7e1ac000 -> VA 0x7e1ac000
>-efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff681000
>+efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff6a1000
> efi mapping PA 0x7e1c1000 -> VA 0x7e1c1000
>-efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe041000
>+efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe061000
> efi mapping PA 0x7f802000 -> VA 0x7f802000
>-efi mapping PA 0x7f802000 -> VA 0xfffffffefdec2000
>+efi mapping PA 0x7f802000 -> VA 0xfffffffefdee2000
> efi mapping PA 0x7f981000 -> VA 0x7f981000
>-efi mapping PA 0x7f981000 -> VA 0xfffffffefde92000
>+efi mapping PA 0x7f981000 -> VA 0xfffffffefdeb2000
> efi mapping PA 0x7f9b1000 -> VA 0x7f9b1000
>-efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde6e000
>+efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde8e000
> efi mapping PA 0x7f9e5000 -> VA 0x7f9e5000
>-efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd873000
>+efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd893000
> efi mapping PA 0x7ffe0000 -> VA 0x7ffe0000
>-efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd853000
>+efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd873000
>
>> 
>> 
>> Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
>> >On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
>> >> I tested your new patch, it works both with efi stub and grub boot
>in
>> >> 1st kernel.
>> >
>> >Good, thanks!
>> >
>> >> But it paniced in kexec boot with my kexec related patcheset, the
>> >patchset
>> >
>> >That's the second kernel, right?
>> >
>> >> contains 3 patch:
>> >> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
>> >> 2. export physical addr fw_vendor, runtime, tables to
>> >/sys/firmware/efi/systab
>> >> 3. if kexecboot != 0, use fw_vendor, runtime, tables from
>bootparams;
>> >Also do not
>> >>    call SetVirtualAddressMao in case kexecboot.
>> >> 
>> >> The panic happens at the last line of efi_init:
>> >>         /* clean DUMMY object */
>> >>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
>> >>                          EFI_VARIABLE_NON_VOLATILE |
>> >>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
>> >>                          EFI_VARIABLE_RUNTIME_ACCESS,
>> >>                          0, NULL);
>> >> 
>> >> Below is the dmesg:
>> >> [    0.003359] pid_max: default: 32768 minimum: 301
>> >> [    0.004792] BUG: unable to handle kernel paging request at
>> >fffffffefde97e70
>> >> [    0.006666] IP: [<ffffffff8103a1db>]
>> >virt_efi_set_variable+0x40/0x54
>> >> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
>> >
>> >Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is
>0.
>> >
>> >Ok, can you upload your patches somewhere and tell me exactly how to
>> >reproduce this so that I can take a look too?
>> >
>> >Thanks.
>> 
>> -- 
>> Sent from my mobile phone.  Please pardon brevity and lack of
>formatting.

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                           ` <2d27a1bc-eabf-4d45-8303-27ae58511b11-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
@ 2013-09-24  4:57                             ` Dave Young
       [not found]                               ` <20130924045705.GB5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2013-10-02 10:04                             ` Borislav Petkov
  1 sibling, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-24  4:57 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
> Okay... I see two problems.
> 
> 1. It looks like we subtract the region size after, rather than before, assigning an address.
> 
> 2. The second region is assigned the same address in the secondary kernel as in the first, implying the size of the first region was somehow set to zero.

I find the reason, efi_reserve_boot_services will reserve the BOOT_SERVICE_DATA region
thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping addr after
the md will be not same as 1st kernel, see below code:
 
void __init efi_map_region(efi_memory_desc_t *md)
{
        unsigned long size = md->num_pages << PAGE_SHIFT;

        efi_va -= size;
        ^^^^^^^^^^^^^^^
	[snip]
}


> 
> Dave Young <dyoung-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >On 09/22/13 at 08:27am, H. Peter Anvin wrote:
> >> The address that faults is interesting in that it is indeed just
> >below -4G.  The question at hand is probably what information you are
> >using to build the EFI mappings in the secondary kernel and what could
> >make it not match the primary.
> >> 
> >> Assuming it isn't as simple as the mappings never get built at all.
> >
> >Here is my debug output, diff efi-mapping-1st-kernel
> >efi-mapping-2nd-kernel:
> >Obviously, the high address mapping is not same:
> >
> >--- efi-mapping-1.txt	2013-09-24 10:46:09.977746047 +0800
> >+++ efi-mapping-2.txt	2013-09-24 10:46:33.871421806 +0800
> >@@ -1,30 +1,30 @@
> > efi mapping PA 0x800000 -> VA 0x800000
> > efi mapping PA 0x800000 -> VA 0xffffffff00000000
> > efi mapping PA 0x7c000000 -> VA 0x7c000000
> >-efi mapping PA 0x7c000000 -> VA 0xfffffffefffe0000
> >+efi mapping PA 0x7c000000 -> VA 0xffffffff00000000
> > efi mapping PA 0x7d5e2000 -> VA 0x7d5e2000
> >-efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffdf000
> >+efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffff000
> > efi mapping PA 0x7d77d000 -> VA 0x7d77d000
> >-efi mapping PA 0x7d77d000 -> VA 0xfffffffefffde000
> >+efi mapping PA 0x7d77d000 -> VA 0xfffffffeffffe000
> > efi mapping PA 0x7d864000 -> VA 0x7d864000
> >-efi mapping PA 0x7d864000 -> VA 0xfffffffeff8d4000
> >+efi mapping PA 0x7d864000 -> VA 0xfffffffeff8f4000
> > efi mapping PA 0x7df6e000 -> VA 0x7df6e000
> >-efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ae000
> >+efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ce000
> > efi mapping PA 0x7e194000 -> VA 0x7e194000
> >-efi mapping PA 0x7e194000 -> VA 0xfffffffeff6ac000
> >+efi mapping PA 0x7e194000 -> VA 0xfffffffeff6cc000
> > efi mapping PA 0x7e196000 -> VA 0x7e196000
> >-efi mapping PA 0x7e196000 -> VA 0xfffffffeff696000
> >+efi mapping PA 0x7e196000 -> VA 0xfffffffeff6b6000
> > efi mapping PA 0x7e1ac000 -> VA 0x7e1ac000
> >-efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff681000
> >+efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff6a1000
> > efi mapping PA 0x7e1c1000 -> VA 0x7e1c1000
> >-efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe041000
> >+efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe061000
> > efi mapping PA 0x7f802000 -> VA 0x7f802000
> >-efi mapping PA 0x7f802000 -> VA 0xfffffffefdec2000
> >+efi mapping PA 0x7f802000 -> VA 0xfffffffefdee2000
> > efi mapping PA 0x7f981000 -> VA 0x7f981000
> >-efi mapping PA 0x7f981000 -> VA 0xfffffffefde92000
> >+efi mapping PA 0x7f981000 -> VA 0xfffffffefdeb2000
> > efi mapping PA 0x7f9b1000 -> VA 0x7f9b1000
> >-efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde6e000
> >+efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde8e000
> > efi mapping PA 0x7f9e5000 -> VA 0x7f9e5000
> >-efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd873000
> >+efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd893000
> > efi mapping PA 0x7ffe0000 -> VA 0x7ffe0000
> >-efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd853000
> >+efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd873000
> >
> >> 
> >> 
> >> Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
> >> >On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> >> >> I tested your new patch, it works both with efi stub and grub boot
> >in
> >> >> 1st kernel.
> >> >
> >> >Good, thanks!
> >> >
> >> >> But it paniced in kexec boot with my kexec related patcheset, the
> >> >patchset
> >> >
> >> >That's the second kernel, right?
> >> >
> >> >> contains 3 patch:
> >> >> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> >> >> 2. export physical addr fw_vendor, runtime, tables to
> >> >/sys/firmware/efi/systab
> >> >> 3. if kexecboot != 0, use fw_vendor, runtime, tables from
> >bootparams;
> >> >Also do not
> >> >>    call SetVirtualAddressMao in case kexecboot.
> >> >> 
> >> >> The panic happens at the last line of efi_init:
> >> >>         /* clean DUMMY object */
> >> >>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> >> >>                          EFI_VARIABLE_NON_VOLATILE |
> >> >>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> >> >>                          EFI_VARIABLE_RUNTIME_ACCESS,
> >> >>                          0, NULL);
> >> >> 
> >> >> Below is the dmesg:
> >> >> [    0.003359] pid_max: default: 32768 minimum: 301
> >> >> [    0.004792] BUG: unable to handle kernel paging request at
> >> >fffffffefde97e70
> >> >> [    0.006666] IP: [<ffffffff8103a1db>]
> >> >virt_efi_set_variable+0x40/0x54
> >> >> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> >> >
> >> >Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is
> >0.
> >> >
> >> >Ok, can you upload your patches somewhere and tell me exactly how to
> >> >reproduce this so that I can take a look too?
> >> >
> >> >Thanks.
> >> 
> >> -- 
> >> Sent from my mobile phone.  Please pardon brevity and lack of
> >formatting.
> 
> -- 
> Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                               ` <20130924045705.GB5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-24  4:58                                 ` Dave Young
       [not found]                                   ` <20130924045818.GC5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2013-09-24  9:43                                 ` Borislav Petkov
  1 sibling, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-24  4:58 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 12:57pm, Dave Young wrote:
> On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
> > Okay... I see two problems.
> > 
> > 1. It looks like we subtract the region size after, rather than before, assigning an address.
> > 
> > 2. The second region is assigned the same address in the secondary kernel as in the first, implying the size of the first region was somehow set to zero.
> 
> I find the reason, efi_reserve_boot_services will reserve the BOOT_SERVICE_DATA region
> thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping addr after
> the md will be not same as 1st kernel, see below code:
>  
> void __init efi_map_region(efi_memory_desc_t *md)
> {
>         unsigned long size = md->num_pages << PAGE_SHIFT;
> 
>         efi_va -= size;
>         ^^^^^^^^^^^^^^^
> 	[snip]
> }

So how about just reserve BOOT_SERVICE_DATA region but keep the md.numpages as is?

> 
> 
> > 
> > Dave Young <dyoung-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > >On 09/22/13 at 08:27am, H. Peter Anvin wrote:
> > >> The address that faults is interesting in that it is indeed just
> > >below -4G.  The question at hand is probably what information you are
> > >using to build the EFI mappings in the secondary kernel and what could
> > >make it not match the primary.
> > >> 
> > >> Assuming it isn't as simple as the mappings never get built at all.
> > >
> > >Here is my debug output, diff efi-mapping-1st-kernel
> > >efi-mapping-2nd-kernel:
> > >Obviously, the high address mapping is not same:
> > >
> > >--- efi-mapping-1.txt	2013-09-24 10:46:09.977746047 +0800
> > >+++ efi-mapping-2.txt	2013-09-24 10:46:33.871421806 +0800
> > >@@ -1,30 +1,30 @@
> > > efi mapping PA 0x800000 -> VA 0x800000
> > > efi mapping PA 0x800000 -> VA 0xffffffff00000000
> > > efi mapping PA 0x7c000000 -> VA 0x7c000000
> > >-efi mapping PA 0x7c000000 -> VA 0xfffffffefffe0000
> > >+efi mapping PA 0x7c000000 -> VA 0xffffffff00000000
> > > efi mapping PA 0x7d5e2000 -> VA 0x7d5e2000
> > >-efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffdf000
> > >+efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffff000
> > > efi mapping PA 0x7d77d000 -> VA 0x7d77d000
> > >-efi mapping PA 0x7d77d000 -> VA 0xfffffffefffde000
> > >+efi mapping PA 0x7d77d000 -> VA 0xfffffffeffffe000
> > > efi mapping PA 0x7d864000 -> VA 0x7d864000
> > >-efi mapping PA 0x7d864000 -> VA 0xfffffffeff8d4000
> > >+efi mapping PA 0x7d864000 -> VA 0xfffffffeff8f4000
> > > efi mapping PA 0x7df6e000 -> VA 0x7df6e000
> > >-efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ae000
> > >+efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ce000
> > > efi mapping PA 0x7e194000 -> VA 0x7e194000
> > >-efi mapping PA 0x7e194000 -> VA 0xfffffffeff6ac000
> > >+efi mapping PA 0x7e194000 -> VA 0xfffffffeff6cc000
> > > efi mapping PA 0x7e196000 -> VA 0x7e196000
> > >-efi mapping PA 0x7e196000 -> VA 0xfffffffeff696000
> > >+efi mapping PA 0x7e196000 -> VA 0xfffffffeff6b6000
> > > efi mapping PA 0x7e1ac000 -> VA 0x7e1ac000
> > >-efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff681000
> > >+efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff6a1000
> > > efi mapping PA 0x7e1c1000 -> VA 0x7e1c1000
> > >-efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe041000
> > >+efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe061000
> > > efi mapping PA 0x7f802000 -> VA 0x7f802000
> > >-efi mapping PA 0x7f802000 -> VA 0xfffffffefdec2000
> > >+efi mapping PA 0x7f802000 -> VA 0xfffffffefdee2000
> > > efi mapping PA 0x7f981000 -> VA 0x7f981000
> > >-efi mapping PA 0x7f981000 -> VA 0xfffffffefde92000
> > >+efi mapping PA 0x7f981000 -> VA 0xfffffffefdeb2000
> > > efi mapping PA 0x7f9b1000 -> VA 0x7f9b1000
> > >-efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde6e000
> > >+efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde8e000
> > > efi mapping PA 0x7f9e5000 -> VA 0x7f9e5000
> > >-efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd873000
> > >+efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd893000
> > > efi mapping PA 0x7ffe0000 -> VA 0x7ffe0000
> > >-efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd853000
> > >+efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd873000
> > >
> > >> 
> > >> 
> > >> Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
> > >> >On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> > >> >> I tested your new patch, it works both with efi stub and grub boot
> > >in
> > >> >> 1st kernel.
> > >> >
> > >> >Good, thanks!
> > >> >
> > >> >> But it paniced in kexec boot with my kexec related patcheset, the
> > >> >patchset
> > >> >
> > >> >That's the second kernel, right?
> > >> >
> > >> >> contains 3 patch:
> > >> >> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> > >> >> 2. export physical addr fw_vendor, runtime, tables to
> > >> >/sys/firmware/efi/systab
> > >> >> 3. if kexecboot != 0, use fw_vendor, runtime, tables from
> > >bootparams;
> > >> >Also do not
> > >> >>    call SetVirtualAddressMao in case kexecboot.
> > >> >> 
> > >> >> The panic happens at the last line of efi_init:
> > >> >>         /* clean DUMMY object */
> > >> >>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> > >> >>                          EFI_VARIABLE_NON_VOLATILE |
> > >> >>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> > >> >>                          EFI_VARIABLE_RUNTIME_ACCESS,
> > >> >>                          0, NULL);
> > >> >> 
> > >> >> Below is the dmesg:
> > >> >> [    0.003359] pid_max: default: 32768 minimum: 301
> > >> >> [    0.004792] BUG: unable to handle kernel paging request at
> > >> >fffffffefde97e70
> > >> >> [    0.006666] IP: [<ffffffff8103a1db>]
> > >> >virt_efi_set_variable+0x40/0x54
> > >> >> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> > >> >
> > >> >Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is
> > >0.
> > >> >
> > >> >Ok, can you upload your patches somewhere and tell me exactly how to
> > >> >reproduce this so that I can take a look too?
> > >> >
> > >> >Thanks.
> > >> 
> > >> -- 
> > >> Sent from my mobile phone.  Please pardon brevity and lack of
> > >formatting.
> > 
> > -- 
> > Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                   ` <20130924045818.GC5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-24  5:23                                     ` Dave Young
  2013-09-24  8:57                                       ` Dave Young
  0 siblings, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-24  5:23 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 12:58pm, Dave Young wrote:
> On 09/24/13 at 12:57pm, Dave Young wrote:
> > On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
> > > Okay... I see two problems.
> > > 
> > > 1. It looks like we subtract the region size after, rather than before, assigning an address.

Could you explain more about this problem? Where is the code?

> > > 
> > > 2. The second region is assigned the same address in the secondary kernel as in the first, implying the size of the first region was somehow set to zero.
> > 
> > I find the reason, efi_reserve_boot_services will reserve the BOOT_SERVICE_DATA region
> > thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping addr after
> > the md will be not same as 1st kernel, see below code:
> >  
> > void __init efi_map_region(efi_memory_desc_t *md)
> > {
> >         unsigned long size = md->num_pages << PAGE_SHIFT;
> > 
> >         efi_va -= size;
> >         ^^^^^^^^^^^^^^^
> > 	[snip]
> > }
> 
> So how about just reserve BOOT_SERVICE_DATA region but keep the md.numpages as is?

Hmm, num_pages = 0 is only set when boot service region reservation is imporsible, I'm
lost.. But there must be somewhere set the size to 0.

> 
> > 
> > 
> > > 
> > > Dave Young <dyoung-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > > >On 09/22/13 at 08:27am, H. Peter Anvin wrote:
> > > >> The address that faults is interesting in that it is indeed just
> > > >below -4G.  The question at hand is probably what information you are
> > > >using to build the EFI mappings in the secondary kernel and what could
> > > >make it not match the primary.
> > > >> 
> > > >> Assuming it isn't as simple as the mappings never get built at all.
> > > >
> > > >Here is my debug output, diff efi-mapping-1st-kernel
> > > >efi-mapping-2nd-kernel:
> > > >Obviously, the high address mapping is not same:
> > > >
> > > >--- efi-mapping-1.txt	2013-09-24 10:46:09.977746047 +0800
> > > >+++ efi-mapping-2.txt	2013-09-24 10:46:33.871421806 +0800
> > > >@@ -1,30 +1,30 @@
> > > > efi mapping PA 0x800000 -> VA 0x800000
> > > > efi mapping PA 0x800000 -> VA 0xffffffff00000000
> > > > efi mapping PA 0x7c000000 -> VA 0x7c000000
> > > >-efi mapping PA 0x7c000000 -> VA 0xfffffffefffe0000
> > > >+efi mapping PA 0x7c000000 -> VA 0xffffffff00000000
> > > > efi mapping PA 0x7d5e2000 -> VA 0x7d5e2000
> > > >-efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffdf000
> > > >+efi mapping PA 0x7d5e2000 -> VA 0xfffffffefffff000
> > > > efi mapping PA 0x7d77d000 -> VA 0x7d77d000
> > > >-efi mapping PA 0x7d77d000 -> VA 0xfffffffefffde000
> > > >+efi mapping PA 0x7d77d000 -> VA 0xfffffffeffffe000
> > > > efi mapping PA 0x7d864000 -> VA 0x7d864000
> > > >-efi mapping PA 0x7d864000 -> VA 0xfffffffeff8d4000
> > > >+efi mapping PA 0x7d864000 -> VA 0xfffffffeff8f4000
> > > > efi mapping PA 0x7df6e000 -> VA 0x7df6e000
> > > >-efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ae000
> > > >+efi mapping PA 0x7df6e000 -> VA 0xfffffffeff6ce000
> > > > efi mapping PA 0x7e194000 -> VA 0x7e194000
> > > >-efi mapping PA 0x7e194000 -> VA 0xfffffffeff6ac000
> > > >+efi mapping PA 0x7e194000 -> VA 0xfffffffeff6cc000
> > > > efi mapping PA 0x7e196000 -> VA 0x7e196000
> > > >-efi mapping PA 0x7e196000 -> VA 0xfffffffeff696000
> > > >+efi mapping PA 0x7e196000 -> VA 0xfffffffeff6b6000
> > > > efi mapping PA 0x7e1ac000 -> VA 0x7e1ac000
> > > >-efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff681000
> > > >+efi mapping PA 0x7e1ac000 -> VA 0xfffffffeff6a1000
> > > > efi mapping PA 0x7e1c1000 -> VA 0x7e1c1000
> > > >-efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe041000
> > > >+efi mapping PA 0x7e1c1000 -> VA 0xfffffffefe061000
> > > > efi mapping PA 0x7f802000 -> VA 0x7f802000
> > > >-efi mapping PA 0x7f802000 -> VA 0xfffffffefdec2000
> > > >+efi mapping PA 0x7f802000 -> VA 0xfffffffefdee2000
> > > > efi mapping PA 0x7f981000 -> VA 0x7f981000
> > > >-efi mapping PA 0x7f981000 -> VA 0xfffffffefde92000
> > > >+efi mapping PA 0x7f981000 -> VA 0xfffffffefdeb2000
> > > > efi mapping PA 0x7f9b1000 -> VA 0x7f9b1000
> > > >-efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde6e000
> > > >+efi mapping PA 0x7f9b1000 -> VA 0xfffffffefde8e000
> > > > efi mapping PA 0x7f9e5000 -> VA 0x7f9e5000
> > > >-efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd873000
> > > >+efi mapping PA 0x7f9e5000 -> VA 0xfffffffefd893000
> > > > efi mapping PA 0x7ffe0000 -> VA 0x7ffe0000
> > > >-efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd853000
> > > >+efi mapping PA 0x7ffe0000 -> VA 0xfffffffefd873000
> > > >
> > > >> 
> > > >> 
> > > >> Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
> > > >> >On Sun, Sep 22, 2013 at 08:35:15PM +0800, Dave Young wrote:
> > > >> >> I tested your new patch, it works both with efi stub and grub boot
> > > >in
> > > >> >> 1st kernel.
> > > >> >
> > > >> >Good, thanks!
> > > >> >
> > > >> >> But it paniced in kexec boot with my kexec related patcheset, the
> > > >> >patchset
> > > >> >
> > > >> >That's the second kernel, right?
> > > >> >
> > > >> >> contains 3 patch:
> > > >> >> 1. introduce cmdline kexecboot=<0|1|2>; 1 == kexec, 2 == kdump
> > > >> >> 2. export physical addr fw_vendor, runtime, tables to
> > > >> >/sys/firmware/efi/systab
> > > >> >> 3. if kexecboot != 0, use fw_vendor, runtime, tables from
> > > >bootparams;
> > > >> >Also do not
> > > >> >>    call SetVirtualAddressMao in case kexecboot.
> > > >> >> 
> > > >> >> The panic happens at the last line of efi_init:
> > > >> >>         /* clean DUMMY object */
> > > >> >>         efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> > > >> >>                          EFI_VARIABLE_NON_VOLATILE |
> > > >> >>                          EFI_VARIABLE_BOOTSERVICE_ACCESS |
> > > >> >>                          EFI_VARIABLE_RUNTIME_ACCESS,
> > > >> >>                          0, NULL);
> > > >> >> 
> > > >> >> Below is the dmesg:
> > > >> >> [    0.003359] pid_max: default: 32768 minimum: 301
> > > >> >> [    0.004792] BUG: unable to handle kernel paging request at
> > > >> >fffffffefde97e70
> > > >> >> [    0.006666] IP: [<ffffffff8103a1db>]
> > > >> >virt_efi_set_variable+0x40/0x54
> > > >> >> [    0.006666] PGD 36981067 PUD 35828063 PMD 0
> > > >> >
> > > >> >Here it is - fffffffefde97e70 is not mapped in the pagetable, PMD is
> > > >0.
> > > >> >
> > > >> >Ok, can you upload your patches somewhere and tell me exactly how to
> > > >> >reproduce this so that I can take a look too?
> > > >> >
> > > >> >Thanks.
> > > >> 
> > > >> -- 
> > > >> Sent from my mobile phone.  Please pardon brevity and lack of
> > > >formatting.
> > > 
> > > -- 
> > > Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
  2013-09-24  5:23                                     ` Dave Young
@ 2013-09-24  8:57                                       ` Dave Young
  0 siblings, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-24  8:57 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal, linux-efi

On 09/24/13 at 01:23pm, Dave Young wrote:
> On 09/24/13 at 12:58pm, Dave Young wrote:
> > On 09/24/13 at 12:57pm, Dave Young wrote:
> > > On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
> > > > Okay... I see two problems.
> > > > 
> > > > 1. It looks like we subtract the region size after, rather than before, assigning an address.
> 
> Could you explain more about this problem? Where is the code?
> 
> > > > 
> > > > 2. The second region is assigned the same address in the secondary kernel as in the first, implying the size of the first region was somehow set to zero.
> > > 
> > > I find the reason, efi_reserve_boot_services will reserve the BOOT_SERVICE_DATA region
> > > thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping addr after
> > > the md will be not same as 1st kernel, see below code:
> > >  
> > > void __init efi_map_region(efi_memory_desc_t *md)
> > > {
> > >         unsigned long size = md->num_pages << PAGE_SHIFT;
> > > 
> > >         efi_va -= size;
> > >         ^^^^^^^^^^^^^^^
> > > 	[snip]
> > > }
> > 
> > So how about just reserve BOOT_SERVICE_DATA region but keep the md.numpages as is?
> 
> Hmm, num_pages = 0 is only set when boot service region reservation is imporsible, I'm
> lost.. But there must be somewhere set the size to 0.
> 

digging more about it, it is indeed below code move the num_pages to 0:
void __init efi_reserve_boot_services(void)
{
 [snip]
                if ((start+size >= __pa_symbol(_text)
                                && start <= __pa_symbol(_end)) ||
                        !e820_all_mapped(start, start+size, E820_RAM) ||
                        memblock_is_region_reserved(start, size)) {
                        /* Could not reserve, skip it */
                        md->num_pages = 0;
 [snip]
}

During my test, the first region overlaps with kernel _text <-> _end, thus cause this issue.

I wonder if md->num_pages must be set to 0 here. If so I think we have to save the original memmap
for kexec use. Any better idea?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                               ` <20130924045705.GB5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2013-09-24  4:58                                 ` Dave Young
@ 2013-09-24  9:43                                 ` Borislav Petkov
       [not found]                                   ` <bd9528a453bce9b52ad4be75dcc0f034.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
  1 sibling, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-24  9:43 UTC (permalink / raw)
  To: Dave Young
  Cc: H. Peter Anvin, Borislav Petkov, X86 ML, LKML, Borislav Petkov,
	Matt Fleming, Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

Crap,

I need to send from the web interface since the network here doesn't
somehow let through port 587.

On Tue, September 24, 2013 6:57 am, Dave Young wrote:
> On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
>> Okay... I see two problems.
>>
>> 1. It looks like we subtract the region size after, rather than before,
>> assigning an address.
>>
>> 2. The second region is assigned the same address in the secondary
>> kernel as in the first, implying the size of the first region was
>> somehow set to zero.
>
> I find the reason, efi_reserve_boot_services will reserve the
> BOOT_SERVICE_DATA region
> thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping
> addr after
> the md will be not same as 1st kernel, see below code:
>
> void __init efi_map_region(efi_memory_desc_t *md)
> {
>         unsigned long size = md->num_pages << PAGE_SHIFT;
>
>         efi_va -= size;
>         ^^^^^^^^^^^^^^^

Anyway, yes, this is wrong. We probably want to something like the
following, instead (patch might be whitespace-damaged):

--
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index a235dc95d629..ea0ea4fd3dab 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -85,8 +85,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
 {
 	unsigned long size = md->num_pages << PAGE_SHIFT;

-	efi_va -= size;
-	if (efi_va < EFI_VA_END) {
+	if (efi_va - size < EFI_VA_END) {
 		pr_warning(FW_WARN "VA address range overflow!\n");
 		return;
 	}
@@ -101,6 +100,8 @@ void __init efi_map_region(efi_memory_desc_t *md)
 	/* Do the VA map */
 	__map_region(md, efi_va);
 	md->virt_addr = efi_va;
+
+	efi_va -= size;
 }

 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long
size,

-- 
Regards/Gruss,
Boris.

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                   ` <bd9528a453bce9b52ad4be75dcc0f034.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
@ 2013-09-24 10:01                                     ` Dave Young
  2013-09-24 12:45                                     ` Dave Young
  1 sibling, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-24 10:01 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 11:43am, Borislav Petkov wrote:
> Crap,
> 
> I need to send from the web interface since the network here doesn't
> somehow let through port 587.
> 
> On Tue, September 24, 2013 6:57 am, Dave Young wrote:
> > On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
> >> Okay... I see two problems.
> >>
> >> 1. It looks like we subtract the region size after, rather than before,
> >> assigning an address.
> >>
> >> 2. The second region is assigned the same address in the secondary
> >> kernel as in the first, implying the size of the first region was
> >> somehow set to zero.
> >
> > I find the reason, efi_reserve_boot_services will reserve the
> > BOOT_SERVICE_DATA region
> > thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping
> > addr after
> > the md will be not same as 1st kernel, see below code:
> >
> > void __init efi_map_region(efi_memory_desc_t *md)
> > {
> >         unsigned long size = md->num_pages << PAGE_SHIFT;
> >
> >         efi_va -= size;
> >         ^^^^^^^^^^^^^^^
> 
> Anyway, yes, this is wrong. We probably want to something like the
> following, instead (patch might be whitespace-damaged):
> 
> --
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index a235dc95d629..ea0ea4fd3dab 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -85,8 +85,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
>  {
>  	unsigned long size = md->num_pages << PAGE_SHIFT;
> 
> -	efi_va -= size;
> -	if (efi_va < EFI_VA_END) {
> +	if (efi_va - size < EFI_VA_END) {
>  		pr_warning(FW_WARN "VA address range overflow!\n");
>  		return;
>  	}
> @@ -101,6 +100,8 @@ void __init efi_map_region(efi_memory_desc_t *md)
>  	/* Do the VA map */
>  	__map_region(md, efi_va);
>  	md->virt_addr = efi_va;
> +
> +	efi_va -= size;
>  }
> 
>  void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long
> size,
> 

Ok, I got it, it it what what Peter mentioned problem 1.

--
Thanks
Dave

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                   ` <bd9528a453bce9b52ad4be75dcc0f034.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
  2013-09-24 10:01                                     ` Dave Young
@ 2013-09-24 12:45                                     ` Dave Young
  1 sibling, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-24 12:45 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 11:43am, Borislav Petkov wrote:
> Crap,
> 
> I need to send from the web interface since the network here doesn't
> somehow let through port 587.
> 
> On Tue, September 24, 2013 6:57 am, Dave Young wrote:
> > On 09/23/13 at 08:06pm, H. Peter Anvin wrote:
> >> Okay... I see two problems.
> >>
> >> 1. It looks like we subtract the region size after, rather than before,
> >> assigning an address.
> >>
> >> 2. The second region is assigned the same address in the secondary
> >> kernel as in the first, implying the size of the first region was
> >> somehow set to zero.
> >
> > I find the reason, efi_reserve_boot_services will reserve the
> > BOOT_SERVICE_DATA region
> > thus the memmap size is changed to 0, so in 2nd kernel the virtual mapping
> > addr after
> > the md will be not same as 1st kernel, see below code:
> >
> > void __init efi_map_region(efi_memory_desc_t *md)
> > {
> >         unsigned long size = md->num_pages << PAGE_SHIFT;
> >
> >         efi_va -= size;
> >         ^^^^^^^^^^^^^^^
> 
> Anyway, yes, this is wrong. We probably want to something like the
> following, instead (patch might be whitespace-damaged):
> 
> --
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index a235dc95d629..ea0ea4fd3dab 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -85,8 +85,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
>  {
>  	unsigned long size = md->num_pages << PAGE_SHIFT;
> 
> -	efi_va -= size;
> -	if (efi_va < EFI_VA_END) {
> +	if (efi_va - size < EFI_VA_END) {
>  		pr_warning(FW_WARN "VA address range overflow!\n");
>  		return;
>  	}
> @@ -101,6 +100,8 @@ void __init efi_map_region(efi_memory_desc_t *md)
>  	/* Do the VA map */
>  	__map_region(md, efi_va);
>  	md->virt_addr = efi_va;
> +
> +	efi_va -= size;
>  }
> 
>  void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long
> size,
> 

Think again about this, how about 1:1 map them from a base address like -64G
phy_addr  ->  (-64G + phy_addr), in this way we can avoid depending on the
previous region size.

For the zero region problem, we can resolve it as a standalone problem.

--
Thanks
Dave

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
@ 2013-09-24 14:56 Borislav Petkov
       [not found] ` <e39e1065bddc2afee2cb6eed957e3ba8.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-24 14:56 UTC (permalink / raw)
  To: Dave Young
  Cc: Borislav Petkov, H. Peter Anvin, X86 ML, LKML, Borislav Petkov,
	Matt Fleming, Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Tue, September 24, 2013 2:45 pm, Dave Young wrote:
> Think again about this, how about 1:1 map them from a base address
> like -64G phy_addr -> (-64G + phy_addr), in this way we can avoid
> depending on the previous region size.

Right, how we layout the regions is arbitrary as long as we start at
the same VA and use the same regions, in the same order and of the same
size...

> For the zero region problem, we can resolve it as a standalone
> problem.

... however, we still need to understand why it fails mapping the boot
services region as some implementations apparently do call boot services
even after ExitBootServices(). IOW, we need that region mapped in the
kexec'ed kernel too.

Thanks.

-- 
Regards/Gruss,
Boris.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found] ` <e39e1065bddc2afee2cb6eed957e3ba8.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
@ 2013-09-25  0:12   ` H. Peter Anvin
       [not found]     ` <52422A6A.8080305-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
  2013-09-25  2:31   ` Dave Young
  1 sibling, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-09-25  0:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/2013 07:56 AM, Borislav Petkov wrote:
> On Tue, September 24, 2013 2:45 pm, Dave Young wrote:
>> Think again about this, how about 1:1 map them from a base address
>> like -64G phy_addr -> (-64G + phy_addr), in this way we can avoid
>> depending on the previous region size.
> 
> Right, how we layout the regions is arbitrary as long as we start at
> the same VA and use the same regions, in the same order and of the same
> size...
> 
>> For the zero region problem, we can resolve it as a standalone
>> problem.
> 
> ... however, we still need to understand why it fails mapping the boot
> services region as some implementations apparently do call boot services
> even after ExitBootServices(). IOW, we need that region mapped in the
> kexec'ed kernel too.
> 

I am starting to think that we really should explicitly pass along the
EFI mappings to the secondary kernel.  This will also help if we have to
change the algorithm in a future kernel.

The most logical way to do this is to define a new setup_data type and
pass the entire set of physical-to-virtual mappings that way.

For example:

struct efi_mapping {
	u64 va;			/* Virtual start address */
	u64 pa;			/* Physical start address */
	u64 len;		/* Length in bytes */
	u64 type;		/* Mapping type */
	u64 reserved[3];	/* Reserved, must be zero */
};

Adding some reserved fields seems like a prudent precaution; the map
shouldn't be all that large anyway.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found] ` <e39e1065bddc2afee2cb6eed957e3ba8.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
  2013-09-25  0:12   ` H. Peter Anvin
@ 2013-09-25  2:31   ` Dave Young
  1 sibling, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-25  2:31 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 04:56pm, Borislav Petkov wrote:
> On Tue, September 24, 2013 2:45 pm, Dave Young wrote:
> > Think again about this, how about 1:1 map them from a base address
> > like -64G phy_addr -> (-64G + phy_addr), in this way we can avoid
> > depending on the previous region size.
> 
> Right, how we layout the regions is arbitrary as long as we start at
> the same VA and use the same regions, in the same order and of the same
> size...
> 
> > For the zero region problem, we can resolve it as a standalone
> > problem.
> 
> ... however, we still need to understand why it fails mapping the boot
> services region as some implementations apparently do call boot services
> even after ExitBootServices(). IOW, we need that region mapped in the
> kexec'ed kernel too.

In 1st kernel, the memmap is provided by firmware and unchanged before
we do the mapping, later efi_reserve_boot_services() try to reserve the
mem range, but failed due to conflict with other region (why?), then the
memmap item size is set to 0 (why?).

In 2nd kernel, we use same memmap from firmware, but the boot service
ranges size have been set to 0, thus efi mapping code will mapping runtime
regions to different va from 1st kernel.

--
Thanks
Dave

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]     ` <52422A6A.8080305-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
@ 2013-09-25  2:36       ` Dave Young
  2013-09-25  5:47       ` Borislav Petkov
  2013-09-26  3:12       ` Dave Young
  2 siblings, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-09-25  2:36 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 05:12pm, H. Peter Anvin wrote:
> On 09/24/2013 07:56 AM, Borislav Petkov wrote:
> > On Tue, September 24, 2013 2:45 pm, Dave Young wrote:
> >> Think again about this, how about 1:1 map them from a base address
> >> like -64G phy_addr -> (-64G + phy_addr), in this way we can avoid
> >> depending on the previous region size.
> > 
> > Right, how we layout the regions is arbitrary as long as we start at
> > the same VA and use the same regions, in the same order and of the same
> > size...
> > 
> >> For the zero region problem, we can resolve it as a standalone
> >> problem.
> > 
> > ... however, we still need to understand why it fails mapping the boot
> > services region as some implementations apparently do call boot services
> > even after ExitBootServices(). IOW, we need that region mapped in the
> > kexec'ed kernel too.
> > 
> 
> I am starting to think that we really should explicitly pass along the
> EFI mappings to the secondary kernel.  This will also help if we have to
> change the algorithm in a future kernel.

The 0 size mem region is still a problem even we pass the previous mapping
to 2nd kernel. So will be be enough to 1:1 map on high address and leave the
firmware provided memmap not touched, use a copy of memmap in kernel?

> 
> The most logical way to do this is to define a new setup_data type and
> pass the entire set of physical-to-virtual mappings that way.
> 
> For example:
> 
> struct efi_mapping {
> 	u64 va;			/* Virtual start address */
> 	u64 pa;			/* Physical start address */
> 	u64 len;		/* Length in bytes */
> 	u64 type;		/* Mapping type */
> 	u64 reserved[3];	/* Reserved, must be zero */
> };
> 
> Adding some reserved fields seems like a prudent precaution; the map
> shouldn't be all that large anyway.
> 
> 	-hpa
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]     ` <52422A6A.8080305-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
  2013-09-25  2:36       ` Dave Young
@ 2013-09-25  5:47       ` Borislav Petkov
  2013-09-26  3:12       ` Dave Young
  2 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2013-09-25  5:47 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Tue, Sep 24, 2013 at 05:12:26PM -0700, H. Peter Anvin wrote:
> I am starting to think that we really should explicitly pass along the
> EFI mappings to the secondary kernel.  This will also help if we have to
> change the algorithm in a future kernel.

That would be the most flexible solution, sure.

> The most logical way to do this is to define a new setup_data type and
> pass the entire set of physical-to-virtual mappings that way.
> 
> For example:
> 
> struct efi_mapping {
> 	u64 va;			/* Virtual start address */
> 	u64 pa;			/* Physical start address */
> 	u64 len;		/* Length in bytes */
> 	u64 type;		/* Mapping type */
> 	u64 reserved[3];	/* Reserved, must be zero */
> };
> 
> Adding some reserved fields seems like a prudent precaution;

... and making checking they're zeroed out initially so that I can use
them in the future, if needed :)

> the map shouldn't be all that large anyway.

Yeah, let me look at it in more detail when I get back - it shouldn't be
that hard to do.

Thanks.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]       ` <20130921113929.GB1587-fF5Pk5pvG8Y@public.gmane.org>
  2013-09-22 12:35         ` Dave Young
  2013-09-23  8:45         ` Borislav Petkov
@ 2013-09-25  9:24         ` Borislav Petkov
  2 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2013-09-25  9:24 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, LKML, Borislav Petkov, Matt Fleming, Matthew Garrett,
	H. Peter Anvin, James Bottomley, Vivek Goyal, Dave Young,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Sat, September 21, 2013 1:39 pm, Borislav Petkov wrote:
> diff --git a/arch/x86/platform/efi/efi_32.c
> b/arch/x86/platform/efi/efi_32.c
> index 40e446941dd7..661663b08eaf 100644
> --- a/arch/x86/platform/efi/efi_32.c
> +++ b/arch/x86/platform/efi/efi_32.c
> @@ -37,9 +37,36 @@
>   * claim EFI runtime service handler exclusively and to duplicate a
> memory in
>   * low memory space say 0 - 3G.
>   */
> -
>  static unsigned long efi_rt_eflags;
>
> +void efi_sync_low_kernel_mappings(void) {}
> +
> +void __init efi_map_region(efi_memory_desc_t *md)
> +{
> +	u64 start_pfn, end_pfn, end;
> +	unsigned long size;
> +	void *va;
> +
> +	start_pfn = PFN_DOWN(md->phys_addr);
> +	size	  = md->num_pages << PAGE_SHIFT;
> +	end	  = md->phys_addr + size;
> +	end_pfn   = PFN_UP(end);
> +
> +	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
> +		va = __va(md->phys_addr);
> +
> +		if (!(md->attribute & EFI_MEMORY_WB))
> +			efi_memory_uc((u64)(unsigned long)va, size);
> +	} else
> +		va = efi_ioremap(md->phys_addr, size,
> +				 md->type, md->attribute);
> +
> +	md->virt_addr = (u64) (unsigned long) va;
> +	if (!va)
> +		pr_err("ioremap of 0x%llX failed!\n",
> +		       (unsigned long long)md->phys_addr);
> +}
> +

Another note-to-self, while I'm here: it is probably prudent to
be conservative here and keep the old runtime mapping method
in generic EFI code and behind a chicken bit, something like
"efi.use_old_runtime_mapping" or shorter so that people whose systems
break from the new mapping can fall back to the old, well-tested method.

Thanks.

-- 
Regards/Gruss,
Boris.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]     ` <52422A6A.8080305-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
  2013-09-25  2:36       ` Dave Young
  2013-09-25  5:47       ` Borislav Petkov
@ 2013-09-26  3:12       ` Dave Young
       [not found]         ` <20130926031242.GA4487-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
  2 siblings, 1 reply; 44+ messages in thread
From: Dave Young @ 2013-09-26  3:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 09/24/13 at 05:12pm, H. Peter Anvin wrote:
> On 09/24/2013 07:56 AM, Borislav Petkov wrote:
> > On Tue, September 24, 2013 2:45 pm, Dave Young wrote:
> >> Think again about this, how about 1:1 map them from a base address
> >> like -64G phy_addr -> (-64G + phy_addr), in this way we can avoid
> >> depending on the previous region size.
> > 
> > Right, how we layout the regions is arbitrary as long as we start at
> > the same VA and use the same regions, in the same order and of the same
> > size...
> > 
> >> For the zero region problem, we can resolve it as a standalone
> >> problem.
> > 
> > ... however, we still need to understand why it fails mapping the boot
> > services region as some implementations apparently do call boot services
> > even after ExitBootServices(). IOW, we need that region mapped in the
> > kexec'ed kernel too.
> > 
> 
> I am starting to think that we really should explicitly pass along the
> EFI mappings to the secondary kernel.  This will also help if we have to
> change the algorithm in a future kernel.
> 
> The most logical way to do this is to define a new setup_data type and
> pass the entire set of physical-to-virtual mappings that way.
> 
> For example:
> 
> struct efi_mapping {
> 	u64 va;			/* Virtual start address */
> 	u64 pa;			/* Physical start address */
> 	u64 len;		/* Length in bytes */
> 	u64 type;		/* Mapping type */
> 	u64 reserved[3];	/* Reserved, must be zero */
> };
> 
> Adding some reserved fields seems like a prudent precaution; the map
> shouldn't be all that large anyway.

Hmm, since len is saved in efi_mapping so the previous 0 size range would
not a problem.

If we choose this approach, can we save not only the efi_mapping, but also
the fields which will be converted to virt addr, like fw_vendor, runtime,
tables? During my test on a HP workstation, the config table item (SMBIOS) also is 
converted to virt addr though spec only mention fw_vendor/runtime/tables.

--
Thanks
Dave

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]         ` <20130926031242.GA4487-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
@ 2013-09-30 20:17           ` Borislav Petkov
       [not found]             ` <20130930201730.GD16383-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-30 20:17 UTC (permalink / raw)
  To: Dave Young
  Cc: H. Peter Anvin, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Thu, Sep 26, 2013 at 11:12:42AM +0800, Dave Young wrote:
> If we choose this approach, can we save not only the efi_mapping, but
> also the fields which will be converted to virt addr, like fw_vendor,
> runtime, tables? During my test on a HP workstation, the config table
> item (SMBIOS) also is converted to virt addr though spec only mention
> fw_vendor/runtime/tables.

Btw, I was about to ask: how do you pass boot_params to the kexec
kernel?

Because I'm looking into hpa's idea to pass an efi_mapping array of
regions with setup_data but how does this get passed to the kexec'ed
kernel? I see in your patches you have boot_params.saved_*** for the
needed info but you're not writing to them anywhere. Is that why you've
added them to the systab_show function so that userspace can parse it
and build the boot_params thing?

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]             ` <20130930201730.GD16383-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-09-30 20:35               ` Vivek Goyal
       [not found]                 ` <20130930203505.GA6116-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2013-10-08  9:18               ` Dave Young
  1 sibling, 1 reply; 44+ messages in thread
From: Vivek Goyal @ 2013-09-30 20:35 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Young, H. Peter Anvin, X86 ML, LKML, Borislav Petkov,
	Matt Fleming, Matthew Garrett, James Bottomley,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Mon, Sep 30, 2013 at 10:17:30PM +0200, Borislav Petkov wrote:
> On Thu, Sep 26, 2013 at 11:12:42AM +0800, Dave Young wrote:
> > If we choose this approach, can we save not only the efi_mapping, but
> > also the fields which will be converted to virt addr, like fw_vendor,
> > runtime, tables? During my test on a HP workstation, the config table
> > item (SMBIOS) also is converted to virt addr though spec only mention
> > fw_vendor/runtime/tables.
> 
> Btw, I was about to ask: how do you pass boot_params to the kexec
> kernel?

kexec-tools in user space prepares the boot_params and passes it to
second kernel.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                 ` <20130930203505.GA6116-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2013-09-30 20:41                   ` Borislav Petkov
  2013-09-30 20:46                     ` Vivek Goyal
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-30 20:41 UTC (permalink / raw)
  To: Vivek Goyal, H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Mon, Sep 30, 2013 at 04:35:05PM -0400, Vivek Goyal wrote:
> On Mon, Sep 30, 2013 at 10:17:30PM +0200, Borislav Petkov wrote:
> > On Thu, Sep 26, 2013 at 11:12:42AM +0800, Dave Young wrote:
> > > If we choose this approach, can we save not only the efi_mapping, but
> > > also the fields which will be converted to virt addr, like fw_vendor,
> > > runtime, tables? During my test on a HP workstation, the config table
> > > item (SMBIOS) also is converted to virt addr though spec only mention
> > > fw_vendor/runtime/tables.
> > 
> > Btw, I was about to ask: how do you pass boot_params to the kexec
> > kernel?
> 
> kexec-tools in user space prepares the boot_params and passes it to
> second kernel.

Ok, thanks.

hpa, so, for the struct efi_mapping thing to work, I will have to grep
dmesg of the first kernel to get an output like the one below, filter
the runtime regions so that I can build the struct efi_mapping array of
mem regions which I pass on to the second, kexec kernel with setup_data.

Unless you have a better idea, of course... :)

[    0.000000] efi: EFI v2.00 by American Megatrends
[    0.000000] efi:  ACPI 2.0=0xac812f98  SMBIOS=0xac573018 
[    0.000000] efi: mem00: type=3, attr=0xf, range=[0x0000000000000000-0x0000000000008000) (0MB)
[    0.000000] efi: mem01: type=7, attr=0xf, range=[0x0000000000008000-0x000000000004e000) (0MB)
[    0.000000] efi: mem02: type=4, attr=0xf, range=[0x000000000004e000-0x0000000000060000) (0MB)
[    0.000000] efi: mem03: type=3, attr=0xf, range=[0x0000000000060000-0x00000000000a0000) (0MB)
[    0.000000] efi: mem04: type=2, attr=0xf, range=[0x0000000000100000-0x000000000058b000) (4MB)
[    0.000000] efi: mem05: type=7, attr=0xf, range=[0x000000000058b000-0x0000000001000000) (10MB)
[    0.000000] efi: mem06: type=4, attr=0xf, range=[0x0000000001000000-0x0000000001020000) (0MB)
[    0.000000] efi: mem07: type=7, attr=0xf, range=[0x0000000001020000-0x00000000378fe000) (872MB)
[    0.000000] efi: mem08: type=2, attr=0xf, range=[0x00000000378fe000-0x0000000037c77000) (3MB)
[    0.000000] efi: mem09: type=7, attr=0xf, range=[0x0000000037c77000-0x000000007d6a8000) (1114MB)
[    0.000000] efi: mem10: type=2, attr=0xf, range=[0x000000007d6a8000-0x00000000a7420000) (669MB)
[    0.000000] efi: mem11: type=1, attr=0xf, range=[0x00000000a7420000-0x00000000a743d000) (0MB)
[    0.000000] efi: mem12: type=4, attr=0xf, range=[0x00000000a743d000-0x00000000a7495000) (0MB)
[    0.000000] efi: mem13: type=7, attr=0xf, range=[0x00000000a7495000-0x00000000a74be000) (0MB)
[    0.000000] efi: mem14: type=4, attr=0xf, range=[0x00000000a74be000-0x00000000a74ef000) (0MB)
[    0.000000] efi: mem15: type=7, attr=0xf, range=[0x00000000a74ef000-0x00000000a74f2000) (0MB)
[    0.000000] efi: mem16: type=4, attr=0xf, range=[0x00000000a74f2000-0x00000000a79db000) (4MB)
[    0.000000] efi: mem17: type=6, attr=0x800000000000000f, range=[0x00000000a79db000-0x00000000a99dc000) (32MB)
[    0.000000] efi: mem18: type=4, attr=0xf, range=[0x00000000a99dc000-0x00000000a9ca1000) (2MB)
[    0.000000] efi: mem19: type=7, attr=0xf, range=[0x00000000a9ca1000-0x00000000a9ca3000) (0MB)
[    0.000000] efi: mem20: type=4, attr=0xf, range=[0x00000000a9ca3000-0x00000000abf12000) (34MB)
[    0.000000] efi: mem21: type=7, attr=0xf, range=[0x00000000abf12000-0x00000000ac0a6000) (1MB)
[    0.000000] efi: mem22: type=3, attr=0xf, range=[0x00000000ac0a6000-0x00000000ac512000) (4MB)
[    0.000000] efi: mem23: type=7, attr=0xf, range=[0x00000000ac512000-0x00000000ac529000) (0MB)
[    0.000000] efi: mem24: type=5, attr=0x800000000000000f, range=[0x00000000ac529000-0x00000000ac54c000) (0MB)
[    0.000000] efi: mem25: type=7, attr=0xf, range=[0x00000000ac54c000-0x00000000ac566000) (0MB)
[    0.000000] efi: mem26: type=6, attr=0x800000000000000f, range=[0x00000000ac566000-0x00000000ac569000) (0MB)
[    0.000000] efi: mem27: type=7, attr=0xf, range=[0x00000000ac569000-0x00000000ac56a000) (0MB)
[    0.000000] efi: mem28: type=6, attr=0x800000000000000f, range=[0x00000000ac56a000-0x00000000ac576000) (0MB)
[    0.000000] efi: mem29: type=7, attr=0xf, range=[0x00000000ac576000-0x00000000ac596000) (0MB)
[    0.000000] efi: mem30: type=0, attr=0xf, range=[0x00000000ac596000-0x00000000ac5fb000) (0MB)
[    0.000000] efi: mem31: type=7, attr=0xf, range=[0x00000000ac5fb000-0x00000000ac6b7000) (0MB)
[    0.000000] efi: mem32: type=10, attr=0xf, range=[0x00000000ac6b7000-0x00000000ac7e3000) (1MB)
[    0.000000] efi: mem33: type=7, attr=0xf, range=[0x00000000ac7e3000-0x00000000ac7e4000) (0MB)
[    0.000000] efi: mem34: type=10, attr=0xf, range=[0x00000000ac7e4000-0x00000000ac7fb000) (0MB)
[    0.000000] efi: mem35: type=7, attr=0xf, range=[0x00000000ac7fb000-0x00000000ac80a000) (0MB)
[    0.000000] efi: mem36: type=2, attr=0xf, range=[0x00000000ac80a000-0x00000000ac810000) (0MB)
[    0.000000] efi: mem37: type=9, attr=0xf, range=[0x00000000ac810000-0x00000000ac813000) (0MB)
[    0.000000] efi: mem38: type=4, attr=0xf, range=[0x00000000ac813000-0x00000000ac81a000) (0MB)
[    0.000000] efi: mem39: type=3, attr=0xf, range=[0x00000000ac81a000-0x00000000ad7ef000) (15MB)
[    0.000000] efi: mem40: type=4, attr=0xf, range=[0x00000000ad7ef000-0x00000000ad7f6000) (0MB)
[    0.000000] efi: mem41: type=3, attr=0xf, range=[0x00000000ad7f6000-0x00000000ad800000) (0MB)
[    0.000000] efi: mem42: type=7, attr=0xf, range=[0x0000000100000000-0x0000000450000000) (13568MB)
[    0.000000] efi: mem43: type=11, attr=0x8000000000000001, range=[0x00000000b0000000-0x00000000b4000000) (64MB)
[    0.000000] efi: mem44: type=11, attr=0x8000000000000001, range=[0x00000000fed20000-0x00000000fed40000) (0MB)
[    0.000000] efi: mem45: type=11, attr=0x8000000000000001, range=[0x00000000fed50000-0x00000000fed90000) (0MB)
[    0.000000] efi: mem46: type=11, attr=0x8000000000000000, range=[0x00000000ffa00000-0x00000000ffa40000) (0MB)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
  2013-09-30 20:41                   ` Borislav Petkov
@ 2013-09-30 20:46                     ` Vivek Goyal
       [not found]                       ` <20130930204642.GB6116-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Vivek Goyal @ 2013-09-30 20:46 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, Dave Young, X86 ML, LKML, Borislav Petkov,
	Matt Fleming, Matthew Garrett, James Bottomley, linux-efi

On Mon, Sep 30, 2013 at 10:41:54PM +0200, Borislav Petkov wrote:

[..]
> hpa, so, for the struct efi_mapping thing to work, I will have to grep
> dmesg of the first kernel to get an output like the one below, filter
> the runtime regions so that I can build the struct efi_mapping array of
> mem regions which I pass on to the second, kexec kernel with setup_data.

Any data you need to pass to second kernel will have to be exported to
user space either using /sys or /proc so that kexec-tools can parse it.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                       ` <20130930204642.GB6116-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2013-09-30 21:06                         ` Borislav Petkov
       [not found]                           ` <20130930210646.GF19411-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-09-30 21:06 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: H. Peter Anvin, Dave Young, X86 ML, LKML, Borislav Petkov,
	Matt Fleming, Matthew Garrett, James Bottomley,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Mon, Sep 30, 2013 at 04:46:42PM -0400, Vivek Goyal wrote:
> On Mon, Sep 30, 2013 at 10:41:54PM +0200, Borislav Petkov wrote:
> 
> [..]
> > hpa, so, for the struct efi_mapping thing to work, I will have to grep
> > dmesg of the first kernel to get an output like the one below, filter
> > the runtime regions so that I can build the struct efi_mapping array of
> > mem regions which I pass on to the second, kexec kernel with setup_data.
> 
> Any data you need to pass to second kernel will have to be exported to
> user space either using /sys or /proc so that kexec-tools can parse it.

And I cannot parse dmesg? Because I have all the info there already.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                           ` <20130930210646.GF19411-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-09-30 21:09                             ` Vivek Goyal
  0 siblings, 0 replies; 44+ messages in thread
From: Vivek Goyal @ 2013-09-30 21:09 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, Dave Young, X86 ML, LKML, Borislav Petkov,
	Matt Fleming, Matthew Garrett, James Bottomley,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Mon, Sep 30, 2013 at 11:06:46PM +0200, Borislav Petkov wrote:
> On Mon, Sep 30, 2013 at 04:46:42PM -0400, Vivek Goyal wrote:
> > On Mon, Sep 30, 2013 at 10:41:54PM +0200, Borislav Petkov wrote:
> > 
> > [..]
> > > hpa, so, for the struct efi_mapping thing to work, I will have to grep
> > > dmesg of the first kernel to get an output like the one below, filter
> > > the runtime regions so that I can build the struct efi_mapping array of
> > > mem regions which I pass on to the second, kexec kernel with setup_data.
> > 
> > Any data you need to pass to second kernel will have to be exported to
> > user space either using /sys or /proc so that kexec-tools can parse it.
> 
> And I cannot parse dmesg? Because I have all the info there already.

dmesg can 't relied on. It is just a kernel buffer which root can
clear. (dmesg --clear). So required info has to be exported to user
space in a suitable form.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                           ` <2d27a1bc-eabf-4d45-8303-27ae58511b11-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
  2013-09-24  4:57                             ` Dave Young
@ 2013-10-02 10:04                             ` Borislav Petkov
       [not found]                               ` <20131002100426.GB20568-fF5Pk5pvG8Y@public.gmane.org>
  1 sibling, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-10-02 10:04 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Mon, Sep 23, 2013 at 08:06:38PM -0700, H. Peter Anvin wrote:
> Okay... I see two problems.
> 
> 1. It looks like we subtract the region size after, rather than before, assigning an address.
> 

Ok, so I'm looking at this agan and, actually, we really do subtract the
region size *before* we assign the address:

---
       efi_va -= size;
       if (efi_va < EFI_VA_END) {
               pr_warning(FW_WARN "VA address range overflow!\n");
               return;
       }

       /*
        * Make sure the 1:1 mappings are present as a catch-all for b0rked
        * firmware which doesn't update all internal pointers after switching
        * to virtual mode and would otherwise crap on us.
        */
       __map_region(md, md->phys_addr);

       /* Do the VA map */
       __map_region(md, efi_va);
       md->virt_addr = efi_va;
--

So let me give an example why I think it is correct to subtract *before*
assigning and so that we can talk about it and we completely agree on
the details. :-)

When we start allocating from -4G, i.e. 0xffffffff00000000, I think we
want to do it bottom-up so that 0xffffffff00000000 is the *last*, i.e.
lowest address. Because we link the kernel text at 0xffffffff81000000 by
default, which would mean, if -4G was the first address, we'll have only
2G:

0xffffffff81000000 - 0xffffffff00000000 = 0x0000000081000000 = 2.164.260.864 bytes

of space for UEFI mappings.

That's why, I need to *first* subtract and *then* use the resulting
address to map the region to. Like so (4 hypothetical regions):

1st region: 0xfffffffeffffe000 - 0xffffffff00000000

2nd region: 0xfffffffeffff8000 - 0xfffffffeffffe000

3rd region: 0xfffffffefffec000 - 0xfffffffeffff8000

4th region: 0xfffffffefffd8000 - 0xfffffffefffec000

and so on...

IOW, the VA layout looks like this:

0xfffffffefffd8000
...
region 4
...
0xfffffffefffec000 (non including)
...
region 3
...
0xfffffffeffff8000 (ditto)
...
region 2
...
0xfffffffeffffe000 (ditto)
...
region 1
...
0xffffffff00000000 (ditto)

Am I even making sense here?

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                               ` <20131002100426.GB20568-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-10-02 15:43                                 ` H. Peter Anvin
       [not found]                                   ` <524C3F38.6050507-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-10-02 15:43 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 10/02/2013 03:04 AM, Borislav Petkov wrote:
> When we start allocating from -4G, i.e. 0xffffffff00000000, I think we
> want to do it bottom-up so that 0xffffffff00000000 is the *last*, i.e.
> lowest address. Because we link the kernel text at 0xffffffff81000000 by
> default, which would mean, if -4G was the first address, we'll have only
> 2G:

Right.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                   ` <524C3F38.6050507-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
@ 2013-10-02 17:05                                     ` Borislav Petkov
       [not found]                                       ` <20131002170522.GA20647-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-10-02 17:05 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Wed, Oct 02, 2013 at 08:43:52AM -0700, H. Peter Anvin wrote:
> On 10/02/2013 03:04 AM, Borislav Petkov wrote:
> > When we start allocating from -4G, i.e. 0xffffffff00000000, I think we
> > want to do it bottom-up so that 0xffffffff00000000 is the *last*, i.e.
> > lowest address. Because we link the kernel text at 0xffffffff81000000 by
> > default, which would mean, if -4G was the first address, we'll have only
> > 2G:
> 
> Right.

Btw, Matt just found another issue with the bottom-up approach - due to
different alignment of VA and PA addresses, this messes up the pagetable
in terms of the order in which we're using 4K, 2M, etc pages.

What can happen is that, you can get a non-2M aligned PA mapped with
2M-aligned VA which results in a #PF with PF_RSVD set, which most likely
happens because one or more of the bits in the [12:20] slice of the PMD
are reserved but they get set due to the PA having address bits set in
the aforementioned slice and thus a #PF is raised.

So we changed the mapping method to a more straight-forward one: we map
all EFI regions in the following range:

[ efi_va - -4G ]

and we compute efi_va by subtracting the highest EFI region address from
-4G, i.e. 0xffff_ffff_0000_0000.

Then, each VA is computed by doing efi_va + PA.

Basically, we have a non-contiguous window in the virtual address space
with the highest address of it being -4G. In OVMF, f.e., we get the
following mappings:

VA: 0xfffffffe80800000..0xfffffffe81000000 -> PA: 0x800000..0x1000000
VA: 0xfffffffefc000000..0xfffffffefc020000 -> PA: 0x7c000000..0x7c020000
VA: 0xfffffffefdc5b000..0xfffffffefe146000 -> PA: 0x7dc5b000..0x7e146000

...

VA: 0xfffffffeffa65000..0xfffffffefffe0000 -> PA: 0x7fa65000..0x7ffe0000
VA: 0xfffffffefffe0000..0xffffffff00000000 -> PA: 0x7ffe0000..0x80000000

So, basically, the EFI regions occupy a 2Gish window with holes in the
range:

[ 0xfffffffe80800000 - 0xffffffff00000000 )

and since we said, we want to give the whole EFI memmap 64G max, that
should be ok.

Oh, and the alignment remains compatible this way.

So this mapping scheme - courtesy of Matt - is very straight-forward
and simple and I like simple. This way we won't need the setup_data
games with kexec tools as we'll be simply doing the same mappings in the
kexec'ed kernel.

Anyway, I'll clean up the patch and send it out later.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                       ` <20131002170522.GA20647-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-10-02 17:32                                         ` H. Peter Anvin
       [not found]                                           ` <524C58A3.4090704-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-10-02 17:32 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 10/02/2013 10:05 AM, Borislav Petkov wrote:
> 
> Btw, Matt just found another issue with the bottom-up approach - due to
> different alignment of VA and PA addresses, this messes up the pagetable
> in terms of the order in which we're using 4K, 2M, etc pages.
> 
> What can happen is that, you can get a non-2M aligned PA mapped with
> 2M-aligned VA which results in a #PF with PF_RSVD set, which most likely
> happens because one or more of the bits in the [12:20] slice of the PMD
> are reserved but they get set due to the PA having address bits set in
> the aforementioned slice and thus a #PF is raised.
> 

So this is a bug in the sense that 2M pages were used when they were not
safe to use (matching alignment is part of the requirement for 2M pages
being allowable.)  However, we of course want to use 2M pages, so see below.

> So we changed the mapping method to a more straight-forward one: we map
> all EFI regions in the following range:
> 
> [ efi_va - -4G ]
> 
> and we compute efi_va by subtracting the highest EFI region address from
> -4G, i.e. 0xffff_ffff_0000_0000.
> 
> Then, each VA is computed by doing efi_va + PA.
> 
> Oh, and the alignment remains compatible this way.
> 
> So this mapping scheme - courtesy of Matt - is very straight-forward
> and simple and I like simple. This way we won't need the setup_data
> games with kexec tools as we'll be simply doing the same mappings in the
> kexec'ed kernel.
> 
> Anyway, I'll clean up the patch and send it out later.
> 

We could achieve the same thing by doing alignment after subtracting the
pointer.  HOWEVER, it also goes to show that any mapping scheme is
inherently fragile (consider if the mapping scheme above ends up
consuming too much virtual space in the future), and as a result I
really think that explicitly passing the map to the kexec kernel really
is the only sane thing to do, as otherwise we have to maintain the same
algorithm forever.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                           ` <524C58A3.4090704-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
@ 2013-10-02 18:42                                             ` Borislav Petkov
       [not found]                                               ` <20131002184229.GE20568-fF5Pk5pvG8Y@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-10-02 18:42 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Wed, Oct 02, 2013 at 10:32:19AM -0700, H. Peter Anvin wrote:
> So this is a bug in the sense that 2M pages were used when they were
> not safe to use (matching alignment is part of the requirement for 2M
> pages being allowable.) However, we of course want to use 2M pages, so
> see below.

Yes, so the alignment has to be such that both PA and VA are the same
amount of 4K pages away from the next 2M boundary, to put it bluntly.

I have a couple of ideas on how to do that.

> We could achieve the same thing by doing alignment after subtracting the
> pointer.  HOWEVER, it also goes to show that any mapping scheme is
> inherently fragile (consider if the mapping scheme above ends up
> consuming too much virtual space in the future), and as a result I

Yes, I understand your sentiment - we want to be as conservative as
possible with the approach before it is cast in stone, for we don't know
what firmware turds are to be expected in the future.

> really think that explicitly passing the map to the kexec kernel
> really is the only sane thing to do, as otherwise we have to maintain
> the same algorithm forever.

Yes, we'll have to announce the mapping over sysfs of proc for the
kexec-tools to parse it, as I'm sure you've already heard. But this can
and will be done in the next step, right after we have a stable regions
mapping algorithm.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                               ` <20131002184229.GE20568-fF5Pk5pvG8Y@public.gmane.org>
@ 2013-10-02 18:46                                                 ` H. Peter Anvin
       [not found]                                                   ` <524C6A14.40905-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-10-02 18:46 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On 10/02/2013 11:42 AM, Borislav Petkov wrote:
> 
> Yes, so the alignment has to be such that both PA and VA are the same
> amount of 4K pages away from the next 2M boundary, to put it bluntly.
> 
> I have a couple of ideas on how to do that.
> 

It's pretty straightforward - just drop the starting address to proper
alignment after you subtract the size.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                                   ` <524C6A14.40905-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
@ 2013-10-04  9:42                                                     ` Borislav Petkov
       [not found]                                                       ` <20131004094247.GA6796-K5JNixvcfoxupOikMc4+xw@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2013-10-04  9:42 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Wed, Oct 02, 2013 at 11:46:44AM -0700, H. Peter Anvin wrote:
> It's pretty straightforward - just drop the starting address to proper
> alignment after you subtract the size.

Ok, just an observation - it is not necessarily a bad thing but I
thought we should talk about it:

So, when we do the VA space saving mapping, we're basically mapping huge
physical ranges onto a much smaller VA range and adding other mappings
in there pots-factum could turn out to be not straight-forward and
problematic.

To illustrate what I'm trying to say, here's an example from two regions
in OVMF:

[    0.011005] __map_region: VA: 0xfffffffeff800000..0xffffffff00000000 -> PA: 0x800000.. 0x1000000
[    0.017005] __map_region: VA: 0xfffffffeff600000..0xfffffffeff620000 -> PA: 0x7c000000.. 0x7c020000

Now, the physical address range spanned by those regions is:

0x7c020000 - 0x800000 = 0x7b820000 =~ 2G

while the virtual is

0xffffffff00000000 - 0xfffffffeff600000 = 0xa00000 =~ 10M

Now, we obviously cannot map the whole PA space in there, the question
is: do we care?

I mean, we can map it to other VA range but this will totally destroy
the simple math of computing EFI VA addresses with an offset, similar to
PAGE_OFFSET.

OTOH, if we keep Matt's suggestion of mapping the whole EFI address
space window, we don't have that issue. And we've reserved 64G for
EFI and if it needs more, we probably can give it since we're using a
different pagetable anyway.

Opinions?

Thanks.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                                       ` <20131004094247.GA6796-K5JNixvcfoxupOikMc4+xw@public.gmane.org>
@ 2013-10-04 14:43                                                         ` H. Peter Anvin
       [not found]                                                           ` <89e0284e-89b5-42ad-8120-128ef1bf0152-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2013-10-04 14:43 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Young, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

We can do that... but it is different from what Windows does to my understanding and it also has the potential of severe pathologies... e.g. a window at the top of the address space being mapped.

Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org> wrote:
>On Wed, Oct 02, 2013 at 11:46:44AM -0700, H. Peter Anvin wrote:
>> It's pretty straightforward - just drop the starting address to
>proper
>> alignment after you subtract the size.
>
>Ok, just an observation - it is not necessarily a bad thing but I
>thought we should talk about it:
>
>So, when we do the VA space saving mapping, we're basically mapping
>huge
>physical ranges onto a much smaller VA range and adding other mappings
>in there pots-factum could turn out to be not straight-forward and
>problematic.
>
>To illustrate what I'm trying to say, here's an example from two
>regions
>in OVMF:
>
>[    0.011005] __map_region: VA: 0xfffffffeff800000..0xffffffff00000000
>-> PA: 0x800000.. 0x1000000
>[    0.017005] __map_region: VA: 0xfffffffeff600000..0xfffffffeff620000
>-> PA: 0x7c000000.. 0x7c020000
>
>Now, the physical address range spanned by those regions is:
>
>0x7c020000 - 0x800000 = 0x7b820000 =~ 2G
>
>while the virtual is
>
>0xffffffff00000000 - 0xfffffffeff600000 = 0xa00000 =~ 10M
>
>Now, we obviously cannot map the whole PA space in there, the question
>is: do we care?
>
>I mean, we can map it to other VA range but this will totally destroy
>the simple math of computing EFI VA addresses with an offset, similar
>to
>PAGE_OFFSET.
>
>OTOH, if we keep Matt's suggestion of mapping the whole EFI address
>space window, we don't have that issue. And we've reserved 64G for
>EFI and if it needs more, we probably can give it since we're using a
>different pagetable anyway.
>
>Opinions?
>
>Thanks.

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]                                                           ` <89e0284e-89b5-42ad-8120-128ef1bf0152-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
@ 2013-10-04 14:50                                                             ` Borislav Petkov
  0 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2013-10-04 14:50 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Matt Fleming, Dave Young, X86 ML, LKML, Borislav Petkov,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

On Fri, Oct 04, 2013 at 07:43:37AM -0700, H. Peter Anvin wrote:
> We can do that... but it is different from what Windows does to my
> understanding and it also has the potential of severe pathologies...
> e.g. a window at the top of the address space being mapped.

Right, so after Matt and I talked about it a bit on IRC, we actually
don't really care how we do the mappings if we spell them out later to
kexec over proc or somewhere else, as you wanted.

So we can do the VA address space saving scheme first and change it
later, if there are issues. We'll see.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH -v2] EFI: Runtime services virtual mapping
       [not found]             ` <20130930201730.GD16383-fF5Pk5pvG8Y@public.gmane.org>
  2013-09-30 20:35               ` Vivek Goyal
@ 2013-10-08  9:18               ` Dave Young
  1 sibling, 0 replies; 44+ messages in thread
From: Dave Young @ 2013-10-08  9:18 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, X86 ML, LKML, Borislav Petkov, Matt Fleming,
	Matthew Garrett, James Bottomley, Vivek Goyal,
	linux-efi-u79uwXL29TY76Z2rM5mHXA

Sorry for reply late because I was in a long holiday...

On 09/30/13 at 10:17pm, Borislav Petkov wrote:
> On Thu, Sep 26, 2013 at 11:12:42AM +0800, Dave Young wrote:
> > If we choose this approach, can we save not only the efi_mapping, but
> > also the fields which will be converted to virt addr, like fw_vendor,
> > runtime, tables? During my test on a HP workstation, the config table
> > item (SMBIOS) also is converted to virt addr though spec only mention
> > fw_vendor/runtime/tables.
> 
> Btw, I was about to ask: how do you pass boot_params to the kexec
> kernel?

As Vivek mentioned, just kexec-tools will get the saved info from sysfs
or procfs, then it as a boot loader will prepare the linux boot setup
struct with these 1st efi values.

> 
> Because I'm looking into hpa's idea to pass an efi_mapping array of
> regions with setup_data but how does this get passed to the kexec'ed
> kernel? I see in your patches you have boot_params.saved_*** for the
> needed info but you're not writing to them anywhere. Is that why you've
> added them to the systab_show function so that userspace can parse it
> and build the boot_params thing?

Yes, I just printed them in systab_show, kexec-tools will parse it.

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2013-10-08  9:18 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-09-24 14:56 [PATCH -v2] EFI: Runtime services virtual mapping Borislav Petkov
     [not found] ` <e39e1065bddc2afee2cb6eed957e3ba8.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
2013-09-25  0:12   ` H. Peter Anvin
     [not found]     ` <52422A6A.8080305-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
2013-09-25  2:36       ` Dave Young
2013-09-25  5:47       ` Borislav Petkov
2013-09-26  3:12       ` Dave Young
     [not found]         ` <20130926031242.GA4487-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-30 20:17           ` Borislav Petkov
     [not found]             ` <20130930201730.GD16383-fF5Pk5pvG8Y@public.gmane.org>
2013-09-30 20:35               ` Vivek Goyal
     [not found]                 ` <20130930203505.GA6116-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2013-09-30 20:41                   ` Borislav Petkov
2013-09-30 20:46                     ` Vivek Goyal
     [not found]                       ` <20130930204642.GB6116-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2013-09-30 21:06                         ` Borislav Petkov
     [not found]                           ` <20130930210646.GF19411-fF5Pk5pvG8Y@public.gmane.org>
2013-09-30 21:09                             ` Vivek Goyal
2013-10-08  9:18               ` Dave Young
2013-09-25  2:31   ` Dave Young
  -- strict thread matches above, loose matches on Subject: below --
2013-09-19 14:54 [PATCH 00/11] EFI runtime " Borislav Petkov
2013-09-19 14:54 ` [PATCH 11/11] EFI: Runtime " Borislav Petkov
     [not found]   ` <1379602494-26684-12-git-send-email-bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org>
2013-09-21 11:39     ` [PATCH -v2] " Borislav Petkov
2013-09-23  5:47       ` Dave Young
     [not found]         ` <20130923054741.GC7007-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-23  6:29           ` Borislav Petkov
     [not found]             ` <20130923062947.GA2527-fF5Pk5pvG8Y@public.gmane.org>
2013-09-23  7:08               ` Dave Young
     [not found]       ` <20130921113929.GB1587-fF5Pk5pvG8Y@public.gmane.org>
2013-09-22 12:35         ` Dave Young
     [not found]           ` <20130922123515.GA7476-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-22 13:37             ` Borislav Petkov
     [not found]               ` <20130922133722.GC28718-fF5Pk5pvG8Y@public.gmane.org>
2013-09-22 14:00                 ` Dave Young
     [not found]                   ` <20130922140038.GB7476-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-22 14:31                     ` Dave Young
2013-09-22 15:27                 ` H. Peter Anvin
     [not found]                   ` <1ba7eca6-419c-4181-9927-9ba0927a6abf-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
2013-09-22 16:38                     ` Borislav Petkov
2013-09-23  5:45                     ` Dave Young
2013-09-24  2:52                     ` Dave Young
     [not found]                       ` <20130924025209.GA5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-24  3:06                         ` H. Peter Anvin
     [not found]                           ` <2d27a1bc-eabf-4d45-8303-27ae58511b11-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
2013-09-24  4:57                             ` Dave Young
     [not found]                               ` <20130924045705.GB5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-24  4:58                                 ` Dave Young
     [not found]                                   ` <20130924045818.GC5561-je1gSBvt1TcFLmT5oZ11vB/sF2h8X+2i0E9HWUfgJXw@public.gmane.org>
2013-09-24  5:23                                     ` Dave Young
2013-09-24  8:57                                       ` Dave Young
2013-09-24  9:43                                 ` Borislav Petkov
     [not found]                                   ` <bd9528a453bce9b52ad4be75dcc0f034.squirrel-Fq0NhQX/+7ild1e1puMNDg@public.gmane.org>
2013-09-24 10:01                                     ` Dave Young
2013-09-24 12:45                                     ` Dave Young
2013-10-02 10:04                             ` Borislav Petkov
     [not found]                               ` <20131002100426.GB20568-fF5Pk5pvG8Y@public.gmane.org>
2013-10-02 15:43                                 ` H. Peter Anvin
     [not found]                                   ` <524C3F38.6050507-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
2013-10-02 17:05                                     ` Borislav Petkov
     [not found]                                       ` <20131002170522.GA20647-fF5Pk5pvG8Y@public.gmane.org>
2013-10-02 17:32                                         ` H. Peter Anvin
     [not found]                                           ` <524C58A3.4090704-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
2013-10-02 18:42                                             ` Borislav Petkov
     [not found]                                               ` <20131002184229.GE20568-fF5Pk5pvG8Y@public.gmane.org>
2013-10-02 18:46                                                 ` H. Peter Anvin
     [not found]                                                   ` <524C6A14.40905-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
2013-10-04  9:42                                                     ` Borislav Petkov
     [not found]                                                       ` <20131004094247.GA6796-K5JNixvcfoxupOikMc4+xw@public.gmane.org>
2013-10-04 14:43                                                         ` H. Peter Anvin
     [not found]                                                           ` <89e0284e-89b5-42ad-8120-128ef1bf0152-2ueSQiBKiTY7tOexoI0I+QC/G2K4zDHf@public.gmane.org>
2013-10-04 14:50                                                             ` Borislav Petkov
2013-09-23  8:45         ` Borislav Petkov
2013-09-25  9:24         ` Borislav Petkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).