* Re: PV domU with 255GB boot failure
2009-02-18 19:12 ` Mukesh Rathor
@ 2009-02-19 7:49 ` Jan Beulich
0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2009-02-19 7:49 UTC (permalink / raw)
To: mukesh.rathor; +Cc: xen-devel@lists.xensource.com, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 1393 bytes --]
>>> Mukesh Rathor <mukesh.rathor@oracle.com> 18.02.09 20:12 >>>
>Jan Beulich wrote:
> > First a general remark: You're doing this patch to support 256G domains,
> > but by keeping extend_init_mapping() there'll continue to be no way to
> > support domains with close to or above 512G (or, if making use of
> > XEN_ELFNOTE_INIT_P2M, 1T). This function, rather than needing fixes,
> > really just needs to go away.
> >
> > I've done this in our forward ported 2.6.27+ kernels, but unfortunately
> > can't really contribute the changes to the 2.6.18 tree, as there are too
> > many differences, and I'm simply lacking the time (and, honestly, interest)
> > to work out all the issues. I could post the respective patch if you (or
> > someone else) care(s).
> >
>
>I came up with this patch trying to fix the hang on less than 256 GB.
>With 256 GB it's not even coming this far, pl see another thread. Since
>256 GB was the original bug, we definitely need to support that. So please
>post your patches with any relevant pointers and I'll take a crack at it...
Attached. Just to repeat - they are against a 2.6.27+ kernel that has
various other patches enabled, so I can't easily tell whether they have
dependencies on changes made elsewhere. The order we have them
applied here is:
xen-x86-bigmem
xen-x86_64-init-memmap.patch
xen-x86_64-note-init-p2m.patch
Jan
[-- Attachment #2: xen-x86-bigmem --]
[-- Type: application/octet-stream, Size: 6159 bytes --]
From: jbeulich@novell.com
Subject: fix issues with the assignment of huge amounts of memory
Patch-mainline: obsolete
At the same time remove the non-applicable and broken support for the
memmap= command line option.
Also fix the overlap of the modules area with the fixmaps on x86-64.
--- head-2009-02-02.orig/arch/x86/kernel/e820-xen.c 2008-12-01 12:07:15.000000000 +0100
+++ head-2009-02-02/arch/x86/kernel/e820-xen.c 2008-11-25 13:18:07.000000000 +0100
@@ -1235,6 +1235,26 @@ static int __init parse_memopt(char *p)
i = e820.nr_map - 1;
current_end = e820.map[i].addr + e820.map[i].size;
+
+ /*
+ * A little less than 2% of available memory are needed for page
+ * tables, p2m map, and mem_map. Hence the maximum amount of memory
+ * we can potentially balloon up to can in no case exceed about 50
+ * times of what we've been given initially. Since even with that we
+ * won't be able to boot (due to various calculations done based on
+ * the total number of pages) we further restrict this to factor 32.
+ */
+ if ((mem_size >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) {
+ u64 size = (u64)xen_start_info->nr_pages << 5;
+
+ printk(KERN_WARNING "mem=%Luk is invalid for an initial"
+ " allocation of %luk, using %Luk\n",
+ (unsigned long long)mem_size >> 10,
+ xen_start_info->nr_pages << (PAGE_SHIFT - 10),
+ (unsigned long long)size << (PAGE_SHIFT - 10));
+ mem_size = size << PAGE_SHIFT;
+ }
+
if (current_end < mem_size) {
/*
* The e820 map ends before our requested size so
@@ -1294,6 +1314,7 @@ static int __init parse_memmap_opt(char
return *p == '\0' ? 0 : -EINVAL;
}
early_param("memmap", parse_memmap_opt);
+#endif
void __init finish_e820_parsing(void)
{
@@ -1308,7 +1329,6 @@ void __init finish_e820_parsing(void)
e820_print_map("user");
}
}
-#endif
static inline const char *e820_type_to_string(int e820_type)
{
--- head-2009-02-02.orig/arch/x86/kernel/setup-xen.c 2008-11-17 13:58:02.000000000 +0100
+++ head-2009-02-02/arch/x86/kernel/setup-xen.c 2008-12-23 09:42:29.000000000 +0100
@@ -128,12 +128,7 @@ static struct notifier_block xen_panic_b
unsigned long *phys_to_machine_mapping;
EXPORT_SYMBOL(phys_to_machine_mapping);
-unsigned long *pfn_to_mfn_frame_list_list,
-#ifdef CONFIG_X86_64
- *pfn_to_mfn_frame_list[512];
-#else
- *pfn_to_mfn_frame_list[128];
-#endif
+unsigned long *pfn_to_mfn_frame_list_list, **pfn_to_mfn_frame_list;
/* Raw start-of-day parameters from the hypervisor. */
start_info_t *xen_start_info;
@@ -1053,17 +1048,17 @@ void __init setup_arch(char **cmdline_p)
p2m_pages = xen_start_info->nr_pages;
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- unsigned long i, j;
+ unsigned long i, j, size;
unsigned int k, fpp;
/* Make sure we have a large enough P->M table. */
phys_to_machine_mapping = alloc_bootmem_pages(
max_pfn * sizeof(unsigned long));
- memset(phys_to_machine_mapping, ~0,
- max_pfn * sizeof(unsigned long));
memcpy(phys_to_machine_mapping,
(unsigned long *)xen_start_info->mfn_list,
p2m_pages * sizeof(unsigned long));
+ memset(phys_to_machine_mapping + p2m_pages, ~0,
+ (max_pfn - p2m_pages) * sizeof(unsigned long));
free_bootmem(
__pa(xen_start_info->mfn_list),
PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
@@ -1073,15 +1068,26 @@ void __init setup_arch(char **cmdline_p)
* Initialise the list of the frames that specify the list of
* frames that make up the p2m table. Used by save/restore.
*/
- pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
-
fpp = PAGE_SIZE/sizeof(unsigned long);
+ size = (max_pfn + fpp - 1) / fpp;
+ size = (size + fpp - 1) / fpp;
+ ++size; /* include a zero terminator for crash tools */
+ size *= sizeof(unsigned long);
+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(size);
+ if (size > PAGE_SIZE
+ && xen_create_contiguous_region((unsigned long)
+ pfn_to_mfn_frame_list_list,
+ get_order(size), 0))
+ BUG();
+ size -= sizeof(unsigned long);
+ pfn_to_mfn_frame_list = alloc_bootmem(size);
+
for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
if (j == fpp)
j = 0;
if (j == 0) {
k++;
- BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
+ BUG_ON(k * sizeof(unsigned long) >= size);
pfn_to_mfn_frame_list[k] =
alloc_bootmem_pages(PAGE_SIZE);
pfn_to_mfn_frame_list_list[k] =
--- head-2009-02-02.orig/arch/x86/mm/init_64-xen.c 2009-02-02 11:42:26.000000000 +0100
+++ head-2009-02-02/arch/x86/mm/init_64-xen.c 2009-02-02 11:42:33.000000000 +0100
@@ -660,6 +660,13 @@ static void __init extend_init_mapping(u
while (va < (__START_KERNEL_map
+ (table_cur << PAGE_SHIFT)
+ tables_space)) {
+ if (!pmd_index(va) && !pte_index(va)) {
+ page = (unsigned long *)init_level4_pgt;
+ addr = page[pgd_index(va)];
+ addr_to_page(addr, page);
+ addr = page[pud_index(va)];
+ addr_to_page(addr, page);
+ }
pmd = (pmd_t *)&page[pmd_index(va)];
if (pmd_none(*pmd)) {
pte_page = alloc_static_page(&phys);
--- head-2009-02-02.orig/drivers/xen/core/machine_reboot.c 2008-12-15 11:32:52.000000000 +0100
+++ head-2009-02-02/drivers/xen/core/machine_reboot.c 2008-11-25 13:18:07.000000000 +0100
@@ -84,7 +84,7 @@ static void post_suspend(int suspend_can
unsigned long shinfo_mfn;
extern unsigned long max_pfn;
extern unsigned long *pfn_to_mfn_frame_list_list;
- extern unsigned long *pfn_to_mfn_frame_list[];
+ extern unsigned long **pfn_to_mfn_frame_list;
if (suspend_cancelled) {
xen_start_info->store_mfn =
--- head-2009-02-02.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-15 11:34:04.000000000 +0100
+++ head-2009-02-02/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-11-25 13:18:07.000000000 +0100
@@ -158,7 +158,7 @@ static inline void xen_set_pgd(pgd_t *pg
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-#define MAXMEM _AC(0x00003fffffffffff, UL)
+#define MAXMEM _AC(0x0000006fffffffff, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
[-- Attachment #3: xen-x86_64-note-init-p2m.patch --]
[-- Type: text/plain, Size: 15855 bytes --]
From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: obsolete
References: bnc#417417
Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.
Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() are
intentionally not including UVMF_ALL. This is intended to be an optimal
choice between the overhead of a potential spurious page fault (as
remote CPUs may still have read-only translations in their TLBs) and
the overhead of cross processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.
--- sle11-2009-02-05.orig/arch/x86/kernel/head64-xen.c 2009-01-16 10:31:56.000000000 +0100
+++ sle11-2009-02-05/arch/x86/kernel/head64-xen.c 2008-12-23 12:23:58.000000000 +0100
@@ -171,6 +171,14 @@ void __init x86_64_start_reservations(ch
+ (xen_start_info->nr_pt_frames << PAGE_SHIFT),
"Xen provided");
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ xen_start_info->mfn_list = ~0UL;
+ else if (xen_start_info->mfn_list < __START_KERNEL_map)
+ reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+ (xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+ "INITP2M");
+
/*
* At this point everything still needed from the boot loader
* or BIOS or kernel text should be early reserved or marked not
--- sle11-2009-02-05.orig/arch/x86/kernel/head_64-xen.S 2008-12-15 11:34:16.000000000 +0100
+++ sle11-2009-02-05/arch/x86/kernel/head_64-xen.S 2008-12-22 12:57:33.000000000 +0100
@@ -18,6 +18,7 @@
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/dwarf2.h>
@@ -135,6 +136,7 @@ ENTRY(empty_zero_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- sle11-2009-02-05.orig/arch/x86/kernel/setup-xen.c 2008-12-23 09:42:42.000000000 +0100
+++ sle11-2009-02-05/arch/x86/kernel/setup-xen.c 2008-12-23 15:58:43.000000000 +0100
@@ -1022,7 +1022,7 @@ void __init setup_arch(char **cmdline_p)
difference = xen_start_info->nr_pages - max_pfn;
set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+ phys_to_machine_mapping + max_pfn);
reservation.nr_extents = difference;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
&reservation);
@@ -1039,14 +1039,86 @@ void __init setup_arch(char **cmdline_p)
phys_to_machine_mapping = alloc_bootmem_pages(
max_pfn * sizeof(unsigned long));
memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
+ __va(__pa(xen_start_info->mfn_list)),
p2m_pages * sizeof(unsigned long));
memset(phys_to_machine_mapping + p2m_pages, ~0,
(max_pfn - p2m_pages) * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
+
+#ifdef CONFIG_X86_64
+ if (xen_start_info->mfn_list == VMEMMAP_START) {
+ /*
+ * Since it is well isolated we can (and since it is
+ * perhaps large we should) also free the page tables
+ * mapping the initial P->M table.
+ */
+ unsigned long va = VMEMMAP_START, pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+
+ BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+ xen_l4_entry_update(pgd, __pgd(0));
+ for(;;) {
+ pud_t *pud = pud_page + pud_index(va);
+
+ if (pud_none(*pud))
+ va += PUD_SIZE;
+ else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ make_pages_writable(__va(pa),
+ PUD_SIZE >> PAGE_SHIFT,
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd_t *pmd = pmd_offset(pud, va);
+
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ make_pages_writable(__va(pa),
+ PMD_SIZE >> PAGE_SHIFT,
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte_t *pte = pte_offset_kernel(pmd, va);
+
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ make_page_writable(__va(pa),
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PAGE_SIZE);
+ }
+ ClearPagePinned(virt_to_page(pte));
+ make_page_writable(pte,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa(pte), PAGE_SIZE);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ ClearPagePinned(virt_to_page(pmd));
+ make_page_writable(pmd,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa((unsigned long)pmd
+ & PAGE_MASK),
+ PAGE_SIZE);
+ }
+ if (!pud_index(va))
+ break;
+ }
+ ClearPagePinned(virt_to_page(pud_page));
+ make_page_writable(pud_page,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+ PAGE_SIZE);
+ } else if (!WARN_ON(xen_start_info->mfn_list
+ < __START_KERNEL_map))
+#endif
+ free_bootmem(__pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+
/*
* Initialise the list of the frames that specify the list of
--- sle11-2009-02-05.orig/arch/x86/mm/init_64-xen.c 2009-02-02 11:43:04.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/init_64-xen.c 2009-02-02 11:43:10.000000000 +0100
@@ -157,6 +157,17 @@ static unsigned long __meminitdata table
static unsigned long __meminitdata table_cur;
static unsigned long __meminitdata table_top;
+static __init unsigned long get_table_cur(void)
+{
+ BUG_ON(!table_cur);
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && table_cur == xen_start_info->first_p2m_pfn) {
+ table_cur += xen_start_info->nr_p2m_frames;
+ table_top += xen_start_info->nr_p2m_frames;
+ }
+ return table_cur++;
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -168,8 +179,7 @@ static __ref void *spp_getpage(void)
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else if (table_cur < table_top) {
- ptr = __va(table_cur << PAGE_SHIFT);
- table_cur++;
+ ptr = __va(get_table_cur() << PAGE_SHIFT);
memset(ptr, 0, PAGE_SIZE);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -334,8 +344,7 @@ static __ref void *alloc_low_page(unsign
return adr;
}
- BUG_ON(!table_cur);
- pfn = table_cur++;
+ pfn = get_table_cur();
if (pfn >= table_top)
panic("alloc_low_page: ran out of memory");
@@ -361,14 +370,29 @@ static inline int __meminit make_readonl
/* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& !max_pfn_mapped
- && (paddr >= (table_start << PAGE_SHIFT))
- && (paddr < (table_top << PAGE_SHIFT)))
- readonly = 1;
+ && (paddr >= (table_start << PAGE_SHIFT))) {
+ unsigned long top = table_top;
+
+ /* Account for the range get_table_cur() skips. */
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && table_cur <= xen_start_info->first_p2m_pfn
+ && top > xen_start_info->first_p2m_pfn)
+ top += xen_start_info->nr_p2m_frames;
+ if (paddr < (top << PAGE_SHIFT))
+ readonly = 1;
+ }
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
&& (paddr < (table_cur << PAGE_SHIFT)))
readonly = 1;
+ /* Make P->M table (and its page tables) read-only. */
+ if (!xen_feature(XENFEAT_writable_page_tables)
+ && xen_start_info->mfn_list < __START_KERNEL_map
+ && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+ && paddr < (xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+ readonly = 1;
/*
* No need for writable mapping of kernel image. This also ensures that
@@ -616,6 +640,12 @@ void __init xen_init_pt(void)
__pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
memcpy(level2_kernel_pgt, page, PAGE_SIZE);
+ /* Copy the initial P->M table mappings if necessary. */
+ addr = pgd_index(xen_start_info->mfn_list);
+ if (addr < pgd_index(__START_KERNEL_map))
+ init_level4_pgt[addr] =
+ ((pgd_t *)xen_start_info->pt_base)[addr];
+
/* Do an early initialization of the fixmap area. */
addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
level3_kernel_pgt[pud_index(addr)] =
@@ -676,22 +706,28 @@ static void __init find_early_table_spac
static void __init xen_finish_init_mapping(void)
{
unsigned long i, start, end;
+ struct mmuext_op mmuext;
/* Re-vector virtual addresses pointing into the initial
mapping to the just-established permanent ones. */
xen_start_info = __va(__pa(xen_start_info));
xen_start_info->pt_base = (unsigned long)
__va(__pa(xen_start_info->pt_base));
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (!xen_feature(XENFEAT_auto_translated_physmap)
+ && xen_start_info->mfn_list >= __START_KERNEL_map)
phys_to_machine_mapping =
__va(__pa(xen_start_info->mfn_list));
- xen_start_info->mfn_list = (unsigned long)
- phys_to_machine_mapping;
- }
if (xen_start_info->mod_start)
xen_start_info->mod_start = (unsigned long)
__va(__pa(xen_start_info->mod_start));
+ /* Unpin the no longer used Xen provided page tables. */
+ mmuext.cmd = MMUEXT_UNPIN_TABLE;
+ mmuext.arg1.mfn = pfn_to_mfn(__pa(xen_start_info->pt_base)
+ >> PAGE_SHIFT);
+ if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+ BUG();
+
/* Destroy the Xen-created mappings beyond the kernel image. */
start = PAGE_ALIGN((unsigned long)_end);
end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
@@ -948,9 +984,20 @@ unsigned long __init_refok init_memory_m
__flush_tlb_all();
- if (!after_bootmem && table_top > table_start)
+ if (!after_bootmem && table_top > table_start) {
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && table_start <= xen_start_info->first_p2m_pfn
+ && table_top > xen_start_info->first_p2m_pfn) {
+ reserve_early(table_start << PAGE_SHIFT,
+ xen_start_info->first_p2m_pfn
+ << PAGE_SHIFT,
+ "PGTABLE");
+ table_start = xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames;
+ }
reserve_early(table_start << PAGE_SHIFT,
table_top << PAGE_SHIFT, "PGTABLE");
+ }
printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
last_map_addr, end);
--- sle11-2009-02-05.orig/arch/x86/mm/pageattr-xen.c 2009-02-05 10:41:27.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/pageattr-xen.c 2009-02-05 10:42:43.000000000 +0100
@@ -1193,7 +1193,7 @@ static void __make_page_writable(unsigne
pte = lookup_address(va, &level);
BUG_ON(!pte || level != PG_LEVEL_4K);
- if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
BUG();
if (in_secondary_range(va)) {
unsigned long pfn = pte_pfn(*pte);
--- sle11-2009-02-05.orig/arch/x86/mm/pgtable-xen.c 2008-12-15 11:34:16.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/pgtable-xen.c 2008-12-23 14:38:22.000000000 +0100
@@ -323,7 +323,7 @@ void __init xen_init_pgd_pin(void)
if (PTRS_PER_PUD > 1) /* not folded */
SetPagePinned(virt_to_page(pud));
for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
- if (!pud_present(*pud))
+ if (!pud_present(*pud) || pud_large(*pud))
continue;
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
@@ -334,7 +334,7 @@ void __init xen_init_pgd_pin(void)
&& m >= pmd_index(HYPERVISOR_VIRT_START))
continue;
#endif
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
continue;
SetPagePinned(pmd_page(*pmd));
}
--- sle11-2009-02-05.orig/arch/x86/mm/pgtable_32-xen.c 2008-12-15 11:28:15.000000000 +0100
+++ sle11-2009-02-05/arch/x86/mm/pgtable_32-xen.c 2009-02-03 14:44:56.000000000 +0100
@@ -188,6 +188,6 @@ void make_lowmem_page_writable(void *va,
pte = lookup_address((unsigned long)va, &level);
BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
rc = HYPERVISOR_update_va_mapping(
- (unsigned long)va, pte_mkwrite(*pte), 0);
+ (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
BUG_ON(rc);
}
--- sle11-2009-02-05.orig/include/xen/interface/elfnote.h 2008-11-25 12:35:56.000000000 +0100
+++ sle11-2009-02-05/include/xen/interface/elfnote.h 2008-12-22 12:53:19.000000000 +0100
@@ -162,9 +162,20 @@
#define XEN_ELFNOTE_SUSPEND_CANCEL 14
/*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be prepared that the page table pages used for
+ * this mapping may not be accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M 15
+
+/*
* The number of the highest elfnote defined.
*/
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
/*
* System information exported through crash notes.
--- sle11-2009-02-05.orig/include/xen/interface/xen.h 2008-12-15 11:27:38.000000000 +0100
+++ sle11-2009-02-05/include/xen/interface/xen.h 2008-12-23 10:50:00.000000000 +0100
@@ -534,6 +534,7 @@ typedef struct shared_info shared_info_t
* a. relocated kernel image
* b. initial ram disk [mod_start, mod_len]
* c. list of allocated page frames [mfn_list, nr_pages]
+ * (unless relocated due to XEN_ELFNOTE_INIT_P2M)
* d. start_info_t structure [register ESI (x86)]
* e. bootstrap page tables [pt_base, CR3 (x86)]
* f. bootstrap stack [register ESP (x86)]
@@ -575,6 +576,9 @@ struct start_info {
unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
int8_t cmd_line[MAX_GUEST_CMDLINE];
+ /* The pfn range here covers both page table and p->m table frames. */
+ unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */
+ unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */
};
typedef struct start_info start_info_t;
[-- Attachment #4: xen-x86_64-init-memmap.patch --]
[-- Type: text/plain, Size: 12911 bytes --]
From: jbeulich@novell.com
Subject: eliminate scalability issues from direct mapping setup
Patch-mainline: obsolete
References: bnc#417417
Should be merged into the 2.6.27 merge patch once verified.
--- head-2009-02-02.orig/arch/x86/kernel/setup-xen.c 2008-12-23 09:42:29.000000000 +0100
+++ head-2009-02-02/arch/x86/kernel/setup-xen.c 2008-12-23 09:42:42.000000000 +0100
@@ -914,21 +914,6 @@ void __init setup_arch(char **cmdline_p)
#endif
/* max_pfn_mapped is updated here */
-#ifdef CONFIG_X86_64_XEN
- /*
- * Due to the way initial table space gets calculated on Xen, we have
- * to call init_memory_mapping() with the larger end address first.
- */
- if (max_pfn > max_low_pfn)
- max_pfn_mapped = init_memory_mapping(1UL<<32,
- max_pfn<<PAGE_SHIFT);
- max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- if (max_pfn > max_low_pfn)
- /* can we preserve max_low_pfn ?*/
- max_low_pfn = max_pfn;
- else
- max_pfn_mapped = max_low_pfn_mapped;
-#else
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
@@ -940,7 +925,6 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
}
#endif
-#endif
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
--- head-2009-02-02.orig/arch/x86/mm/init_64-xen.c 2009-02-02 11:42:55.000000000 +0100
+++ head-2009-02-02/arch/x86/mm/init_64-xen.c 2009-02-02 11:43:04.000000000 +0100
@@ -322,31 +322,45 @@ void __init cleanup_highmap(void)
}
#endif
-static __meminit void *alloc_static_page(unsigned long *phys)
+static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long va = (table_cur << PAGE_SHIFT) + __START_KERNEL_map;
+ unsigned long pfn;
+ void *adr;
if (after_bootmem) {
- void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ adr = (void *)get_zeroed_page(GFP_ATOMIC);
*phys = __pa(adr);
return adr;
}
BUG_ON(!table_cur);
- *phys = table_cur++ << PAGE_SHIFT;
- return memset((void *)va, 0, PAGE_SIZE);
+ pfn = table_cur++;
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ *phys = pfn * PAGE_SIZE;
+ return adr;
}
-#define unmap_low_page(p) ((void)(p))
+static __ref void unmap_low_page(void *adr)
+{
+ if (after_bootmem)
+ return;
+
+ early_iounmap(adr, PAGE_SIZE);
+}
static inline int __meminit make_readonly(unsigned long paddr)
{
extern char __vsyscall_0;
int readonly = 0;
- /* Make new page tables read-only. */
+ /* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
+ && !max_pfn_mapped
&& (paddr >= (table_start << PAGE_SHIFT))
&& (paddr < (table_top << PAGE_SHIFT)))
readonly = 1;
@@ -412,7 +426,7 @@ phys_pte_update(pmd_t *pmd, unsigned lon
{
pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- BUG_ON(!after_bootmem);
+ BUG_ON(!max_pfn_mapped);
return phys_pte_init(pte, address, end);
}
@@ -457,12 +471,14 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
continue;
}
- pte = alloc_static_page(&pte_phys);
+ pte = alloc_low_page(&pte_phys);
last_map_addr = phys_pte_init(pte, address, end);
unmap_low_page(pte);
if (!after_bootmem) {
- early_make_page_readonly(pte, XENFEAT_writable_page_tables);
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pte_phys),
+ XENFEAT_writable_page_tables);
*pmd = __pmd(pte_phys | _PAGE_TABLE);
} else {
spin_lock(&init_mm.page_table_lock);
@@ -481,7 +497,7 @@ phys_pmd_update(pud_t *pud, unsigned lon
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
- BUG_ON(!after_bootmem);
+ BUG_ON(!max_pfn_mapped);
last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
__flush_tlb_all();
return last_map_addr;
@@ -520,12 +536,14 @@ phys_pud_init(pud_t *pud_page, unsigned
continue;
}
- pmd = alloc_static_page(&pmd_phys);
+ pmd = alloc_low_page(&pmd_phys);
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
unmap_low_page(pmd);
if (!after_bootmem) {
- early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pmd_phys),
+ XENFEAT_writable_page_tables);
if (page_size_mask & (1 << PG_LEVEL_NUM))
xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
else
@@ -548,13 +566,7 @@ phys_pud_update(pgd_t *pgd, unsigned lon
{
pud_t *pud;
- if (!after_bootmem) {
- unsigned long addr = __pgd_val(*pgd), *page;
-
- addr_to_page(addr, page);
- pud = (pud_t *)page;
- } else
- pud = (pud_t *)pgd_page_vaddr(*pgd);
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
}
@@ -628,73 +640,6 @@ void __init xen_init_pt(void)
xen_pgd_pin(init_level4_pgt);
}
-static void __init extend_init_mapping(unsigned long tables_space)
-{
- unsigned long va = __START_KERNEL_map;
- unsigned long start = table_cur;
- unsigned long phys, addr, *pte_page;
- pmd_t *pmd;
- pte_t *pte, new_pte;
- unsigned long *page = (unsigned long *)init_level4_pgt;
-
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
-
- /* Kill mapping of low 1MB. */
- while (va < (unsigned long)&_text) {
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- /* Ensure init mappings cover kernel text/data and initial tables. */
- while (va < (__START_KERNEL_map
- + (table_cur << PAGE_SHIFT)
- + tables_space)) {
- if (!pmd_index(va) && !pte_index(va)) {
- page = (unsigned long *)init_level4_pgt;
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- }
- pmd = (pmd_t *)&page[pmd_index(va)];
- if (pmd_none(*pmd)) {
- pte_page = alloc_static_page(&phys);
- early_make_page_readonly(
- pte_page, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
- } else {
- addr = page[pmd_index(va)];
- addr_to_page(addr, pte_page);
- }
- pte = (pte_t *)&pte_page[pte_index(va)];
- if (pte_none(*pte)) {
- new_pte = pfn_pte(
- (va - __START_KERNEL_map) >> PAGE_SHIFT,
- __pgprot(_KERNPG_TABLE));
- xen_l1_entry_update(pte, new_pte);
- }
- va += PAGE_SIZE;
- }
-
- /* Finally, blow away any spurious initial mappings. */
- while (1) {
- pmd = (pmd_t *)&page[pmd_index(va)];
- if (pmd_none(*pmd))
- break;
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- if (table_cur > start)
- reserve_early(start << PAGE_SHIFT,
- table_cur << PAGE_SHIFT, "INITMAP");
-}
-
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
@@ -708,19 +653,27 @@ static void __init find_early_table_spac
ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
- table_cur = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
- xen_start_info->nr_pt_frames;
-
- extend_init_mapping(tables);
+ if (!table_top) {
+ table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+ xen_start_info->nr_pt_frames;
+ table_cur = table_start;
+ } else {
+ /*
+ * [table_start, table_top) gets passed to reserve_early(),
+ * so we must not use table_cur here, despite continuing
+ * to allocate from there. table_cur possibly being below
+ * table_start is otoh not a problem.
+ */
+ table_start = table_top;
+ }
- table_start = table_cur;
- table_top = table_start + (tables >> PAGE_SHIFT);
+ table_top = table_cur + (tables >> PAGE_SHIFT);
printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
+ end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
}
-static void __init xen_finish_init_mapping(bool reserve)
+static void __init xen_finish_init_mapping(void)
{
unsigned long i, start, end;
@@ -739,18 +692,17 @@ static void __init xen_finish_init_mappi
xen_start_info->mod_start = (unsigned long)
__va(__pa(xen_start_info->mod_start));
- /* Destroy the Xen-created mappings beyond the kernel image as
- * well as the temporary mappings created above. Prevents
- * overlap with modules area (if init mapping is very big).
- */
+ /* Destroy the Xen-created mappings beyond the kernel image. */
start = PAGE_ALIGN((unsigned long)_end);
- end = __START_KERNEL_map + (table_top << PAGE_SHIFT);
+ end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
for (; start < end; start += PAGE_SIZE)
if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
BUG();
/* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
- start = table_cur;
+ start = table_top;
+ WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
+ table_start, table_cur, start);
table_top = ~0UL;
/* Switch to the real shared_info page, and clear the dummy page. */
@@ -768,11 +720,7 @@ static void __init xen_finish_init_mappi
<< PAGE_SHIFT,
PAGE_KERNEL_RO);
- /* Disable the 'table_cur' allocator. */
- table_top = table_cur;
- if (reserve && table_cur > start)
- reserve_early(start << PAGE_SHIFT,
- table_cur << PAGE_SHIFT, "FIXMAP");
+ table_top = max(table_cur, start);
}
static void __init init_gbpages(void)
@@ -810,13 +758,15 @@ static unsigned long __meminit kernel_ph
continue;
}
- pud = alloc_static_page(&pud_phys);
+ pud = alloc_low_page(&pud_phys);
last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
page_size_mask);
unmap_low_page(pud);
if(!after_bootmem) {
- early_make_page_readonly(pud, XENFEAT_writable_page_tables);
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pud_phys),
+ XENFEAT_writable_page_tables);
xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
} else {
spin_lock(&init_mm.page_table_lock);
@@ -864,7 +814,7 @@ unsigned long __init_refok init_memory_m
unsigned long last_map_addr = 0;
unsigned long page_size_mask = 0;
unsigned long start_pfn, end_pfn;
- bool first = !table_start;
+
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
int use_pse, use_gbpages;
@@ -955,23 +905,50 @@ unsigned long __init_refok init_memory_m
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- if (first)
+ if (!after_bootmem)
find_early_table_space(end, use_pse, use_gbpages);
+ if (!start) {
+ unsigned long addr, va = __START_KERNEL_map;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ /* Kill mapping memory below _text. */
+ while (va < (unsigned long)&_text) {
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ /* Blow away any spurious initial mappings. */
+ va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+ addr = page[pgd_index(va)];
+ addr_to_page(addr, page);
+ addr = page[pud_index(va)];
+ addr_to_page(addr, page);
+ while (pmd_index(va) | pte_index(va)) {
+ if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+ }
+
for (i = 0; i < nr_range; i++)
last_map_addr = kernel_physical_mapping_init(
mr[i].start, mr[i].end,
mr[i].page_size_mask);
BUG_ON(table_cur > table_top);
- if (start < (table_start << PAGE_SHIFT)) {
- WARN_ON(table_cur != table_top);
- xen_finish_init_mapping(!first);
- }
+ if (!start)
+ xen_finish_init_mapping();
+ else if (table_cur < table_top)
+ /* Disable the 'table_cur' allocator. */
+ table_top = table_cur;
__flush_tlb_all();
- if (first && table_top > table_start)
+ if (!after_bootmem && table_top > table_start)
reserve_early(table_start << PAGE_SHIFT,
table_top << PAGE_SHIFT, "PGTABLE");
--- head-2009-02-02.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-15 11:34:16.000000000 +0100
+++ head-2009-02-02/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-02-02 10:23:00.000000000 +0100
@@ -165,7 +165,7 @@ static inline void xen_set_pgd(pgd_t *pg
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-#define MAXMEM _AC(0x0000006fffffffff, UL)
+#define MAXMEM _AC(0x000004ffffffffff, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
[-- Attachment #5: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 10+ messages in thread