From: Samuel Thibault <samuel.thibault@ens-lyon.org>
To: Juergen Gross <jgross@suse.com>
Cc: minios-devel@lists.xenproject.org,
xen-devel@lists.xenproject.org, wei.liu2@citrix.com
Subject: Re: [PATCH 12/22] mini-os: add x86 native page table handling
Date: Wed, 24 Aug 2016 00:40:02 +0200 [thread overview]
Message-ID: <20160823224002.GH4401@var.home> (raw)
In-Reply-To: <1471965368-6159-13-git-send-email-jgross@suse.com>
Juergen Gross, on Tue 23 Aug 2016 17:15:58 +0200, wrote:
> For support of HVMlite don't use mmu_update hypercalls, but write the
> page table entries directly.
>
> Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
> ---
> arch/x86/mm.c | 147 +++++++++++++++++++++++++++++++++++++-------------
> arch/x86/traps.c | 10 ++++
> include/x86/arch_mm.h | 4 ++
> include/x86/os.h | 9 ++++
> 4 files changed, 132 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/mm.c b/arch/x86/mm.c
> index cbb5617..f5248a4 100644
> --- a/arch/x86/mm.c
> +++ b/arch/x86/mm.c
> @@ -123,16 +123,25 @@ void arch_mm_preinit(void *p)
> * table at offset in previous level MFN (pref_l_mfn). pt_pfn is a guest
> * PFN.
> */
> +static pgentry_t pt_prot[PAGETABLE_LEVELS] = {
> + L1_PROT,
> + L2_PROT,
> + L3_PROT,
> +#if defined(__x86_64__)
> + L4_PROT,
> +#endif
> +};
> +
> static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
> unsigned long offset, unsigned long level)
> {
> - pgentry_t *tab = pt_base;
> + pgentry_t *tab;
> unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn);
> - pgentry_t prot_e, prot_t;
> +#ifdef CONFIG_PARAVIRT
> mmu_update_t mmu_updates[1];
> int rc;
> +#endif
>
> - prot_e = prot_t = 0;
> DEBUG("Allocating new L%d pt frame for pfn=%lx, "
> "prev_l_mfn=%lx, offset=%lx",
> level, *pt_pfn, prev_l_mfn, offset);
> @@ -140,30 +149,12 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
> /* We need to clear the page, otherwise we might fail to map it
> as a page table page */
> memset((void*) pt_page, 0, PAGE_SIZE);
> -
> - switch ( level )
> - {
> - case L1_FRAME:
> - prot_e = L1_PROT;
> - prot_t = L2_PROT;
> - break;
> - case L2_FRAME:
> - prot_e = L2_PROT;
> - prot_t = L3_PROT;
> - break;
> -#if defined(__x86_64__)
> - case L3_FRAME:
> - prot_e = L3_PROT;
> - prot_t = L4_PROT;
> - break;
> -#endif
> - default:
> - printk("new_pt_frame() called with invalid level number %lu\n", level);
> - do_exit();
> - break;
> - }
>
> + ASSERT(level >= 1 && level <= PAGETABLE_LEVELS);
> +
> +#ifdef CONFIG_PARAVIRT
> /* Make PFN a page table page */
> + tab = pt_base;
> #if defined(__x86_64__)
> tab = pte_to_virt(tab[l4_table_offset(pt_page)]);
> #endif
> @@ -172,7 +163,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
> mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) +
> sizeof(pgentry_t) * l1_table_offset(pt_page);
> mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT |
> - (prot_e & ~_PAGE_RW);
> + (pt_prot[level - 1] & ~_PAGE_RW);
>
> if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
> {
> @@ -184,13 +175,18 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
> /* Hook the new page table page into the hierarchy */
> mmu_updates[0].ptr =
> ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
> - mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | prot_t;
> + mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT |
> + pt_prot[level];
>
> if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
> {
> printk("ERROR: mmu_update failed with rc=%d\n", rc);
> do_exit();
> }
> +#else
> + tab = mfn_to_virt(prev_l_mfn);
> + tab[offset] = (*pt_pfn << PAGE_SHIFT) | pt_prot[level];
> +#endif
>
> *pt_pfn += 1;
> }
> @@ -202,12 +198,14 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
> {
> unsigned long start_address, end_address;
> unsigned long pfn_to_map, pt_pfn = *start_pfn;
> - static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
> pgentry_t *tab = pt_base, page;
> unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(pt_base));
> unsigned long offset;
> +#ifdef CONFIG_PARAVIRT
> + static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
> int count = 0;
> int rc;
> +#endif
>
> /* Be conservative: even if we know there will be more pages already
> mapped, start the loop at the very beginning. */
> @@ -225,6 +223,10 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
> ((unsigned long)pfn_to_virt(*max_pfn) -
> (unsigned long)&_text)>>20);
> }
> +#else
> + /* Round up to next 2MB boundary as we are using 2MB pages on HVMlite. */
> + pfn_to_map = (pfn_to_map + L1_PAGETABLE_ENTRIES - 1) &
> + ~(L1_PAGETABLE_ENTRIES - 1);
> #endif
>
> start_address = (unsigned long)pfn_to_virt(pfn_to_map);
> @@ -257,6 +259,7 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
> pt_mfn = pte_to_mfn(page);
> tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
> offset = l2_table_offset(start_address);
> +#ifdef CONFIG_PARAVIRT
> /* Need new L1 pt frame */
> if ( !(tab[offset] & _PAGE_PRESENT) )
> new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
> @@ -288,6 +291,12 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
> count = 0;
> }
> start_address += PAGE_SIZE;
> +#else
> + if ( !(tab[offset] & _PAGE_PRESENT) )
> + tab[offset] = (pgentry_t)pfn_to_map << PAGE_SHIFT |
> + L2_PROT | _PAGE_PSE;
> + start_address += 1UL << L2_PAGETABLE_SHIFT;
> +#endif
> }
>
> *start_pfn = pt_pfn;
> @@ -302,16 +311,19 @@ static void set_readonly(void *text, void *etext)
> unsigned long start_address =
> ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK;
> unsigned long end_address = (unsigned long) etext;
> - static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
> pgentry_t *tab = pt_base, page;
> unsigned long mfn = pfn_to_mfn(virt_to_pfn(pt_base));
> unsigned long offset;
> + unsigned long page_size = PAGE_SIZE;
> +#ifdef CONFIG_PARAVIRT
> + static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
> int count = 0;
> int rc;
> +#endif
>
> printk("setting %p-%p readonly\n", text, etext);
>
> - while ( start_address + PAGE_SIZE <= end_address )
> + while ( start_address + page_size <= end_address )
> {
> tab = pt_base;
> mfn = pfn_to_mfn(virt_to_pfn(pt_base));
> @@ -327,26 +339,34 @@ static void set_readonly(void *text, void *etext)
> mfn = pte_to_mfn(page);
> tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
> offset = l2_table_offset(start_address);
> - page = tab[offset];
> - mfn = pte_to_mfn(page);
> - tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
> + if ( !(tab[offset] & _PAGE_PSE) )
> + {
> + page = tab[offset];
> + mfn = pte_to_mfn(page);
> + tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
>
> - offset = l1_table_offset(start_address);
> + offset = l1_table_offset(start_address);
> + }
>
> if ( start_address != (unsigned long)&shared_info )
> {
> +#ifdef CONFIG_PARAVIRT
> mmu_updates[count].ptr =
> ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
> mmu_updates[count].val = tab[offset] & ~_PAGE_RW;
> count++;
> +#else
> + tab[offset] &= ~_PAGE_RW;
> +#endif
> }
> else
> printk("skipped %lx\n", start_address);
>
> - start_address += PAGE_SIZE;
> + start_address += page_size;
>
> +#ifdef CONFIG_PARAVIRT
> if ( count == L1_PAGETABLE_ENTRIES ||
> - start_address + PAGE_SIZE > end_address )
> + start_address + page_size > end_address )
> {
> rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
> if ( rc < 0 )
> @@ -356,8 +376,13 @@ static void set_readonly(void *text, void *etext)
> }
> count = 0;
> }
> +#else
> + if ( start_address == (1UL << L2_PAGETABLE_SHIFT) )
> + page_size = 1UL << L2_PAGETABLE_SHIFT;
> +#endif
> }
>
> +#ifdef CONFIG_PARAVIRT
> {
> mmuext_op_t op = {
> .cmd = MMUEXT_TLB_FLUSH_ALL,
> @@ -365,6 +390,9 @@ static void set_readonly(void *text, void *etext)
> int count;
> HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF);
> }
> +#else
> + write_cr3((unsigned long)pt_base);
> +#endif
> }
>
> /*
> @@ -394,6 +422,8 @@ static pgentry_t *get_pgt(unsigned long va)
> offset = l2_table_offset(va);
> if ( !(tab[offset] & _PAGE_PRESENT) )
> return NULL;
> + if ( tab[offset] & _PAGE_PSE )
> + return &tab[offset];
> mfn = pte_to_mfn(tab[offset]);
> tab = mfn_to_virt(mfn);
> offset = l1_table_offset(va);
> @@ -448,6 +478,9 @@ pgentry_t *need_pgt(unsigned long va)
> new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
> }
> ASSERT(tab[offset] & _PAGE_PRESENT);
> + if ( tab[offset] & _PAGE_PSE )
> + return &tab[offset];
> +
> pt_mfn = pte_to_mfn(tab[offset]);
> tab = mfn_to_virt(pt_mfn);
>
> @@ -524,8 +557,6 @@ int do_map_frames(unsigned long va,
> {
> pgentry_t *pgt = NULL;
> unsigned long done = 0;
> - unsigned long i;
> - int rc;
>
> if ( !mfns )
> {
> @@ -539,6 +570,9 @@ int do_map_frames(unsigned long va,
> memset(err, 0x00, n * sizeof(int));
> while ( done < n )
> {
> +#ifdef CONFIG_PARAVIRT
> + unsigned long i;
> + int rc;
> unsigned long todo;
>
> if ( err )
> @@ -578,6 +612,17 @@ int do_map_frames(unsigned long va,
> }
> }
> done += todo;
> +#else
> + if ( !pgt || !(va & L1_MASK) )
> + pgt = need_pgt(va & ~L1_MASK);
> + if ( !pgt )
> + return -ENOMEM;
> +
> + ASSERT(!(*pgt & _PAGE_PSE));
> + pgt[l1_table_offset(va)] = (pgentry_t)
> + (((mfns[done * stride] + done * incr) << PAGE_SHIFT) | prot);
> + done++;
> +#endif
> }
>
> return 0;
> @@ -609,16 +654,21 @@ void *map_frames_ex(const unsigned long *mfns, unsigned long n,
> #define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
> int unmap_frames(unsigned long va, unsigned long num_frames)
> {
> +#ifdef CONFIG_PARAVIRT
> int n = UNMAP_BATCH;
> multicall_entry_t call[n];
> int ret;
> int i;
> +#else
> + pgentry_t *pgt;
> +#endif
>
> ASSERT(!((unsigned long)va & ~PAGE_MASK));
>
> DEBUG("va=%p, num=0x%lx\n", va, num_frames);
>
> while ( num_frames ) {
> +#ifdef CONFIG_PARAVIRT
> if ( n > num_frames )
> n = num_frames;
>
> @@ -653,6 +703,17 @@ int unmap_frames(unsigned long va, unsigned long num_frames)
> }
> }
> num_frames -= n;
> +#else
> + pgt = get_pgt(va);
> + if ( pgt )
> + {
> + ASSERT(!(*pgt & _PAGE_PSE));
> + *pgt = 0;
> + invlpg(va);
> + }
> + va += PAGE_SIZE;
> + num_frames--;
> +#endif
> }
> return 0;
> }
> @@ -662,14 +723,24 @@ int unmap_frames(unsigned long va, unsigned long num_frames)
> */
> static void clear_bootstrap(void)
> {
> +#ifdef CONFIG_PARAVIRT
> pte_t nullpte = { };
> int rc;
> +#else
> + pgentry_t *pgt;
> +#endif
>
> /* Use first page as the CoW zero page */
> memset(&_text, 0, PAGE_SIZE);
> mfn_zero = virt_to_mfn((unsigned long) &_text);
> +#ifdef CONFIG_PARAVIRT
> if ( (rc = HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) )
> printk("Unable to unmap NULL page. rc=%d\n", rc);
> +#else
> + pgt = get_pgt((unsigned long)&_text);
> + *pgt = 0;
> + invlpg((unsigned long)&_text);
> +#endif
> }
>
> #ifdef CONFIG_PARAVIRT
> diff --git a/arch/x86/traps.c b/arch/x86/traps.c
> index 2d3222d..aa17da3 100644
> --- a/arch/x86/traps.c
> +++ b/arch/x86/traps.c
> @@ -121,7 +121,9 @@ void page_walk(unsigned long virt_address)
> static int handle_cow(unsigned long addr) {
> pgentry_t *tab = pt_base, page;
> unsigned long new_page;
> +#ifdef CONFIG_PARAVIRT
> int rc;
> +#endif
>
> #if defined(__x86_64__)
> page = tab[l4_table_offset(addr)];
> @@ -137,6 +139,8 @@ static int handle_cow(unsigned long addr) {
> page = tab[l2_table_offset(addr)];
> if (!(page & _PAGE_PRESENT))
> return 0;
> + if ( page & _PAGE_PSE )
> + return 0;
> tab = pte_to_virt(page);
>
> page = tab[l1_table_offset(addr)];
> @@ -149,12 +153,18 @@ static int handle_cow(unsigned long addr) {
> new_page = alloc_pages(0);
> memset((void*) new_page, 0, PAGE_SIZE);
>
> +#ifdef CONFIG_PARAVIRT
> rc = HYPERVISOR_update_va_mapping(addr & PAGE_MASK, __pte(virt_to_mach(new_page) | L1_PROT), UVMF_INVLPG);
> if (!rc)
> return 1;
>
> printk("Map zero page to %lx failed: %d.\n", addr, rc);
> return 0;
> +#else
> + tab[l1_table_offset(addr)] = virt_to_mach(new_page) | L1_PROT;
> + invlpg(addr);
> + return 1;
> +#endif
> }
>
> static void do_stack_walk(unsigned long frame_base)
> diff --git a/include/x86/arch_mm.h b/include/x86/arch_mm.h
> index 28ab406..e0ae552 100644
> --- a/include/x86/arch_mm.h
> +++ b/include/x86/arch_mm.h
> @@ -78,6 +78,8 @@
> #define L2_PAGETABLE_ENTRIES 512
> #define L3_PAGETABLE_ENTRIES 4
>
> +#define PAGETABLE_LEVELS 3
> +
> #define PADDR_BITS 44
> #define PADDR_MASK ((1ULL << PADDR_BITS)-1)
>
> @@ -110,6 +112,8 @@ typedef uint64_t pgentry_t;
> #define L3_PAGETABLE_ENTRIES 512
> #define L4_PAGETABLE_ENTRIES 512
>
> +#define PAGETABLE_LEVELS 4
> +
> /* These are page-table limitations. Current CPUs support only 40-bit phys. */
> #define PADDR_BITS 52
> #define VADDR_BITS 48
> diff --git a/include/x86/os.h b/include/x86/os.h
> index 1083328..20cc27f 100644
> --- a/include/x86/os.h
> +++ b/include/x86/os.h
> @@ -206,6 +206,15 @@ static inline int irqs_disabled(void)
> */
> typedef struct { volatile int counter; } atomic_t;
>
> +static inline void write_cr3(unsigned long cr3)
> +{
> + asm volatile( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
> +}
> +
> +static inline void invlpg(unsigned long va)
> +{
> + asm volatile ( "invlpg %0": : "m" (*(const char *)(va)) : "memory" );
> +}
>
> /************************** i386 *******************************/
> #ifdef __INSIDE_MINIOS__
> --
> 2.6.6
>
--
Samuel
/* Amuse the user. */
printk(
" \\|/ ____ \\|/\n"
" \"@'/ ,. \\`@\"\n"
" /_| \\__/ |_\\\n"
" \\__U_/\n");
(From linux/arch/sparc/kernel/traps.c:die_if_kernel())
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
next prev parent reply other threads:[~2016-08-23 22:40 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-08-23 15:15 [PATCH 00/22] mini-os: support HVMlite mode Juergen Gross
2016-08-23 15:15 ` [PATCH 01/22] mini-os: resync xen headers Juergen Gross
2016-08-23 19:44 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 02/22] mini-os: make dump_regs() work in early boot Juergen Gross
2016-08-23 19:44 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 03/22] mini-os: add CONFIG_PARAVIRT Juergen Gross
2016-08-23 19:54 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 04/22] mini-os: make some memory management related macros usable from assembler Juergen Gross
2016-08-23 19:46 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 05/22] mini-os: add boot code for HVMlite support Juergen Gross
2016-08-23 20:51 ` Samuel Thibault
2016-08-24 5:13 ` Juergen Gross
2016-08-23 15:15 ` [PATCH 06/22] mini-os: setup hypercall page for HVMlite Juergen Gross
2016-08-23 21:03 ` Samuel Thibault
2016-08-24 5:10 ` Juergen Gross
2016-08-23 15:15 ` [PATCH 07/22] mini-os: support hvm_op hypercall Juergen Gross
2016-08-23 22:00 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 08/22] mini-os: initialize trap handling for HVMlite Juergen Gross
2016-08-23 22:05 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 09/22] mini-os: support HVMlite traps Juergen Gross
2016-08-23 22:10 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 10/22] mini-os: make p2m related code depend on CONFIG_PARAVIRT Juergen Gross
2016-08-23 22:20 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 11/22] mini-os: add static page tables for virtual kernel area for HVMlite Juergen Gross
2016-08-23 22:27 ` Samuel Thibault
2016-08-23 15:15 ` [PATCH 12/22] mini-os: add x86 native page table handling Juergen Gross
2016-08-23 22:40 ` Samuel Thibault [this message]
2016-08-23 15:15 ` [PATCH 13/22] mini-os: correct wrong calculation of alloc bitmap size Juergen Gross
2016-08-23 19:49 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 14/22] mini-os: add map_frame_virt() function Juergen Gross
2016-08-23 22:42 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 15/22] mini-os: setup console interface parameters Juergen Gross
2016-08-23 22:44 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 16/22] mini-os: setup xenbus " Juergen Gross
2016-08-23 22:45 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 17/22] mini-os: add get_cmdline() function Juergen Gross
2016-08-23 23:03 ` [Minios-devel] " Samuel Thibault
2016-08-23 15:16 ` [PATCH 18/22] mini-os: map shared info page for HVMlite Juergen Gross
2016-08-23 22:47 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 19/22] mini-os: remove using start_info in architecture independent code Juergen Gross
2016-08-23 22:48 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 20/22] mini-os: print start of day messages depending on domain type Juergen Gross
2016-08-23 22:51 ` Samuel Thibault
2016-08-24 5:09 ` Juergen Gross
2016-08-23 15:16 ` [PATCH 21/22] mini-os: get physical memory map Juergen Gross
2016-08-23 22:58 ` Samuel Thibault
2016-08-23 15:16 ` [PATCH 22/22] mini-os: support idle for HVMlite Juergen Gross
2016-08-23 23:01 ` Samuel Thibault
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160823224002.GH4401@var.home \
--to=samuel.thibault@ens-lyon.org \
--cc=jgross@suse.com \
--cc=minios-devel@lists.xenproject.org \
--cc=wei.liu2@citrix.com \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).