From: Mukesh Rathor <mukesh.rathor@oracle.com>
To: "Xen-devel@lists.xensource.com" <Xen-devel@lists.xensource.com>
Subject: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
Date: Fri, 15 Mar 2013 18:06:45 -0700 [thread overview]
Message-ID: <20130315180645.59f8618e@mantra.us.oracle.com> (raw)
Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
PVH mode. Introduce, opt_dom0pvh, which when specified in the command
line, causes dom0 to boot in PVH mode.
Change in V2:
- Map the entire IO region upfront in the P2M for PVH dom0.
Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
xen/arch/x86/domain_build.c | 241 +++++++++++++++++++++++++++++++++----------
xen/arch/x86/mm/hap/hap.c | 17 +++-
xen/arch/x86/setup.c | 10 ++-
xen/include/asm-x86/hap.h | 1 +
4 files changed, 212 insertions(+), 57 deletions(-)
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 8c5b27a..72aa70b 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,6 +35,8 @@
#include <asm/setup.h>
#include <asm/bzimage.h> /* for bzimage_parse */
#include <asm/io_apic.h>
+#include <asm/hap.h>
+#include <asm/debugger.h>
#include <public/version.h>
@@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
}
}
+/*
+ * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
+ * the entire io region mapped in the EPT/NPT.
+ */
+static __init void pvh_map_all_iomem(struct domain *d)
+{
+ unsigned long start = 0;
+ const struct e820entry *entry;
+ int rc, i, nump;
+
+ for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
+ unsigned long end = entry->addr + entry->size;
+
+ if (entry->type == E820_RAM || i == e820.nr_map - 1) {
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ if (entry->type == E820_RAM)
+ end_pfn = PFN_UP(entry->addr);
+
+ if (start_pfn < end_pfn) {
+ nump = end_pfn - start_pfn + 1;
+ rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+ BUG_ON(rc);
+ }
+ start = end;
+ }
+ }
+}
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+ unsigned long mfn, unsigned long vphysmap_s)
+{
+ if ( is_pvh_domain(d) ) {
+ int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+ BUG_ON(rc);
+ return;
+ }
+ if ( !is_pv_32on64_domain(d) )
+ ((unsigned long *)vphysmap_s)[pfn] = mfn;
+ else
+ ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+ set_gpfn_from_mfn(mfn, pfn);
+}
+
+static __init void copy_pvh(char *dest, char *src, int bytes)
+{
+ /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr
+ * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
+ * So we just use dbg_rw_mem().
+ */
+ int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
+ if (rem) {
+ printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
+ BUG();
+ }
+}
+
int __init construct_dom0(
struct domain *d,
const module_t *image, unsigned long image_headroom,
@@ -314,6 +375,7 @@ int __init construct_dom0(
void *(*bootstrap_map)(const module_t *),
char *cmdline)
{
+ char *si_buf=NULL, *tmp_buf=NULL;
int i, cpu, rc, compatible, compat32, order, machine;
struct cpu_user_regs *regs;
unsigned long pfn, mfn;
@@ -322,7 +384,7 @@ int __init construct_dom0(
unsigned long alloc_spfn;
unsigned long alloc_epfn;
unsigned long initrd_pfn = -1, initrd_mfn = 0;
- unsigned long count;
+ unsigned long count, shared_info_pfn_addr = 0;
struct page_info *page = NULL;
start_info_t *si;
struct vcpu *v = d->vcpu[0];
@@ -416,6 +478,13 @@ int __init construct_dom0(
{
printk("Kernel does not support Dom0 operation\n");
return -EINVAL;
+
+ if ( is_pvh_domain(d) &&
+ !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
+ {
+ printk("Kernel does not support PVH mode\n");
+ return -EINVAL;
+ }
}
if ( compat32 )
@@ -480,6 +549,12 @@ int __init construct_dom0(
vstartinfo_end = (vstartinfo_start +
sizeof(struct start_info) +
sizeof(struct dom0_vga_console_info));
+
+ if ( is_pvh_domain(d) ) {
+ shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;
+ vstartinfo_end += PAGE_SIZE;
+ }
+
vpt_start = round_pgup(vstartinfo_end);
for ( nr_pt_pages = 2; ; nr_pt_pages++ )
{
@@ -621,16 +696,26 @@ int __init construct_dom0(
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
}
- clear_page(l4tab);
- init_guest_l4_table(l4tab, d);
- v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
- if ( is_pv_32on64_domain(d) )
- v->arch.guest_table_user = v->arch.guest_table;
+ if ( is_pvh_domain(d) )
+ {
+ v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);
+ pfn = 0;
+ } else {
+ clear_page(l4tab);
+ init_guest_l4_table(l4tab, d);
+ v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+ if ( is_pv_32on64_domain(d) )
+ v->arch.guest_table_user = v->arch.guest_table;
+ pfn = alloc_spfn;
+ }
l4tab += l4_table_offset(v_start);
- pfn = alloc_spfn;
for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
{
+ /* initrd chunk's mfns are separate, so we need to adjust for them */
+ signed long pvh_adj = is_pvh_domain(d) ?
+ (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;
+
if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
{
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
@@ -657,16 +742,17 @@ int __init construct_dom0(
clear_page(l3tab);
if ( count == 0 )
l3tab += l3_table_offset(v_start);
- *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+ *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
l4tab++;
}
- *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+ *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
l3tab++;
}
- *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+ *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
l2tab++;
}
- if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
+ if ( is_pvh_domain(d) ||
+ count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
mfn = pfn++;
else
mfn = initrd_mfn++;
@@ -674,6 +760,9 @@ int __init construct_dom0(
L1_PROT : COMPAT_L1_PROT));
l1tab++;
+ if ( is_pvh_domain(d) )
+ continue;
+
page = mfn_to_page(mfn);
if ( (page->u.inuse.type_info == 0) &&
!get_page_and_type(page, d, PGT_writable_page) )
@@ -702,6 +791,9 @@ int __init construct_dom0(
COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
}
+ if ( is_pvh_domain(d) )
+ goto pvh_skip_pt_rdonly;
+
/* Pages that are part of page tables must be read only. */
l4tab = l4start + l4_table_offset(vpt_start);
l3start = l3tab = l4e_to_l3e(*l4tab);
@@ -741,6 +833,8 @@ int __init construct_dom0(
}
}
+pvh_skip_pt_rdonly:
+
/* Mask all upcalls... */
for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
@@ -754,6 +848,11 @@ int __init construct_dom0(
(void)alloc_vcpu(d, i, cpu);
}
+ if ( is_pvh_domain(d) )
+ {
+ v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
+ (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
+ }
/* Set up CR3 value for write_ptbase */
if ( paging_mode_enabled(d) )
paging_update_paging_modes(v);
@@ -764,35 +863,16 @@ int __init construct_dom0(
write_ptbase(v);
mapcache_override_current(v);
- /* Copy the OS image and free temporary buffer. */
- elf.dest = (void*)vkern_start;
- rc = elf_load_binary(&elf, 0);
- if ( rc < 0 )
- {
- printk("Failed to load the kernel binary\n");
- return rc;
- }
- bootstrap_map(NULL);
-
- if ( UNSET_ADDR != parms.virt_hypercall )
- {
- if ( (parms.virt_hypercall < v_start) ||
- (parms.virt_hypercall >= v_end) )
- {
- mapcache_override_current(NULL);
- write_ptbase(current);
- printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
- return -1;
+ /* Set up start info area. */
+ if ( is_pvh_domain(d) ) {
+ if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {
+ printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
+ return -ENOMEM;
}
- hypercall_page_initialise(
- d, (void *)(unsigned long)parms.virt_hypercall);
- }
-
- /* Free temporary buffers. */
- discard_initial_images();
+ si = (start_info_t *)si_buf;
+ } else
+ si = (start_info_t *)vstartinfo_start;
- /* Set up start info area. */
- si = (start_info_t *)vstartinfo_start;
clear_page(si);
si->nr_pages = nr_pages;
@@ -814,7 +894,7 @@ int __init construct_dom0(
l2tab = NULL;
l1tab = NULL;
/* Set up the phys->machine table if not part of the initial mapping. */
- if ( parms.p2m_base != UNSET_ADDR )
+ if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )
{
unsigned long va = vphysmap_start;
@@ -935,6 +1015,9 @@ int __init construct_dom0(
unmap_domain_page(l3tab);
unmap_domain_page(l4start);
+ if (is_pvh_domain(d) )
+ hap_set_pvh_alloc_for_dom0(d, nr_pages);
+
/* Write the phys->machine and machine->phys table entries. */
for ( pfn = 0; pfn < count; pfn++ )
{
@@ -951,11 +1034,8 @@ int __init construct_dom0(
if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
- if ( !is_pv_32on64_domain(d) )
- ((unsigned long *)vphysmap_start)[pfn] = mfn;
- else
- ((unsigned int *)vphysmap_start)[pfn] = mfn;
- set_gpfn_from_mfn(mfn, pfn);
+ dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+
if (!(pfn & 0xfffff))
process_pending_softirqs();
}
@@ -971,8 +1051,8 @@ int __init construct_dom0(
if ( !page->u.inuse.type_info &&
!get_page_and_type(page, d, PGT_writable_page) )
BUG();
- ((unsigned long *)vphysmap_start)[pfn] = mfn;
- set_gpfn_from_mfn(mfn, pfn);
+
+ dom0_update_physmap(d, pfn, mfn, vphysmap_start);
++pfn;
if (!(pfn & 0xfffff))
process_pending_softirqs();
@@ -992,11 +1072,7 @@ int __init construct_dom0(
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
- if ( !is_pv_32on64_domain(d) )
- ((unsigned long *)vphysmap_start)[pfn] = mfn;
- else
- ((unsigned int *)vphysmap_start)[pfn] = mfn;
- set_gpfn_from_mfn(mfn, pfn);
+ dom0_update_physmap(d, pfn, mfn, vphysmap_start);
#undef pfn
page++; pfn++;
if (!(pfn & 0xfffff))
@@ -1004,6 +1080,47 @@ int __init construct_dom0(
}
}
+ /* Copy the OS image and free temporary buffer. */
+ elf.dest = (void*)vkern_start;
+ rc = elf_load_binary(&elf, is_pvh_domain(d) );
+ if ( rc < 0 )
+ {
+ printk("Failed to load the kernel binary\n");
+ return rc;
+ }
+ bootstrap_map(NULL);
+
+ if ( UNSET_ADDR != parms.virt_hypercall )
+ {
+ void *addr;
+
+ if ( is_pvh_domain(d) ) {
+ if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) {
+ printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE);
+ return -ENOMEM;
+ }
+ addr = tmp_buf;
+ } else
+ addr = (void *)parms.virt_hypercall;
+
+ if ( (parms.virt_hypercall < v_start) ||
+ (parms.virt_hypercall >= v_end) )
+ {
+ write_ptbase(current);
+ printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+ return -1;
+ }
+ hypercall_page_initialise(d, addr);
+
+ if ( is_pvh_domain(d) ) {
+ copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
+ xfree(tmp_buf);
+ }
+ }
+
+ /* Free temporary buffers. */
+ discard_initial_images();
+
if ( initrd_len != 0 )
{
si->mod_start = vinitrd_start ?: initrd_pfn;
@@ -1019,6 +1136,15 @@ int __init construct_dom0(
si->console.dom0.info_off = sizeof(struct start_info);
si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
}
+ if ( is_pvh_domain(d) ) {
+ unsigned long mfn = virt_to_mfn(d->shared_info);
+ unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
+ si->shared_info = shared_info_pfn_addr;
+ dom0_update_physmap(d, pfn, mfn, 0);
+
+ copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
+ xfree(si_buf);
+ }
if ( is_pv_32on64_domain(d) )
xlat_start_info(si, XLAT_start_info_console_dom0);
@@ -1050,12 +1176,16 @@ int __init construct_dom0(
regs->eip = parms.virt_entry;
regs->esp = vstack_end;
regs->esi = vstartinfo_start;
- regs->eflags = X86_EFLAGS_IF;
+ regs->eflags = X86_EFLAGS_IF | 0x2;
- if ( opt_dom0_shadow )
+ if ( opt_dom0_shadow ) {
+ if ( is_pvh_domain(d) ) {
+ printk("Invalid option dom0_shadow for PVH\n");
+ return -EINVAL;
+ }
if ( paging_enable(d, PG_SH_enable) == 0 )
paging_update_paging_modes(v);
-
+ }
if ( supervisor_mode_kernel )
{
v->arch.pv_vcpu.kernel_ss &= ~3;
@@ -1132,6 +1262,9 @@ int __init construct_dom0(
BUG_ON(rc != 0);
+ if ( is_pvh_domain(d) )
+ pvh_map_all_iomem(d);
+
iommu_dom0_init(dom0);
return 0;
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 055833d..d3d5697 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
}
}
+/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+ int rc;
+ unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+ memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+ num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);
+ paging_lock(d);
+ rc = hap_set_allocation(d, num_pages, NULL);
+ paging_unlock(d);
+ BUG_ON(rc);
+}
+
static const struct paging_mode hap_paging_real_mode;
static const struct paging_mode hap_paging_protected_mode;
static const struct paging_mode hap_paging_pae_mode;
@@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking)
const struct paging_mode *
hap_paging_get_mode(struct vcpu *v)
{
- return !hvm_paging_enabled(v) ? &hap_paging_real_mode :
+ return is_pvh_vcpu(v) ? &hap_paging_long_mode :
+ !hvm_paging_enabled(v) ? &hap_paging_real_mode :
hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
hvm_pae_enabled(v) ? &hap_paging_pae_mode :
&hap_paging_protected_mode;
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 43301a5..f307f24 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus);
static bool_t __initdata disable_smep;
invbool_param("smep", disable_smep);
+/* Boot dom0 in PVH mode */
+static bool_t __initdata opt_dom0pvh;
+boolean_param("dom0pvh", opt_dom0pvh);
+
/* **** Linux config option: propagated to domain0. */
/* "acpi=off": Sisables both ACPI table parsing and interpreter. */
/* "acpi=force": Override the disable blacklist. */
@@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
{
char *memmap_type = NULL;
char *cmdline, *kextra, *loader;
- unsigned int initrdidx;
+ unsigned int initrdidx, domcr_flags = 0;
multiboot_info_t *mbi = __va(mbi_p);
module_t *mod = (module_t *)__va(mbi->mods_addr);
unsigned long nr_pages, modules_headroom, *module_map;
@@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p)
panic("Could not protect TXT memory regions\n");
/* Create initial domain 0. */
- dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
+ domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
+ domcr_flags |= DOMCRF_s3_integrity;
+ dom0 = domain_create(0, domcr_flags, 0);
if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
panic("Error creating domain 0\n");
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..aab8558 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain *d,
XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
#endif /* XEN_HAP_H */
--
1.7.2.3
next reply other threads:[~2013-03-16 1:06 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-16 1:06 Mukesh Rathor [this message]
2013-03-18 13:01 ` [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Jan Beulich
2013-03-27 0:34 ` Mukesh Rathor
2013-03-29 0:32 ` Mukesh Rathor
2013-04-02 7:03 ` Jan Beulich
2013-03-18 20:06 ` Konrad Rzeszutek Wilk
2013-03-26 23:25 ` Mukesh Rathor
2013-03-19 9:27 ` Jan Beulich
2013-03-19 13:32 ` Konrad Rzeszutek Wilk
2013-03-26 23:42 ` Mukesh Rathor
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130315180645.59f8618e@mantra.us.oracle.com \
--to=mukesh.rathor@oracle.com \
--cc=Xen-devel@lists.xensource.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).