From mboxrd@z Thu Jan 1 00:00:00 1970 From: Konrad Rzeszutek Wilk Subject: Re: backport requests for 4.x-testing Date: Sat, 24 Mar 2012 13:27:58 -0400 Message-ID: <20120324172757.GA29504@phenom.dumpdata.com> References: <4F55F15F02000078000769AC@nat28.tlf.novell.com> <4F55EF2D.7010302@citrix.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="/04w6evG8XlLl3ft" Return-path: Content-Disposition: inline In-Reply-To: <4F55EF2D.7010302@citrix.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Andrew Cooper , keir.xen@gmail.com Cc: xen-devel@lists.xen.org List-Id: xen-devel@lists.xenproject.org --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: inline On Tue, Mar 06, 2012 at 11:04:13AM +0000, Andrew Cooper wrote: > On 06/03/12 10:13, Jan Beulich wrote: > > For a (hopefully soon) upcoming 4.1.3 and 4.0.4, may I ask to consider > > the following changesets from -unstable for backporting: And also these: 24140 tools: xend: tolerate empty state/*.xml 23412 libxc: xc_domain_set_memory_map, xc_get_machine_memory_map (x86, amd64 only) 23632 libxc: Squash xc_e820.h (and delete) into xenctrl.h 23225 x86: make the pv-only e820 array be dynamic. 23426 libxl: Add support for passing in the host's E820 for PCI passthrough 23428 libxl: Add 'e820_host' option to config file. 23427 libxl: Convert E820_UNUSABLE and E820_RAM to E820_UNUSABLE as appropriate. 
24013 x86,hvm: enable VCPUOP_register_vcpu_info op in hvm hypercall which I've back-ported to 4.1 (please see attachments) --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="tools-two-new-calls.patch" # HG changeset patch # Parent 476b0d68e7d5405babc1182da3b345b1e4cc1bca libxc: xc_domain_set_memory_map, xc_get_machine_memory_map (x86, amd64 only) The latter retrieves the E820 as seen by the hypervisor (completely unchanged) and the former sets the E820 for the specified guest. [backport of c/s 23412] Signed-off-by: Konrad Rzeszutek Wilk diff -r 476b0d68e7d5 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Sat Apr 30 09:48:16 2011 +0100 +++ b/tools/libxc/xc_domain.c Wed May 04 08:57:40 2011 -0400 @@ -478,37 +478,64 @@ int xc_domain_pin_memory_cacheattr(xc_in } #if defined(__i386__) || defined(__x86_64__) -#include "xc_e820.h" +int xc_domain_set_memory_map(xc_interface *xch, + uint32_t domid, + struct e820entry entries[], + uint32_t nr_entries) +{ + int rc; + struct xen_foreign_memory_map fmap = { + .domid = domid, + .map = { .nr_entries = nr_entries } + }; + DECLARE_HYPERCALL_BOUNCE(entries, nr_entries * sizeof(struct e820entry), + XC_HYPERCALL_BUFFER_BOUNCE_IN); + + if ( !entries || xc_hypercall_bounce_pre(xch, entries) ) + return -1; + + set_xen_guest_handle(fmap.map.buffer, entries); + + rc = do_memory_op(xch, XENMEM_set_memory_map, &fmap, sizeof(fmap)); + + xc_hypercall_bounce_post(xch, entries); + + return rc; +} +int xc_get_machine_memory_map(xc_interface *xch, + struct e820entry entries[], + uint32_t max_entries) +{ + int rc; + struct xen_memory_map memmap = { + .nr_entries = max_entries + }; + DECLARE_HYPERCALL_BOUNCE(entries, sizeof(struct e820entry) * max_entries, + XC_HYPERCALL_BUFFER_BOUNCE_OUT); + + if ( !entries || xc_hypercall_bounce_pre(xch, entries) || max_entries <= 1) + return -1; + + + set_xen_guest_handle(memmap.buffer, entries); + + rc = do_memory_op(xch, XENMEM_machine_memory_map, 
&memmap, sizeof(memmap)); + + xc_hypercall_bounce_post(xch, entries); + + return rc ? rc : memmap.nr_entries; +} int xc_domain_set_memmap_limit(xc_interface *xch, uint32_t domid, unsigned long map_limitkb) { - int rc; - struct xen_foreign_memory_map fmap = { - .domid = domid, - .map = { .nr_entries = 1 } - }; - DECLARE_HYPERCALL_BUFFER(struct e820entry, e820); + struct e820entry e820; - e820 = xc_hypercall_buffer_alloc(xch, e820, sizeof(*e820)); + e820.addr = 0; + e820.size = (uint64_t)map_limitkb << 10; + e820.type = E820_RAM; - if ( e820 == NULL ) - { - PERROR("Could not allocate memory for xc_domain_set_memmap_limit hypercall"); - return -1; - } - - e820->addr = 0; - e820->size = (uint64_t)map_limitkb << 10; - e820->type = E820_RAM; - - set_xen_guest_handle(fmap.map.buffer, e820); - - rc = do_memory_op(xch, XENMEM_set_memory_map, &fmap, sizeof(fmap)); - - xc_hypercall_buffer_free(xch, e820); - - return rc; + return xc_domain_set_memory_map(xch, domid, &e820, 1); } #else int xc_domain_set_memmap_limit(xc_interface *xch, diff -r 476b0d68e7d5 tools/libxc/xc_e820.h --- a/tools/libxc/xc_e820.h Sat Apr 30 09:48:16 2011 +0100 +++ b/tools/libxc/xc_e820.h Wed May 04 08:57:40 2011 -0400 @@ -26,6 +26,9 @@ #define E820_RESERVED 2 #define E820_ACPI 3 #define E820_NVS 4 +#define E820_UNUSABLE 5 + +#define E820MAX (128) struct e820entry { uint64_t addr; diff -r 476b0d68e7d5 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Sat Apr 30 09:48:16 2011 +0100 +++ b/tools/libxc/xenctrl.h Wed May 04 08:57:40 2011 -0400 @@ -966,6 +966,17 @@ int xc_domain_set_memmap_limit(xc_interf uint32_t domid, unsigned long map_limitkb); +#if defined(__i386__) || defined(__x86_64__) +#include "xc_e820.h" +int xc_domain_set_memory_map(xc_interface *xch, + uint32_t domid, + struct e820entry entries[], + uint32_t nr_entries); + +int xc_get_machine_memory_map(xc_interface *xch, + struct e820entry entries[], + uint32_t max_entries); +#endif int xc_domain_set_time_offset(xc_interface *xch, uint32_t domid, 
int32_t time_offset_seconds); --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="tools-fix-compile-error-out-of-tree.patch" # HG changeset patch # Parent 5d31bd0eb8d040c0b44fe2a3b737fd752a607e74 libxc: Squash xc_e820.h (and delete) into xenctrl.h .. as there is no need to keep that internal header file anymore. We export two functions xc_domain_[set|get]_memory_map which depend on the 'struct e820entry' defined in 'xc_e820.h'. We move the contents of the 'xc_e820.h' to the 'xenctrl.h' fixing compiler errors when applications outside the Xen tree are trying to compile against the libraries. [backport of c/s 23632] Signed-off-by: Konrad Rzeszutek Wilk diff -r 5d31bd0eb8d0 tools/libxc/xc_core_x86.c --- a/tools/libxc/xc_core_x86.c Wed Jun 01 16:10:50 2011 -0400 +++ b/tools/libxc/xc_core_x86.c Mon Jun 13 13:41:31 2011 -0400 @@ -20,7 +20,7 @@ #include "xg_private.h" #include "xc_core.h" -#include "xc_e820.h" +#include #define GET_FIELD(_p, _f) ((dinfo->guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f)) diff -r 5d31bd0eb8d0 tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Wed Jun 01 16:10:50 2011 -0400 +++ b/tools/libxc/xc_domain_save.c Mon Jun 13 13:41:31 2011 -0400 @@ -32,7 +32,6 @@ #include "xg_save_restore.h" #include -#include "xc_e820.h" /* ** Default values for important tuning parameters. Can override by passing diff -r 5d31bd0eb8d0 tools/libxc/xc_e820.h --- a/tools/libxc/xc_e820.h Wed Jun 01 16:10:50 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -/* - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef __XC_E820_H__ -#define __XC_E820_H__ - -#include - -/* - * PC BIOS standard E820 types and structure. - */ -#define E820_RAM 1 -#define E820_RESERVED 2 -#define E820_ACPI 3 -#define E820_NVS 4 -#define E820_UNUSABLE 5 - -#define E820MAX (128) - -struct e820entry { - uint64_t addr; - uint64_t size; - uint32_t type; -} __attribute__((packed)); - -#endif /* __XC_E820_H__ */ diff -r 5d31bd0eb8d0 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Wed Jun 01 16:10:50 2011 -0400 +++ b/tools/libxc/xenctrl.h Mon Jun 13 13:41:31 2011 -0400 @@ -967,7 +967,22 @@ int xc_domain_set_memmap_limit(xc_interf unsigned long map_limitkb); #if defined(__i386__) || defined(__x86_64__) -#include "xc_e820.h" +/* + * PC BIOS standard E820 types and structure. + */ +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 +#define E820_UNUSABLE 5 + +#define E820MAX (128) + +struct e820entry { + uint64_t addr; + uint64_t size; + uint32_t type; +} __attribute__((packed)); int xc_domain_set_memory_map(xc_interface *xch, uint32_t domid, struct e820entry entries[], --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="xen-e820-pv.patch" # HG changeset patch # User Keir Fraser # Date 1302707426 -3600 # Node ID 3f00c5faa12aed4d0993391f71b7f12cf92f0208 # Parent eb3b40bf0ba21ca518fdc0b4f86cd49228bbb853 x86: make the pv-only e820 array be dynamic. During creation of the PV domain we allocate the E820 structure to have the amount of E820 entries on the machine, plus the number three. This will allow the tool stack to fill the E820 with more than three entries. 
Specifically, for the use case where the toolstack retrieves the E820, sanitizes it, and then sets it for the PV guest (for PCI passthrough), this dynamic number of E820 entries is just right. [backport of c/s 23225] Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Keir Fraser diff -r eb3b40bf0ba2 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Tue Dec 13 14:12:56 2011 -0500 +++ b/xen/arch/x86/domain.c Tue Dec 13 14:15:01 2011 -0500 @@ -557,6 +557,8 @@ int arch_domain_create(struct domain *d, /* 32-bit PV guest by default only if Xen is not 64-bit. */ d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = (CONFIG_PAGING_LEVELS != 4); + + spin_lock_init(&d->arch.e820_lock); } memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids)); @@ -603,6 +605,8 @@ void arch_domain_destroy(struct domain * if ( is_hvm_domain(d) ) hvm_domain_destroy(d); + else + xfree(d->arch.e820); vmce_destroy_msr(d); pci_release_devices(d); diff -r eb3b40bf0ba2 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Tue Dec 13 14:12:56 2011 -0500 +++ b/xen/arch/x86/mm.c Tue Dec 13 14:15:01 2011 -0500 @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -4667,11 +4668,12 @@ long arch_memory_op(int op, XEN_GUEST_HA { struct xen_foreign_memory_map fmap; struct domain *d; + struct e820entry *e820; if ( copy_from_guest(&fmap, arg, 1) ) return -EFAULT; - if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) ) + if ( fmap.map.nr_entries > E820MAX ) return -EINVAL; rc = rcu_lock_target_domain_by_id(fmap.domid, &d); @@ -4685,9 +4687,25 @@ long arch_memory_op(int op, XEN_GUEST_HA return rc; } - rc = copy_from_guest(d->arch.e820, fmap.map.buffer, - fmap.map.nr_entries) ? 
-EFAULT : 0; + e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries); + if ( e820 == NULL ) + { + rcu_unlock_domain(d); + return -ENOMEM; + } + + if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) ) + { + xfree(e820); + rcu_unlock_domain(d); + return -EFAULT; + } + + spin_lock(&d->arch.e820_lock); + xfree(d->arch.e820); + d->arch.e820 = e820; d->arch.nr_e820 = fmap.map.nr_entries; + spin_unlock(&d->arch.e820_lock); rcu_unlock_domain(d); return rc; @@ -4698,18 +4716,28 @@ long arch_memory_op(int op, XEN_GUEST_HA struct xen_memory_map map; struct domain *d = current->domain; - /* Backwards compatibility. */ - if ( d->arch.nr_e820 == 0 ) - return -ENOSYS; - if ( copy_from_guest(&map, arg, 1) ) return -EFAULT; + spin_lock(&d->arch.e820_lock); + + /* Backwards compatibility. */ + if ( (d->arch.nr_e820 == 0) || + (d->arch.e820 == NULL) ) + { + spin_unlock(&d->arch.e820_lock); + return -ENOSYS; + } + map.nr_entries = min(map.nr_entries, d->arch.nr_e820); if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) || copy_to_guest(arg, &map, 1) ) + { + spin_unlock(&d->arch.e820_lock); return -EFAULT; - + } + + spin_unlock(&d->arch.e820_lock); return 0; } diff -r eb3b40bf0ba2 xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Tue Dec 13 14:12:56 2011 -0500 +++ b/xen/include/asm-x86/domain.h Tue Dec 13 14:15:01 2011 -0500 @@ -270,7 +270,8 @@ struct arch_domain unsigned long pirq_eoi_map_mfn; /* Pseudophysical e820 map (XENMEM_memory_map). */ - struct e820entry e820[3]; + spinlock_t e820_lock; + struct e820entry *e820; unsigned int nr_e820; /* Maximum physical-address bitwidth supported by this guest. 
*/ --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=libxl-allocate-e820 # HG changeset patch # Parent 37cd883a764874bd5dfc8cfd6fbe82af1920a77e libxl: Add support for passing in the host's E820 for PCI passthrough The code that populates E820 is unconditionally triggered by the guest configuration having "pci=[',..']", being a PV guest, and if b_info->u.pv.e820_host is set. The code do_domain_create calls the libxl__e820_alloc when it notices that the guest is PV, has at least one PCI device, and has the e820_host flag set. libxl__e820_alloc calls the xc_get_machine_memory_map to retrieve the system's E820. Then the E820 is sanitized to weed out E820 entries below 1MB, and also to remove any E820_RAM or E820_UNUSABLE regions as the guest does not need to know about them. The guest only needs the E820_ACPI, E820_NVS, E820_RESERVED to get an idea of where the PCI I/O space is. Mostly.. The Linux kernel assumes that any gap in the E820 is considered PCI I/O space which means that if we pass in the guest 2GB, and the E820_ACPI, and its friends start at 3GB, the gap between 2GB and 3GB will be considered as PCI I/O space. To guard against that we also create an E820_UNUSABLE between the region of 'target_kb' (called ram_end in the code) up to the first E820_[ACPI,NVS,RESERVED] region. Lastly, the xc_domain_set_memory_map is called to install the new E820. When tested with another PV guest (NetBSD 5.1) the modified E820 gave it no trouble. The code has also been tested with older "classic" Xen Linux and with the newer "pvops" with success (SLES11, RHEL5, Ubuntu Lucid, Debian Squeeze, 2.6.37, 2.6.38, 2.6.39). Memory that is slack or for balloon (so 'maxmem' in guest configuration) is put behind the machine E820. Which in most cases is after the 4GB. 
The reason for doing the fetching of the E820 using the hypercall in the toolstack (instead of the guest doing it) is that when a guest would do a hypercall to 'XENMEM_machine_memory_map' it would retrieve an E820 with I/O range caps added in. Meaning that the region after 4GB up to end of possible memory would be marked as unusable and the kernel would not have any space to allocate a balloon region. [backport of c/s 23426] Signed-off-by: Konrad Rzeszutek Wilk diff -r 37cd883a7648 tools/libxl/libxl.idl --- a/tools/libxl/libxl.idl Wed Apr 13 16:10:26 2011 +0100 +++ b/tools/libxl/libxl.idl Wed Nov 16 16:06:34 2011 -0500 @@ -117,6 +117,7 @@ libxl_domain_build_info = Struct("domain ("cmdline", string), ("ramdisk", libxl_file_reference), ("features", string, True), + ("e820_host", bool, False, "Use host's E820 for PCI passthrough."), ])), ])), ], diff -r 37cd883a7648 tools/libxl/libxl_create.c --- a/tools/libxl/libxl_create.c Wed Apr 13 16:10:26 2011 +0100 +++ b/tools/libxl/libxl_create.c Wed Nov 16 16:06:34 2011 -0500 @@ -540,6 +540,14 @@ static int do_domain_create(libxl_ctx *c for (i = 0; i < d_config->num_pcidevs; i++) libxl__device_pci_add(ctx, domid, &d_config->pcidevs[i], 1); + if (!d_config->c_info.hvm && d_config->b_info.u.pv.e820_host) { + int rc; + rc = libxl__e820_alloc(ctx, domid, d_config); + if (rc) + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, + "Failed while collecting E820 with: %d (errno:%d)\n", + rc, errno); + } if ( cb && (d_config->c_info.hvm || d_config->b_info.u.pv.bootloader )) { if ( (*cb)(ctx, domid, priv) ) goto error_out; diff -r 37cd883a7648 tools/libxl/libxl_internal.h --- a/tools/libxl/libxl_internal.h Wed Apr 13 16:10:26 2011 +0100 +++ b/tools/libxl/libxl_internal.h Wed Nov 16 16:06:34 2011 -0500 @@ -330,4 +330,5 @@ _hidden int libxl__error_set(libxl_ctx * _hidden int libxl__file_reference_map(libxl_file_reference *f); _hidden int libxl__file_reference_unmap(libxl_file_reference *f); +_hidden int libxl__e820_alloc(libxl_ctx *ctx, uint32_t 
domid, libxl_domain_config *d_config); #endif diff -r 37cd883a7648 tools/libxl/libxl_pci.c --- a/tools/libxl/libxl_pci.c Wed Apr 13 16:10:26 2011 +0100 +++ b/tools/libxl/libxl_pci.c Wed Nov 16 16:06:34 2011 -0500 @@ -1079,3 +1079,167 @@ int libxl_device_pci_shutdown(libxl_ctx free(pcidevs); return 0; } + +static const char *e820_names(int type) +{ + switch (type) { + case E820_RAM: return "RAM"; + case E820_RESERVED: return "Reserved"; + case E820_ACPI: return "ACPI"; + case E820_NVS: return "ACPI NVS"; + case E820_UNUSABLE: return "Unusable"; + default: break; + } + return "Unknown"; +} + +static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[], + uint32_t *nr_entries, + unsigned long map_limitkb, + unsigned long balloon_kb) +{ + uint64_t delta_kb = 0, start = 0, start_kb = 0, last = 0, ram_end; + uint32_t i, idx = 0, nr; + struct e820entry e820[E820MAX]; + + if (!src || !map_limitkb || !balloon_kb || !nr_entries) + return ERROR_INVAL; + + nr = *nr_entries; + if (!nr) + return ERROR_INVAL; + + if (nr > E820MAX) + return ERROR_NOMEM; + + /* Weed out anything under 1MB */ + for (i = 0; i < nr; i++) { + if (src[i].addr > 0x100000) + continue; + + src[i].type = 0; + src[i].size = 0; + src[i].addr = -1ULL; + } + + /* Find the lowest and highest entry in E820, skipping over + * undesired entries. */ + start = -1ULL; + last = 0; + for (i = 0; i < nr; i++) { + if ((src[i].type == E820_RAM) || + (src[i].type == E820_UNUSABLE) || + (src[i].type == 0)) + continue; + + start = src[i].addr < start ? src[i].addr : start; + last = src[i].addr + src[i].size > last ? + src[i].addr + src[i].size > last : last; + } + if (start > 1024) + start_kb = start >> 10; + + /* Add the memory RAM region for the guest */ + e820[idx].addr = 0; + e820[idx].size = (uint64_t)map_limitkb << 10; + e820[idx].type = E820_RAM; + + /* .. 
and trim if neccessary */ + if (start_kb && map_limitkb > start_kb) { + delta_kb = map_limitkb - start_kb; + if (delta_kb) + e820[idx].size -= (uint64_t)(delta_kb << 10); + } + /* Note: We don't touch balloon_kb here. Will add it at the end. */ + ram_end = e820[idx].addr + e820[idx].size; + idx ++; + + LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "Memory: %"PRIu64"kB End of RAM: " \ + "0x%"PRIx64" (PFN) Delta: %"PRIu64"kB, PCI start: %"PRIu64"kB " \ + "(0x%"PRIx64" PFN), Balloon %"PRIu64"kB\n", (uint64_t)map_limitkb, + ram_end >> 12, delta_kb, start_kb ,start >> 12, + (uint64_t)balloon_kb); + + /* Check if there is a region between ram_end and start. */ + if (start > ram_end) { + /* .. and if not present, add it in. This is to guard against + the Linux guest assuming that the gap between the end of + RAM region and the start of the E820_[ACPI,NVS,RESERVED] + is PCI I/O space. Which it certainly is _not_. */ + e820[idx].type = E820_UNUSABLE; + e820[idx].addr = ram_end; + e820[idx].size = start - ram_end; + idx++; + } + /* Almost done: copy them over, ignoring the undesireable ones */ + for (i = 0; i < nr; i++) { + if ((src[i].type == E820_RAM) || + (src[i].type == E820_UNUSABLE) || + (src[i].type == 0)) + continue; + + e820[idx].type = src[i].type; + e820[idx].addr = src[i].addr; + e820[idx].size = src[i].size; + idx++; + } + /* At this point we have the mapped RAM + E820 entries from src. */ + if (balloon_kb) { + /* and if we truncated the RAM region, then add it to the end. */ + e820[idx].type = E820_RAM; + e820[idx].addr = (uint64_t)(1ULL << 32) > last ? + (uint64_t)(1ULL << 32) : last; + /* also add the balloon memory to the end. */ + e820[idx].size = (uint64_t)(delta_kb << 10) + + (uint64_t)(balloon_kb << 10); + idx++; + + } + nr = idx; + + for (i = 0; i < nr; i++) { + LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, ":\t[%"PRIx64" -> %"PRIx64"] %s", + e820[i].addr >> 12, (e820[i].addr + e820[i].size) >> 12, + e820_names(e820[i].type)); + } + + /* Done: copy the sanitized version. 
*/ + *nr_entries = nr; + memcpy(src, e820, nr * sizeof(struct e820entry)); + return 0; +} + +int libxl__e820_alloc(libxl_ctx *ctx, uint32_t domid, libxl_domain_config *d_config) +{ + int rc; + uint32_t nr; + struct e820entry map[E820MAX]; + libxl_domain_build_info *b_info; + + if (d_config == NULL || d_config->c_info.hvm) + return ERROR_INVAL; + + b_info = &d_config->b_info; + if (!b_info->u.pv.e820_host) + return ERROR_INVAL; + + rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX); + if (rc < 0) { + errno = rc; + return ERROR_FAIL; + } + nr = rc; + rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb, + (b_info->max_memkb - b_info->target_memkb) + + b_info->u.pv.slack_memkb); + if (rc) + return ERROR_FAIL; + + rc = xc_domain_set_memory_map(ctx->xch, domid, map, nr); + + if (rc < 0) { + errno = rc; + return ERROR_FAIL; + } + return 0; +} diff -r 37cd883a7648 tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c Wed Apr 13 16:10:26 2011 +0100 +++ b/tools/libxl/xl_cmdimpl.c Wed Nov 16 16:06:34 2011 -0500 @@ -371,6 +371,7 @@ static void printf_info(int domid, printf("\t\t\t(kernel %s)\n", b_info->kernel.path); printf("\t\t\t(cmdline %s)\n", b_info->u.pv.cmdline); printf("\t\t\t(ramdisk %s)\n", b_info->u.pv.ramdisk.path); + printf("\t\t\t(e820_host %d)\n", b_info->u.pv.e820_host); printf("\t\t)\n"); } printf("\t)\n"); @@ -1025,6 +1026,8 @@ skip_vfb: if (!libxl_device_pci_parse_bdf(&ctx, pcidev, buf)) d_config->num_pcidevs++; } + if (d_config->num_pcidevs && !c_info->hvm) + b_info->u.pv.e820_host = true; } switch (xlu_cfg_get_list(config, "cpuid", &cpuids, 0, 1)) { --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=libxl_e820_override # HG changeset patch # Parent 615d58973988e505c4cff62a338225152ceaa052 libxl: Add 'e820_host' option to config file. .. which will be removed once the auto-ballooning of guests with PCI devices works. 
During testing of the patches which provide a host E820 in a PV guest, certain inconsistencies were found with guests. When launching a RHEL5 or SLES11 PV guest with 4GB and a PCI device, the kernel would report 4GB, but have 1.5G "used". What happened was that the P2M entries that fall within the E820 I/O holes would never be used and were just wasted. The mechanism to go around this is to shrink the size of the guest before launch (say memory=2048, maxmem=4096) and then balloon back to 4096M after start. For PVOPS type kernels it would detect the E820 I/O holes and deflate by the correct amount but would not inflate back to 4GB. Manually inflating makes it work. The fix in the future for guests where the memory amount flows over the PCI hole, is to launch the guest with decreased amount right up to the cusp of where the E820 PCI hole starts. Also increase the 'maxmem' by the delta and then when the guest has launched, balloon up to the delta number. This will require some careful surgery so for right now this parameter will guard against unsuspecting users seeing their PV guest's memory "vanish." [backport of c/s 23428] Signed-off-by: Konrad Rzeszutek Wilk diff -r 615d58973988 tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c Wed Nov 16 16:06:34 2011 -0500 +++ b/tools/libxl/xl_cmdimpl.c Wed Nov 16 16:06:44 2011 -0500 @@ -1010,6 +1010,16 @@ skip_vfb: if (!xlu_cfg_get_long (config, "pci_power_mgmt", &l)) pci_power_mgmt = l; + /* To be reworked (automatically enabled) once the auto ballooning + * after guest starts is done (with PCI devices passed in). 
*/ + if (!xlu_cfg_get_long (config, "e820_host", &l)) { + if (c_info->hvm) + fprintf(stderr, "Can't do e820_host in HVM mode!"); + else { + if (l) + b_info->u.pv.e820_host = true; + } + } if (!xlu_cfg_get_list (config, "pci", &pcis, 0, 0)) { int i; d_config->num_pcidevs = 0; --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=libxl-fix-unused-ram-e820 # HG changeset patch # Parent d825fb0ba571f9f15c7fe1bb19c85c3568c0bb6f libxl: Convert E820_UNUSABLE and E820_RAM to E820_UNUSABLE as appropriate. Most machines after the RAM regions in the E820 have a couple of E820_RESERVED, with E820_ACPI and E820_NVS. On some Intel machines, the E820 looks like swiss cheese: (XEN) Initial Xen-e820 RAM map: (XEN) 0000000000000000 - 000000000009d000 (usable) (XEN) 000000000009d000 - 00000000000a0000 (reserved) (XEN) 00000000000e0000 - 0000000000100000 (reserved) (XEN) 0000000000100000 - 000000009cf66000 (usable) (XEN) 000000009cf66000 - 000000009d102000 (ACPI NVS) (XEN) 000000009d102000 - 000000009f6bd000 (usable) <-- (XEN) 000000009f6bd000 - 000000009f6bf000 (reserved) (XEN) 000000009f6bf000 - 000000009f714000 (usable) <-- (XEN) 000000009f714000 - 000000009f7bf000 (ACPI NVS) (XEN) 000000009f7bf000 - 000000009f7e0000 (usable) <-- (XEN) 000000009f7e0000 - 000000009f7ff000 (ACPI data) (XEN) 000000009f7ff000 - 000000009f800000 (usable) <-- (XEN) 000000009f800000 - 00000000a0000000 (reserved) (XEN) 00000000a0000000 - 00000000b0000000 (reserved) (XEN) 00000000fc000000 - 00000000fd000000 (reserved) (XEN) 00000000ffe00000 - 0000000100000000 (reserved) (XEN) 0000000100000000 - 0000000160000000 (usable) Which means we have to pay attention to the E820_RAM that are between the E820_[ACPI,NVS,RESERVED]. If we remove those E820_RAM (b/c the amount of memory passed to the guest is less than where those E820 regions reside) from the E820, the Linux kernel interprets those "gaps" as PCI I/O space. This is what we are currently doing. 
This can be disastrous if we pass in an Intel IGD card which tries to use the first available PCI I/O space - and ends up using the MFNs which are actually RAM instead of being the PCI I/O space. To make this work, we convert all E820_RAM that are above the 'target_kb' (those that overlap the 'target_kb' are truncated appropriately) to be E820_UNUSABLE. We also limit this alteration up to 4GB. This means that an E820 for a guest from this (target_kb=1024, maxmem=2048): [ 0.000000] Set 405658 page(s) to 1-1 mapping. [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 00000000000a0000 (usable) [ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved) [ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable) [ 0.000000] Xen: 0000000040000000 - 000000009cf66000 (unusable) [ 0.000000] Xen: 000000009cf66000 - 000000009d102000 (ACPI NVS) [ 0.000000] Xen: 000000009f6bd000 - 000000009f6bf000 (reserved) [ 0.000000] Xen: 000000009f714000 - 000000009f7bf000 (ACPI NVS) [ 0.000000] Xen: 000000009f7e0000 - 000000009f7ff000 (ACPI data) [ 0.000000] Xen: 000000009f800000 - 00000000b0000000 (reserved) [ 0.000000] Xen: 00000000fc000000 - 00000000fd000000 (reserved) [ 0.000000] Xen: 00000000fec00000 - 00000000fec01000 (reserved) [ 0.000000] Xen: 00000000fee00000 - 00000000fee01000 (reserved) [ 0.000000] Xen: 00000000ffe00000 - 0000000100000000 (reserved) [ 0.000000] Xen: 0000000100000000 - 0000000140800000 (usable) will look like so: [ 0.000000] Set 395880 page(s) to 1-1 mapping. 
[ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 00000000000a0000 (usable) [ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved) [ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable) [ 0.000000] Xen: 0000000040000000 - 000000009cf66000 (unusable) [ 0.000000] Xen: 000000009cf66000 - 000000009d102000 (ACPI NVS) [ 0.000000] Xen: 000000009d102000 - 000000009f6bd000 (unusable) [ 0.000000] Xen: 000000009f6bd000 - 000000009f6bf000 (reserved) [ 0.000000] Xen: 000000009f6bf000 - 000000009f714000 (unusable) [ 0.000000] Xen: 000000009f714000 - 000000009f7bf000 (ACPI NVS) [ 0.000000] Xen: 000000009f7bf000 - 000000009f7e0000 (unusable) [ 0.000000] Xen: 000000009f7e0000 - 000000009f7ff000 (ACPI data) [ 0.000000] Xen: 000000009f7ff000 - 000000009f800000 (unusable) [ 0.000000] Xen: 000000009f800000 - 00000000b0000000 (reserved) [ 0.000000] Xen: 00000000fc000000 - 00000000fd000000 (reserved) [ 0.000000] Xen: 00000000fec00000 - 00000000fec01000 (reserved) [ 0.000000] Xen: 00000000fee00000 - 00000000fee01000 (reserved) [ 0.000000] Xen: 00000000ffe00000 - 0000000100000000 (reserved) [ 0.000000] Xen: 0000000100000000 - 0000000140800000 (usable) [backport of c/s 23427] Signed-off-by: Konrad Rzeszutek Wilk diff -r d825fb0ba571 tools/libxl/libxl_pci.c --- a/tools/libxl/libxl_pci.c Wed Nov 16 16:06:44 2011 -0500 +++ b/tools/libxl/libxl_pci.c Wed Nov 16 16:06:50 2011 -0500 @@ -1160,21 +1160,98 @@ static int e820_sanitize(libxl_ctx *ctx, ram_end >> 12, delta_kb, start_kb ,start >> 12, (uint64_t)balloon_kb); + + /* This whole code below is to guard against if the Intel IGD is passed into + * the guest. If we don't pass in IGD, this whole code can be ignored. + * + * The reason for this code is that Intel boxes fill their E820 with + * E820_RAM amongst E820_RESERVED and we can't just ditch those E820_RAM. 
+ * That is b/c any "gaps" in the E820 is considered PCI I/O space by + * Linux and it would be utilized by the Intel IGD as I/O space while + * in reality it was an RAM region. + * + * What this means is that we have to walk the E820 and for any region + * that is RAM and below 4GB and above ram_end, needs to change its type + * to E820_UNUSED. We also need to move some of the E820_RAM regions if + * the overlap with ram_end. */ + for (i = 0; i < nr; i++) { + uint64_t end = src[i].addr + src[i].size; + + /* We don't care about E820_UNUSABLE, but we need to + * change the type to zero b/c the loop after this + * sticks E820_UNUSABLE on the guest's E820 but ignores + * the ones with type zero. */ + if ((src[i].type == E820_UNUSABLE) || + /* Any region that is within the "RAM region" can + * be safely ditched. */ + (end < ram_end)) { + src[i].type = 0; + continue; + } + + /* Look only at RAM regions. */ + if (src[i].type != E820_RAM) + continue; + + /* We only care about RAM regions below 4GB. */ + if (src[i].addr >= (1ULL<<32)) + continue; + + /* E820_RAM overlaps with our RAM region. Move it */ + if (src[i].addr < ram_end) { + uint64_t delta; + + src[i].type = E820_UNUSABLE; + delta = ram_end - src[i].addr; + /* The end < ram_end should weed this out */ + if (src[i].size - delta < 0) + src[i].type = 0; + else { + src[i].size -= delta; + src[i].addr = ram_end; + } + if (src[i].addr + src[i].size != end) { + /* We messed up somewhere */ + src[i].type = 0; + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "Computed E820 wrongly. Continuing on."); + } + } + /* Lastly, convert the RAM to UNSUABLE. Look in the Linux kernel + at git commit 2f14ddc3a7146ea4cd5a3d1ecd993f85f2e4f948 + "xen/setup: Inhibit resource API from using System RAM E820 + gaps as PCI mem gaps" for full explanation. */ + if (end > ram_end) + src[i].type = E820_UNUSABLE; + } + /* Check if there is a region between ram_end and start. 
*/ if (start > ram_end) { + int add_unusable = 1; + for (i = 0; i < nr && add_unusable; i++) { + if (src[i].type != E820_UNUSABLE) + continue; + if (ram_end != src[i].addr) + continue; + if (start != src[i].addr + src[i].size) { + /* there is one, adjust it */ + src[i].size = start - src[i].addr; + } + add_unusable = 0; + } /* .. and if not present, add it in. This is to guard against - the Linux guest assuming that the gap between the end of - RAM region and the start of the E820_[ACPI,NVS,RESERVED] - is PCI I/O space. Which it certainly is _not_. */ - e820[idx].type = E820_UNUSABLE; - e820[idx].addr = ram_end; - e820[idx].size = start - ram_end; - idx++; + the Linux guest assuming that the gap between the end of + RAM region and the start of the E820_[ACPI,NVS,RESERVED] + is PCI I/O space. Which it certainly is _not_. */ + if (add_unusable) { + e820[idx].type = E820_UNUSABLE; + e820[idx].addr = ram_end; + e820[idx].size = start - ram_end; + idx++; + } } /* Almost done: copy them over, ignoring the undesireable ones */ for (i = 0; i < nr; i++) { if ((src[i].type == E820_RAM) || - (src[i].type == E820_UNUSABLE) || (src[i].type == 0)) continue; --/04w6evG8XlLl3ft Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="24013-allows-vcpu_register_for_hvm_domains.patch" # HG changeset patch # User Zhenzhong Duan # Date 1319818821 -3600 # Node ID c4ed56a102dc3a66dc4dda6ed1584cb6531074a2 # Parent 2e16d8e6a965be8f62c99c6e456988cfa72b7e1d x86,hvm: enable VCPUOP_register_vcpu_info op in hvm hypercall pvhvm running with more than 32 vcpus and pv_irq/pv_time enabled need vcpu placement to work, or else it will softlockup. 
Signed-off-by: Zhenzhong Duan Committed-by: Keir Fraser [backport of c/s 24013] Signed-off-by: Konrad Rzeszutek Wilk diff -r 2e16d8e6a965 -r c4ed56a102dc xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Fri Oct 28 17:17:47 2011 +0100 +++ b/xen/arch/x86/hvm/hvm.c Fri Oct 28 17:20:21 2011 +0100 @@ -2794,6 +2794,7 @@ static long hvm_vcpu_op( case VCPUOP_stop_periodic_timer: case VCPUOP_set_singleshot_timer: case VCPUOP_stop_singleshot_timer: + case VCPUOP_register_vcpu_info: rc = do_vcpu_op(cmd, vcpuid, arg); break; default: @@ -2869,6 +2870,7 @@ static long hvm_vcpu_op_compat32( case VCPUOP_stop_periodic_timer: case VCPUOP_set_singleshot_timer: case VCPUOP_stop_singleshot_timer: + case VCPUOP_register_vcpu_info: rc = compat_vcpu_op(cmd, vcpuid, arg); break; default: --/04w6evG8XlLl3ft Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Disposition: inline _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel --/04w6evG8XlLl3ft--