From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from [140.186.70.92] (port=40109 helo=eggs.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1PiET9-0003ob-Vs for qemu-devel@nongnu.org; Wed, 26 Jan 2011 18:08:14 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1PiET0-0002ST-8i for qemu-devel@nongnu.org; Wed, 26 Jan 2011 18:08:00 -0500 Received: from mail-qy0-f180.google.com ([209.85.216.180]:37180) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1PiET0-0002SK-0J for qemu-devel@nongnu.org; Wed, 26 Jan 2011 18:07:58 -0500 Received: by qyk29 with SMTP id 29so1616243qyk.4 for ; Wed, 26 Jan 2011 15:07:57 -0800 (PST) Message-ID: <4D40A946.2020903@codemonkey.ws> Date: Wed, 26 Jan 2011 17:07:50 -0600 From: Anthony Liguori MIME-Version: 1.0 Subject: Re: [Qemu-devel] [PATCH V9 10/16] xen: Introduce the Xen mapcache References: <1295965760-31508-1-git-send-email-anthony.perard@citrix.com> <1295965760-31508-11-git-send-email-anthony.perard@citrix.com> In-Reply-To: <1295965760-31508-11-git-send-email-anthony.perard@citrix.com> Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: anthony.perard@citrix.com Cc: Xen Devel , QEMU-devel , Jun Nakajima , Stefano Stabellini On 01/25/2011 08:29 AM, anthony.perard@citrix.com wrote: > From: Jun Nakajima > > On IA32 host or IA32 PAE host, at present, generally, we can't create > an HVM guest with more than 2G memory, because generally it's almost > impossible for Qemu to find a large enough and consecutive virtual > address space to map an HVM guest's whole physical address space. > The attached patch fixes this issue using dynamic mapping based on > little blocks of memory. > > Each call to qemu_get_ram_ptr makes a call to qemu_map_cache with the > lock option, so mapcache will not unmap these ram_ptr. 
> > Signed-off-by: Jun Nakajima > Signed-off-by: Anthony PERARD > Signed-off-by: Stefano Stabellini > --- > Makefile.target | 3 + > configure | 3 + > exec.c | 40 ++++++- > hw/xen.h | 13 +++ > xen-all.c | 64 +++++++++++ > xen-mapcache-stub.c | 33 ++++++ > xen-mapcache.c | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++ > xen-mapcache.h | 14 +++ > xen-stub.c | 4 + > 9 files changed, 471 insertions(+), 4 deletions(-) > create mode 100644 xen-mapcache-stub.c > create mode 100644 xen-mapcache.c > create mode 100644 xen-mapcache.h > > diff --git a/Makefile.target b/Makefile.target > index 8126da9..18b3959 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -207,9 +207,12 @@ QEMU_CFLAGS += $(VNC_JPEG_CFLAGS) > QEMU_CFLAGS += $(VNC_PNG_CFLAGS) > > # xen support > +CONFIG_NO_XEN_MAPCACHE = $(if $(subst n,,$(CONFIG_XEN_MAPCACHE)),n,y) > obj-$(CONFIG_XEN) += xen_interfaces.o > obj-$(CONFIG_XEN) += xen-all.o > obj-$(CONFIG_NO_XEN) += xen-stub.o > +obj-$(CONFIG_XEN_MAPCACHE) += xen-mapcache.o > +obj-$(CONFIG_NO_XEN_MAPCACHE) += xen-mapcache-stub.o > > # xen backend driver support > obj-$(CONFIG_XEN) += xen_backend.o xen_devconfig.o > diff --git a/configure b/configure > index fde9bad..c9a13e1 100755 > --- a/configure > +++ b/configure > @@ -3069,6 +3069,9 @@ case "$target_arch2" in > echo "CONFIG_XEN=y">> $config_target_mak > echo "LIBS+=$xen_libs">> $config_target_mak > echo "CONFIG_XEN_CTRL_INTERFACE_VERSION=$xen_ctrl_version">> $config_target_mak > + if test "$cpu" = "i386" -o "$cpu" = "x86_64"; then > + echo "CONFIG_XEN_MAPCACHE=y">> $config_target_mak > + fi > fi > esac > case "$target_arch2" in > diff --git a/exec.c b/exec.c > index e950df2..3b137dc 100644 > --- a/exec.c > +++ b/exec.c > @@ -32,6 +32,7 @@ > #include "hw/qdev.h" > #include "osdep.h" > #include "kvm.h" > +#include "hw/xen.h" > #include "qemu-timer.h" > #if defined(CONFIG_USER_ONLY) > #include > @@ -51,6 +52,8 @@ > #include > #endif > #endif > +#else /* !CONFIG_USER_ONLY */ > +#include 
"xen-mapcache.h" > #endif > > //#define DEBUG_TB_INVALIDATE > @@ -2835,6 +2838,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, > } > } > > + new_block->offset = find_ram_offset(size); > if (host) { > new_block->host = host; > } else { > @@ -2856,13 +2860,15 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, > PROT_EXEC|PROT_READ|PROT_WRITE, > MAP_SHARED | MAP_ANONYMOUS, -1, 0); > #else > - new_block->host = qemu_vmalloc(size); > + if (xen_mapcache_enabled()) { > + xen_ram_alloc(new_block->offset, size); > + } else { > + new_block->host = qemu_vmalloc(size); > + } > #endif > qemu_madvise(new_block->host, size, QEMU_MADV_MERGEABLE); > } > } > - > - new_block->offset = find_ram_offset(size); > new_block->length = size; > > QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); > @@ -2903,7 +2909,11 @@ void qemu_ram_free(ram_addr_t addr) > #if defined(TARGET_S390X)&& defined(CONFIG_KVM) > munmap(block->host, block->length); > #else > - qemu_vfree(block->host); > + if (xen_mapcache_enabled()) { > + qemu_invalidate_entry(block->host); > + } else { > + qemu_vfree(block->host); > + } > #endif > } > qemu_free(block); > @@ -2929,6 +2939,15 @@ void *qemu_get_ram_ptr(ram_addr_t addr) > if (addr - block->offset< block->length) { > QLIST_REMOVE(block, next); > QLIST_INSERT_HEAD(&ram_list.blocks, block, next); > + if (xen_mapcache_enabled()) { > + /* We need to check if the requested address is in the RAM > + * because we don't want to map the entire memory in QEMU. > + */ > + if (block->offset == 0) { > + return qemu_map_cache(addr, 0, 1); > + } > + block->host = qemu_map_cache(block->offset, block->length, 1); > + } > return block->host + (addr - block->offset); > } > } > @@ -2964,11 +2983,21 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) > uint8_t *host = ptr; > > QLIST_FOREACH(block,&ram_list.blocks, next) { > + /* This case append when the block is not mapped. 
*/ > + if (block->host == NULL) { > + continue; > + } > if (host - block->host< block->length) { > *ram_addr = block->offset + (host - block->host); > return 0; > } > } > + > + if (xen_mapcache_enabled()) { > + *ram_addr = qemu_ram_addr_from_mapcache(ptr); > + return 0; > + } > + > return -1; > } > > @@ -3879,6 +3908,9 @@ void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len, > if (is_write) { > cpu_physical_memory_write(bounce.addr, bounce.buffer, access_len); > } > + if (xen_enabled()) { > + qemu_invalidate_entry(buffer); > + } > qemu_vfree(bounce.buffer); > bounce.buffer = NULL; > cpu_notify_map_clients(); > diff --git a/hw/xen.h b/hw/xen.h > index 338cf76..dd3fb68 100644 > --- a/hw/xen.h > +++ b/hw/xen.h > @@ -31,10 +31,23 @@ static inline int xen_enabled(void) > #endif > } > > +static inline int xen_mapcache_enabled(void) > +{ > +#ifdef CONFIG_XEN_MAPCACHE > + return xen_enabled(); > +#else > + return 0; > +#endif > +} > + > int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num); > void xen_piix3_set_irq(void *opaque, int irq_num, int level); > void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len); > > int xen_init(int smp_cpus); > > +#if defined(NEED_CPU_H)&& !defined(CONFIG_USER_ONLY) > +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size); > +#endif > + > #endif /* QEMU_HW_XEN_H */ > diff --git a/xen-all.c b/xen-all.c > index 205cbc4..2b9e71c 100644 > --- a/xen-all.c > +++ b/xen-all.c > @@ -13,6 +13,8 @@ > #include "hw/xen_backend.h" > #include "hw/xen_redirect.h" > > +#include "xen-mapcache.h" > + > /* Xen specific function for piix pci */ > > int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num) > @@ -55,6 +57,64 @@ qemu_irq *i8259_xen_init(void) > return qemu_allocate_irqs(i8259_set_irq, NULL, 16); > } > > + > +/* Memory Ops */ > + > +static void xen_ram_init(ram_addr_t ram_size) > +{ > + RAMBlock *new_block; > + ram_addr_t below_4g_mem_size, above_4g_mem_size = 0; > + > + new_block = 
qemu_mallocz(sizeof (*new_block)); > + pstrcpy(new_block->idstr, sizeof (new_block->idstr), "xen.ram"); > + new_block->host = NULL; > + new_block->offset = 0; > + new_block->length = ram_size; > + > + QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); > + > + ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty, > + new_block->length>> TARGET_PAGE_BITS); > + memset(ram_list.phys_dirty + (new_block->offset>> TARGET_PAGE_BITS), > + 0xff, new_block->length>> TARGET_PAGE_BITS); > + > + if (ram_size>= 0xe0000000 ) { > + above_4g_mem_size = ram_size - 0xe0000000; > + below_4g_mem_size = 0xe0000000; > + } else { > + below_4g_mem_size = ram_size; > + } > + > + cpu_register_physical_memory(0, below_4g_mem_size, new_block->offset); > +#if TARGET_PHYS_ADDR_BITS> 32 > + if (above_4g_mem_size> 0) { > + cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size, > + new_block->offset + below_4g_mem_size); > + } > +#endif > +} > + > +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size) > +{ > + unsigned long nr_pfn; > + xen_pfn_t *pfn_list; > + int i; > + > + nr_pfn = size>> TARGET_PAGE_BITS; > + pfn_list = qemu_malloc(sizeof (*pfn_list) * nr_pfn); > + > + for (i = 0; i< nr_pfn; i++) { > + pfn_list[i] = (ram_addr>> TARGET_PAGE_BITS) + i; > + } > + > + if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) { > + hw_error("xen: failed to populate ram at %lx", ram_addr); > + } > + > + qemu_free(pfn_list); > +} > + > + > /* Initialise Xen */ > > int xen_init(int smp_cpus) > @@ -68,5 +128,9 @@ int xen_init(int smp_cpus) > return -1; > } > > + /* Init RAM management */ > + qemu_map_cache_init(); > + xen_ram_init(ram_size); > + > return 0; > } > diff --git a/xen-mapcache-stub.c b/xen-mapcache-stub.c > new file mode 100644 > index 0000000..69ce2e7 > --- /dev/null > +++ b/xen-mapcache-stub.c > @@ -0,0 +1,33 @@ > +#include "config.h" > + > +#include "exec-all.h" > +#include "qemu-common.h" > +#include "cpu-common.h" > +#include "xen-mapcache.h" > + > 
+int qemu_map_cache_init(void) > +{ > + return 0; > +} > + > +uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock) > +{ > + return qemu_get_ram_ptr(phys_addr); > +} > + > +void qemu_map_cache_unlock(void *buffer) > +{ > +} > + > +ram_addr_t qemu_ram_addr_from_mapcache(void *ptr) > +{ > + return -1; > +} > + > +void qemu_invalidate_map_cache(void) > +{ > +} > + > +void qemu_invalidate_entry(uint8_t *buffer) > +{ > +} > diff --git a/xen-mapcache.c b/xen-mapcache.c > new file mode 100644 > index 0000000..3e1cca9 > --- /dev/null > +++ b/xen-mapcache.c > @@ -0,0 +1,301 @@ > Needs a copyright. > +#include "config.h" > + > +#include "hw/xen_backend.h" > +#include "blockdev.h" > + > +#include > +#include > + > +#include "xen-mapcache.h" > + > + > +//#define MAPCACHE_DEBUG > + > +#ifdef MAPCACHE_DEBUG > +# define DPRINTF(fmt, ...) do { \ > + fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \ > +} while (0) > +#else > +# define DPRINTF(fmt, ...) do { } while (0) > +#endif > + > +#if defined(__i386__) > +# define MAX_MCACHE_SIZE 0x40000000 /* 1GB max for x86 */ > +# define MCACHE_BUCKET_SHIFT 16 > +#elif defined(__x86_64__) > +# define MAX_MCACHE_SIZE 0x1000000000 /* 64GB max for x86_64 */ > +# define MCACHE_BUCKET_SHIFT 20 > +#endif > +#define MCACHE_BUCKET_SIZE (1UL<< MCACHE_BUCKET_SHIFT) > This is pretty awful. Set limits based on host address width, not based on i386 vs. x86_64. 
> +#define BITS_PER_LONG (sizeof(long) * 8) > +#define BITS_TO_LONGS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) > +#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] > +#define test_bit(bit, map) \ > + (!!((map)[(bit) / BITS_PER_LONG]& (1UL<< ((bit) % BITS_PER_LONG)))) > + > +typedef struct MapCacheEntry { > + target_phys_addr_t paddr_index; > + uint8_t *vaddr_base; > + DECLARE_BITMAP(valid_mapping, MCACHE_BUCKET_SIZE>> XC_PAGE_SHIFT); > + uint8_t lock; > + struct MapCacheEntry *next; > +} MapCacheEntry; > + > +typedef struct MapCacheRev { > + uint8_t *vaddr_req; > + target_phys_addr_t paddr_index; > + QTAILQ_ENTRY(MapCacheRev) next; > +} MapCacheRev; > + > +typedef struct MapCache { > + MapCacheEntry *entry; > + unsigned long nr_buckets; > + QTAILQ_HEAD(map_cache_head, MapCacheRev) locked_entries; > + > + /* For most cases (>99.9%), the page address is the same. */ > + target_phys_addr_t last_address_index; > + uint8_t *last_address_vaddr; > +} MapCache; > + > +static MapCache *mapcache; > + > + > +int qemu_map_cache_init(void) > +{ > + unsigned long size; > + > + mapcache = qemu_mallocz(sizeof (MapCache)); > + > + QTAILQ_INIT(&mapcache->locked_entries); > + mapcache->last_address_index = -1; > + > + mapcache->nr_buckets = (((MAX_MCACHE_SIZE>> XC_PAGE_SHIFT) + > + (1UL<< (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1)>> > + (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)); > + > + /* > + * Use mmap() directly: lets us allocate a big hash table with no up-front > + * cost in storage space. The OS will allocate memory only for the buckets > + * that we actually use. All others will contain all zeroes. > You don't need to use mmap. malloc() does the same thing. 
> + */ > + size = mapcache->nr_buckets * sizeof (MapCacheEntry); > + size = (size + XC_PAGE_SIZE - 1)& ~(XC_PAGE_SIZE - 1); > + DPRINTF("qemu_map_cache_init, nr_buckets = %lx size %lu\n", mapcache->nr_buckets, size); > + mapcache->entry = mmap(NULL, size, PROT_READ|PROT_WRITE, > + MAP_SHARED|MAP_ANON, -1, 0); > + if (mapcache->entry == MAP_FAILED) { > + return -1; > + } > + > + return 0; > +} > + > +static void qemu_remap_bucket(MapCacheEntry *entry, > + target_phys_addr_t size, > + target_phys_addr_t address_index) > +{ > + uint8_t *vaddr_base; > + xen_pfn_t *pfns; > + int *err; > + unsigned int i, j; > + target_phys_addr_t nb_pfn = size>> XC_PAGE_SHIFT; > + > + pfns = qemu_mallocz(nb_pfn * sizeof (xen_pfn_t)); > + err = qemu_mallocz(nb_pfn * sizeof (int)); > + > + if (entry->vaddr_base != NULL) { > + if (munmap(entry->vaddr_base, size) != 0) { > + perror("unmap fails"); > + exit(-1); > + } > + } > + > + for (i = 0; i< nb_pfn; i++) { > + pfns[i] = (address_index<< (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i; > + } > + > + vaddr_base = xc_map_foreign_bulk(xen_xc, xen_domid, PROT_READ|PROT_WRITE, > + pfns, err, nb_pfn); > + if (vaddr_base == NULL) { > + perror("xc_map_foreign_bulk"); > + exit(-1); > + } > + > + entry->vaddr_base = vaddr_base; > + entry->paddr_index = address_index; > + > + for (i = 0; i< nb_pfn; i += BITS_PER_LONG) { > + unsigned long word = 0; > + if ((i + BITS_PER_LONG)> nb_pfn) { > + j = nb_pfn % BITS_PER_LONG; > + } else { > + j = BITS_PER_LONG; > + } > + while (j> 0) { > + word = (word<< 1) | !err[i + --j]; > + } > + entry->valid_mapping[i / BITS_PER_LONG] = word; > + } > + > + qemu_free(pfns); > + qemu_free(err); > +} > + > +uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock) > +{ > + MapCacheEntry *entry, *pentry = NULL; > + target_phys_addr_t address_index = phys_addr>> MCACHE_BUCKET_SHIFT; > + target_phys_addr_t address_offset = phys_addr& (MCACHE_BUCKET_SIZE - 1); > + > + if (address_index == 
mapcache->last_address_index&& !lock) { > + return mapcache->last_address_vaddr + address_offset; > + } > + > + entry =&mapcache->entry[address_index % mapcache->nr_buckets]; > + > + while (entry&& entry->lock&& entry->paddr_index != address_index&& entry->vaddr_base) { > + pentry = entry; > + entry = entry->next; > + } > + if (!entry) { > + entry = qemu_mallocz(sizeof (MapCacheEntry)); > + pentry->next = entry; > + qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index); > + } else if (!entry->lock) { > + if (!entry->vaddr_base || entry->paddr_index != address_index || > + !test_bit(address_offset>> XC_PAGE_SHIFT, entry->valid_mapping)) { > + qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index); > + } > + } > + > + if (!test_bit(address_offset>> XC_PAGE_SHIFT, entry->valid_mapping)) { > + mapcache->last_address_index = -1; > + return NULL; > + } > + > + mapcache->last_address_index = address_index; > + mapcache->last_address_vaddr = entry->vaddr_base; > + if (lock) { > + MapCacheRev *reventry = qemu_mallocz(sizeof(MapCacheRev)); > + entry->lock++; > + reventry->vaddr_req = mapcache->last_address_vaddr + address_offset; > + reventry->paddr_index = mapcache->last_address_index; > + QTAILQ_INSERT_TAIL(&mapcache->locked_entries, reventry, next); > + } > + > + return mapcache->last_address_vaddr + address_offset; > +} > + > +ram_addr_t qemu_ram_addr_from_mapcache(void *ptr) > +{ > + MapCacheRev *reventry; > + target_phys_addr_t paddr_index; > + int found = 0; > + > + QTAILQ_FOREACH(reventry,&mapcache->locked_entries, next) { > + if (reventry->vaddr_req == ptr) { > + paddr_index = reventry->paddr_index; > + found = 1; > + break; > + } > + } > + if (!found) { > + fprintf(stderr, "qemu_ram_addr_from_mapcache, could not find %p\n", ptr); > + QTAILQ_FOREACH(reventry,&mapcache->locked_entries, next) { > + DPRINTF(" %lx -> %p is present\n", reventry->paddr_index, > + reventry->vaddr_req); > + } > + abort(); > + return 0; > + } > + > + return 
paddr_index<< MCACHE_BUCKET_SHIFT; > +} > + > +void qemu_invalidate_entry(uint8_t *buffer) > +{ > + MapCacheEntry *entry = NULL, *pentry = NULL; > + MapCacheRev *reventry; > + target_phys_addr_t paddr_index; > + int found = 0; > + > + if (mapcache->last_address_vaddr == buffer) { > + mapcache->last_address_index = -1; > + } > + > + QTAILQ_FOREACH(reventry,&mapcache->locked_entries, next) { > + if (reventry->vaddr_req == buffer) { > + paddr_index = reventry->paddr_index; > + found = 1; > + break; > + } > + } > + if (!found) { > + DPRINTF("qemu_invalidate_entry, could not find %p\n", buffer); > + QTAILQ_FOREACH(reventry,&mapcache->locked_entries, next) { > + DPRINTF(" %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req); > + } > + return; > + } > + QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next); > + qemu_free(reventry); > + > + entry =&mapcache->entry[paddr_index % mapcache->nr_buckets]; > + while (entry&& entry->paddr_index != paddr_index) { > + pentry = entry; > + entry = entry->next; > + } > + if (!entry) { > + DPRINTF("Trying to unmap address %p that is not in the mapcache!\n", buffer); > + return; > + } > + entry->lock--; > + if (entry->lock> 0 || pentry == NULL) { > + return; > + } > + > + pentry->next = entry->next; > + if (munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE) != 0) { > + perror("unmap fails"); > + exit(-1); > + } > + qemu_free(entry); > +} > + > +void qemu_invalidate_map_cache(void) > +{ > + unsigned long i; > + MapCacheRev *reventry; > + > + qemu_aio_flush(); > This is bizarre? Why is this needed? Regards, Anthony Liguori