* [Patch] Qemu map cache
@ 2006-12-04 17:33 Cui, Dexuan
2006-12-04 18:26 ` Anthony Liguori
2006-12-04 22:58 ` Anthony Liguori
0 siblings, 2 replies; 8+ messages in thread
From: Cui, Dexuan @ 2006-12-04 17:33 UTC (permalink / raw)
To: xen-devel
[-- Attachment #1: Type: text/plain, Size: 495 bytes --]
On IA32 host or IA32 PAE host, at present, generally, we can't create an
HVM guest with more than 2G memory, because generally it's almost
impossible for Qemu to find a large enough and consecutive virtual
address space to map an HVM guest's whole physical address space.
The attached patch fixes this issue using dynamic mapping based on
little blocks of memory.
-- Dexuan Cui
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
[-- Attachment #2: qemu-map-cache-2.diff --]
[-- Type: application/octet-stream, Size: 8322 bytes --]
diff -r fd28a1b139de tools/ioemu/cpu-defs.h
--- a/tools/ioemu/cpu-defs.h Mon Dec 04 09:29:26 2006 +0000
+++ b/tools/ioemu/cpu-defs.h Tue Dec 05 00:11:49 2006 +0800
@@ -28,6 +28,8 @@
#ifndef TARGET_LONG_BITS
#error TARGET_LONG_BITS must be defined before including this header
#endif
+
+#define TARGET_PHYS_ADDR_BITS 64
#ifndef TARGET_PHYS_ADDR_BITS
#if TARGET_LONG_BITS >= HOST_LONG_BITS
diff -r fd28a1b139de tools/ioemu/target-i386-dm/cpu.h
--- a/tools/ioemu/target-i386-dm/cpu.h Mon Dec 04 09:29:26 2006 +0000
+++ b/tools/ioemu/target-i386-dm/cpu.h Tue Dec 05 00:11:49 2006 +0800
@@ -25,7 +25,8 @@
#ifdef TARGET_X86_64
#define TARGET_LONG_BITS 64
#else
-#define TARGET_LONG_BITS 32
+/* #define TARGET_LONG_BITS 32 */
+#define TARGET_LONG_BITS 64 /* for Qemu map cache */
#endif
/* target supports implicit self modifying code */
diff -r fd28a1b139de tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c Mon Dec 04 09:29:26 2006 +0000
+++ b/tools/ioemu/target-i386-dm/exec-dm.c Tue Dec 05 00:11:49 2006 +0800
@@ -36,6 +36,7 @@
#include "cpu.h"
#include "exec-all.h"
+#include "vl.h"
//#define DEBUG_TB_INVALIDATE
//#define DEBUG_FLUSH
@@ -426,6 +427,12 @@ static inline int paddr_is_ram(target_ph
#endif
}
+#if defined(__i386__) || defined(__x86_64__)
+#define phys_ram_addr(x) (qemu_map_cache(x))
+#elif defined(__ia64__)
+#define phys_ram_addr(x) (phys_ram_base + (x))
+#endif
+
void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
int len, int is_write)
{
@@ -438,7 +445,7 @@ void cpu_physical_memory_rw(target_phys_
l = TARGET_PAGE_SIZE - (addr & ~TARGET_PAGE_MASK);
if (l > len)
l = len;
-
+
io_index = iomem_index(addr);
if (is_write) {
if (io_index) {
@@ -460,9 +467,10 @@ void cpu_physical_memory_rw(target_phys_
}
} else if (paddr_is_ram(addr)) {
/* Reading from RAM */
- memcpy(phys_ram_base + addr, buf, l);
+ ptr = phys_ram_addr(addr);
+ memcpy(ptr, buf, l);
#ifdef __ia64__
- sync_icache((unsigned long)(phys_ram_base + addr), l);
+ sync_icache(ptr, l);
#endif
}
} else {
@@ -485,7 +493,8 @@ void cpu_physical_memory_rw(target_phys_
}
} else if (paddr_is_ram(addr)) {
/* Reading from RAM */
- memcpy(buf, phys_ram_base + addr, l);
+ ptr = phys_ram_addr(addr);
+ memcpy(buf, ptr, l);
} else {
/* Neither RAM nor known MMIO space */
memset(buf, 0xff, len);
diff -r fd28a1b139de tools/ioemu/vl.c
--- a/tools/ioemu/vl.c Mon Dec 04 09:29:26 2006 +0000
+++ b/tools/ioemu/vl.c Tue Dec 05 00:11:49 2006 +0800
@@ -5808,6 +5808,101 @@ int set_mm_mapping(int xc_handle, uint32
return 0;
}
+static xen_pfn_t *page_array;
+
+#if defined(__i386__) || defined(__x86_64__)
+static struct map_cache *mapcache_entry;
+static unsigned long nr_buckets;
+
+static inline struct map_cache* get_hash_bucket(target_phys_addr_t addr)
+{
+ return &mapcache_entry[(addr >> MCACHE_BUCKET_SHIFT) % nr_buckets];
+}
+
+/*
+ * For most cases (>99.9%), the page address is the same.
+ */
+static target_phys_addr_t last_bucket = ~0LL;
+static uint8_t *last_bucket_mapped;
+
+#define likely(x) __builtin_expect((x),1)
+
+static int qemu_map_cache_init(unsigned long nr_pages)
+{
+ unsigned long max_pages = MAX_MCACHE_SIZE >> PAGE_SHIFT;
+ int i;
+
+ if ( nr_pages < max_pages )
+ max_pages = nr_pages;
+
+ nr_buckets = (max_pages << PAGE_SHIFT) >> MCACHE_BUCKET_SHIFT;
+
+ fprintf(logfile, "qemu_map_cache_init nr_buckets = %lx\n", nr_buckets);
+
+ mapcache_entry =
+ (struct map_cache *)malloc(nr_buckets * sizeof(struct map_cache));
+ if ( mapcache_entry == NULL )
+ return errno = ENOMEM;
+
+ memset(mapcache_entry, 0, nr_buckets * sizeof(struct map_cache));
+
+ /*
+ * To avoid ENOMEM from xc_map_foreign_batch() at runtime, we
+ * pre-fill all the map caches in advance.
+ */
+ for (i = 0; i < nr_buckets; i++)
+ (void)qemu_map_cache( ((target_phys_addr_t)i) << MCACHE_BUCKET_SHIFT );
+
+ return 0;
+}
+
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr)
+{
+ struct map_cache *entry;
+ target_phys_addr_t bucket = phys_addr & MCACHE_BUCKET_MASK;
+ target_phys_addr_t offset = phys_addr & ~MCACHE_BUCKET_MASK;
+
+ if ( likely(bucket == last_bucket) )
+ return last_bucket_mapped + offset;
+
+ entry = get_hash_bucket(bucket);
+
+ if ( entry->mc_addr == 0 || entry->phys_addr != bucket )
+ {
+ /* We need to remap the existing mapping. First unmap if existing
+ * mapping is there.
+ */
+ uint8_t *mc_addr;
+ unsigned long pfn = bucket >> PAGE_SHIFT;
+
+ if ( entry->mc_addr ) {
+ errno = munmap(entry->mc_addr, MCACHE_BUCKET_SIZE);
+ if ( errno ) {
+ fprintf(logfile, "unmap fails %d\n", errno);
+ exit(-1);
+ }
+ }
+
+ mc_addr = xc_map_foreign_batch(xc_handle, domid, PROT_READ|PROT_WRITE,
+ &page_array[pfn], (MCACHE_BUCKET_SIZE >> PAGE_SHIFT));
+ if(mc_addr == 0) {
+ fprintf(logfile, "xc_map_foreign_batch returned error %d\n", errno);
+ exit(-1);
+ }
+
+ entry->mc_addr = mc_addr;
+ entry->phys_addr = bucket;
+ entry->map_count++;
+ }
+
+ last_bucket = bucket;
+ last_bucket_mapped = entry->mc_addr;
+
+ return last_bucket_mapped + offset;
+}
+#endif
+
+
int main(int argc, char **argv)
{
#ifdef CONFIG_GDBSTUB
@@ -5842,7 +5937,6 @@ int main(int argc, char **argv)
char usb_devices[MAX_USB_CMDLINE][128];
int usb_devices_index;
unsigned long nr_pages, tmp_nr_pages, shared_page_nr;
- xen_pfn_t *page_array;
extern void *shared_page;
extern void *buffered_io_page;
@@ -6130,6 +6224,7 @@ int main(int argc, char **argv)
break;
case QEMU_OPTION_m:
ram_size = atol(optarg) * 1024 * 1024;
+ ram_size = (uint64_t)atol(optarg) * 1024 * 1024;
if (ram_size <= 0)
help();
#ifndef CONFIG_DM
@@ -6410,11 +6505,9 @@ int main(int argc, char **argv)
for ( i = 0; i < tmp_nr_pages; i++)
page_array[i] = i;
- phys_ram_base = xc_map_foreign_batch(xc_handle, domid,
- PROT_READ|PROT_WRITE, page_array,
- tmp_nr_pages);
- if (phys_ram_base == NULL) {
- fprintf(logfile, "batch map guest memory returned error %d\n", errno);
+ if ( qemu_map_cache_init(tmp_nr_pages) )
+ {
+ fprintf(logfile, "qemu_map_cache_init returned: error %d\n", errno);
exit(-1);
}
@@ -6440,7 +6533,8 @@ int main(int argc, char **argv)
fprintf(logfile, "buffered io page at pfn:%lx, mfn: %"PRIx64"\n",
shared_page_nr - 2, (uint64_t)(page_array[shared_page_nr - 2]));
- free(page_array);
+ /* free(page_array); */
+ /* Qemu map cache needs page_array to get mfn from pfn. */
#elif defined(__ia64__)
diff -r fd28a1b139de tools/ioemu/vl.h
--- a/tools/ioemu/vl.h Mon Dec 04 09:29:26 2006 +0000
+++ b/tools/ioemu/vl.h Tue Dec 05 00:11:49 2006 +0800
@@ -156,6 +156,29 @@ extern void *shared_vram;
extern FILE *logfile;
+
+#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
+#define MAX_MCACHE_SIZE 0x40000000 /* 1GB max for x86 */
+#define MCACHE_BUCKET_SHIFT 16
+#elif defined(__x86_64__)
+#define MAX_MCACHE_SIZE 0x1000000000 /* 64GB max for x86_64 */
+#define MCACHE_BUCKET_SHIFT 20
+#endif
+
+#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
+#define MCACHE_BUCKET_MASK ~(MCACHE_BUCKET_SIZE - 1)
+
+struct map_cache
+{
+ target_phys_addr_t phys_addr; /* MCACHE_BUCKET_SIZE unit */
+ uint8_t *mc_addr; /* virtual address */
+ unsigned int map_count; /* how many times remapped? */
+};
+
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr);
+#endif
+
extern int xc_handle;
extern int domid;
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Patch] Qemu map cache
2006-12-04 17:33 [Patch] Qemu map cache Cui, Dexuan
@ 2006-12-04 18:26 ` Anthony Liguori
2006-12-04 22:58 ` Anthony Liguori
1 sibling, 0 replies; 8+ messages in thread
From: Anthony Liguori @ 2006-12-04 18:26 UTC (permalink / raw)
To: Cui, Dexuan; +Cc: xen-devel
Cui, Dexuan wrote:
> On IA32 host or IA32 PAE host, at present, generally, we can't create an
> HVM guest with more than 2G memory, because generally it's almost
> impossible for Qemu to find a large enough and consecutive virtual
> address space to map an HVM guest's whole physical address space.
> The attached patch fixes this issue using dynamic mapping based on
> little blocks of memory.
I have a feeling this patch is going to require quite a lot more work
for V2E. Have you guys considered implementing this functionality for
upstream QEMU? The Xen specific portions should be small and upstream
QEMU has the same problem.
Regards,
Anthony Liguori
> -- Dexuan Cui
>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Patch] Qemu map cache
2006-12-04 17:33 [Patch] Qemu map cache Cui, Dexuan
2006-12-04 18:26 ` Anthony Liguori
@ 2006-12-04 22:58 ` Anthony Liguori
2006-12-05 20:28 ` Ian Pratt
1 sibling, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2006-12-04 22:58 UTC (permalink / raw)
To: Cui, Dexuan; +Cc: xen-devel
Cui, Dexuan wrote:
> On IA32 host or IA32 PAE host, at present, generally, we can't create an
> HVM guest with more than 2G memory, because generally it's almost
> impossible for Qemu to find a large enough and consecutive virtual
> address space to map an HVM guest's whole physical address space.
> The attached patch fixes this issue using dynamic mapping based on
> little blocks of memory.
>
(Sorry if this comes through twice).
Have you considered doing something similar for mainline QEMU? The
reason I ask is that V2E pulls in all of the dynamic translation code.
My initial reaction is that doing map cache will require a significant
amount of change to the dynamic translation bits since we can no longer
make the assumption that memory can be accessed directly. I don't fully
have my head around it yet, but this may involve lots of nastiness with
keeping track of which TB's reference what memory and invalidating those
TBs when map cache references are invalidated. The QEMU TLB may
simplify some of this but I'm not entirely sure.
Have you given this any thought?
Thanks,
Anthony Liguori
> -- Dexuan Cui
>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [Patch] Qemu map cache
2006-12-04 22:58 ` Anthony Liguori
@ 2006-12-05 20:28 ` Ian Pratt
2006-12-05 20:48 ` Anthony Liguori
0 siblings, 1 reply; 8+ messages in thread
From: Ian Pratt @ 2006-12-05 20:28 UTC (permalink / raw)
To: Anthony Liguori, Cui, Dexuan; +Cc: xen-devel
> Have you considered doing something similar for mainline QEMU? The
> reason I ask is that V2E pulls in all of the dynamic translation code.
> My initial reaction is that doing map cache will require a significant
> amount of change to the dynamic translation bits since we can no longer
> make the assumption that memory can be accessed directly. I don't
> fully have my head around it yet, but this may involve lots of
> nastiness with keeping track of which TB's reference what memory and
> invalidating those TBs when map cache references are invalidated. The
> QEMU TLB may simplify some of this but I'm not entirely sure.
>
> Have you given this any thought?
Being able to invalidate (sections of) qemu's mappings (at least
asynchronously) is essential to allow the balloon driver to work for HVM
guests. V2E is going to have to bite the bullet on this one.
Of course, in a 64b environment the map cache can be direct mapped, but
you still need the ability to do invalidates. BTW: I'm comfortable if
V2E only works on 64b. Sooner or later there's going to be some new
feature which isn't supported on Yonah...
Ian
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Patch] Qemu map cache
2006-12-05 20:28 ` Ian Pratt
@ 2006-12-05 20:48 ` Anthony Liguori
2006-12-06 0:33 ` Ian Pratt
0 siblings, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2006-12-05 20:48 UTC (permalink / raw)
To: Ian Pratt; +Cc: xen-devel, Cui, Dexuan
Ian Pratt wrote:
>> Have you considered doing something similar for mainline QEMU? The
>> reason I ask is that V2E pulls in all of the dynamic translation code.
>> My initial reaction is that doing map cache will require a significant
>> amount of change to the dynamic translation bits since we can no longer
>> make the assumption that memory can be accessed directly. I don't
>> fully have my head around it yet, but this may involve lots of
>> nastiness with keeping track of which TB's reference what memory and
>> invalidating those TBs when map cache references are invalidated. The
>> QEMU TLB may simplify some of this but I'm not entirely sure.
>>
>> Have you given this any thought?
>>
>
> Being able to invalidate (sections of) qemu's mappings (at least
> asynchronously) is essential to allow the balloon driver to work for HVM
> guests.
To be able to change portions of the physical memory mapping right? You
don't strictly need a map cache for this (you can simply remap portions
of the address space). You really only need the map cache for > 2GB
guests (which admittedly could be a ballooned down guest that started
out > 2GB).
> V2E is going to have to bite the bullet on this one.
>
> Of course, in a 64b environment the map cache can be direct mapped, but
> you still need the ability to do invalidates. BTW: I'm comfortable if
> V2E only works on 64b.
We may be able to work around this using one of QEMU's TLB. If the map
cache goes in for 3.0.4, then we can look at just supporting 64 bit for
3.0.5 and fixing 32 bit post 3.0.5 (if that's necessary). Sound like a
reasonable plan?
Regards,
Anthony Liguori
> Sooner or later there's going to be some new
> feature which isn't supported on Yonah...
>
> Ian
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [Patch] Qemu map cache
2006-12-05 20:48 ` Anthony Liguori
@ 2006-12-06 0:33 ` Ian Pratt
0 siblings, 0 replies; 8+ messages in thread
From: Ian Pratt @ 2006-12-06 0:33 UTC (permalink / raw)
To: Anthony Liguori, Ian Pratt; +Cc: xen-devel, Cui, Dexuan
> > Being able to invalidate (sections of) qemu's mappings (at least
> > asynchronously) is essential to allow the balloon driver to work for
> > HVM guests.
>
> To be able to change portions of the physical memory mapping right?
> You don't strictly need a map cache for this (you can simply remap
> portions of the address space).
Yes (where remap includes 'unmap')
> You really only need the map cache for > 2GB
> guests (which admittedly could be a ballooned down guest that started
> out > 2GB).
One could argue that for large memory guests having a mapping of all of
guest memory is 'wasteful' anyhow, not that page tables take up that
much space. It's probably good hygiene to only map what you need.
> > V2E is going to have to bite the bullet on this one.
> >
> > Of course, in a 64b environment the map cache can be direct mapped,
> > but you still need the ability to do invalidates. BTW: I'm
> > comfortable if V2E only works on 64b.
>
> We may be able to work around this using one of QEMU's TLB. If the
> map cache goes in for 3.0.4, then we can look at just supporting 64
> bit for 3.0.5 and fixing 32 bit post 3.0.5 (if that's necessary).
> Sound like a reasonable plan?
If necessary, however I still like the mapcache approach. I think
running a 32b dom0 on a 64b hypervisor is actually going to be a pretty
common way of running things (likely gives best performance).
Ian
> Regards,
>
> Anthony Liguori
>
> > Sooner or later there's going to be some new
> > feature which isn't supported on Yonah...
> >
> > Ian
> >
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [Patch] Qemu map cache
@ 2006-12-07 2:08 Cui, Dexuan
2006-12-07 10:35 ` Keir Fraser
0 siblings, 1 reply; 8+ messages in thread
From: Cui, Dexuan @ 2006-12-07 2:08 UTC (permalink / raw)
To: xen-devel
Hi Keir,
There is a minor mistake in the patch I posted previously.
The "1UL" in the following line should be "1ULL":
#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
Sorry for that.
Btw, it seems the patch has not been checked in. Could you give some comments?
Thanks!
-- Dexuan
-----Original Message-----
From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Cui, Dexuan
Sent: 2006年12月5日 1:34
To: xen-devel@lists.xensource.com
Subject: [Xen-devel] [Patch] Qemu map cache
On IA32 host or IA32 PAE host, at present, generally, we can't create an
HVM guest with more than 2G memory, because generally it's almost
impossible for Qemu to find a large enough and consecutive virtual
address space to map an HVM guest's whole physical address space.
The attached patch fixes this issue using dynamic mapping based on
little blocks of memory.
-- Dexuan Cui
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Patch] Qemu map cache
2006-12-07 2:08 Cui, Dexuan
@ 2006-12-07 10:35 ` Keir Fraser
0 siblings, 0 replies; 8+ messages in thread
From: Keir Fraser @ 2006-12-07 10:35 UTC (permalink / raw)
To: Cui, Dexuan, xen-devel
On 7/12/06 02:08, "Cui, Dexuan" <dexuan.cui@intel.com> wrote:
> There is a minor mistake in the patch I posted previously.
> The "1UL" in the following line should be "1ULL":
> #define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
>
> Sorry for that.
>
> Btw, it seems the patch has not been checked in. Could you give some comments?
I'm reworking it a bit. It'll go in this morning.
-- Keir
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2006-12-07 10:35 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-12-04 17:33 [Patch] Qemu map cache Cui, Dexuan
2006-12-04 18:26 ` Anthony Liguori
2006-12-04 22:58 ` Anthony Liguori
2006-12-05 20:28 ` Ian Pratt
2006-12-05 20:48 ` Anthony Liguori
2006-12-06 0:33 ` Ian Pratt
-- strict thread matches above, loose matches on Subject: below --
2006-12-07 2:08 Cui, Dexuan
2006-12-07 10:35 ` Keir Fraser
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.