* [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
@ 2013-11-07 16:14 Paolo Bonzini
2013-11-07 16:14 ` [Qemu-devel] [PATCH 1/2] split definitions for exec.c and translate-all.c radix trees Paolo Bonzini
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-07 16:14 UTC (permalink / raw)
To: qemu-devel; +Cc: marcel.a, lcapitulino, mst
This fixes the problems with the misalignment of the master abort region.
See patch 2 for details, patch 1 is just a preparatory search-and-replace
patch.
Paolo Bonzini (2):
split definitions for exec.c and translate-all.c radix trees
exec: make address spaces 64-bit wide
exec.c | 28 ++++++++++++++++------------
translate-all.c | 32 ++++++++++++++++++--------------
translate-all.h | 7 -------
3 files changed, 34 insertions(+), 33 deletions(-)
--
1.8.4.2
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 1/2] split definitions for exec.c and translate-all.c radix trees
2013-11-07 16:14 [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes Paolo Bonzini
@ 2013-11-07 16:14 ` Paolo Bonzini
2013-11-07 16:14 ` [Qemu-devel] [PATCH 2/2] exec: make address spaces 64-bit wide Paolo Bonzini
2013-11-07 16:21 ` [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes Michael S. Tsirkin
2 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-07 16:14 UTC (permalink / raw)
To: qemu-devel; +Cc: marcel.a, lcapitulino, mst
The exec.c and translate-all.c radix trees are quite different, and
the exec.c one in particular is not limited to the CPU---it can also be
used by devices that do DMA, and in that case the address space
is not limited to TARGET_PHYS_ADDR_SPACE_BITS bits.
We want to make exec.c's radix trees 64-bit wide. As a first step,
stop sharing the constants between exec.c and translate-all.c.
exec.c gets P_L2_* constants, translate-all.c gets V_L2_*, for
consistency with the existing V_L1_* symbols. Though actually
in the softmmu case translate-all.c is also indexed by physical
addresses...
This patch has no semantic change.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
exec.c | 28 ++++++++++++++++++----------
translate-all.c | 32 ++++++++++++++++++--------------
translate-all.h | 7 -------
3 files changed, 36 insertions(+), 31 deletions(-)
diff --git a/exec.c b/exec.c
index 79610ce..9e2fc4b 100644
--- a/exec.c
+++ b/exec.c
@@ -88,7 +88,15 @@ struct PhysPageEntry {
uint16_t ptr : 15;
};
-typedef PhysPageEntry Node[L2_SIZE];
+/* Size of the L2 (and L3, etc) page tables. */
+#define ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
+
+#define P_L2_BITS 10
+#define P_L2_SIZE (1 << P_L2_BITS)
+
+#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
+
+typedef PhysPageEntry Node[P_L2_SIZE];
struct AddressSpaceDispatch {
/* This is a multi-level map on the physical address space.
@@ -155,7 +163,7 @@ static uint16_t phys_map_node_alloc(void)
ret = next_map.nodes_nb++;
assert(ret != PHYS_MAP_NODE_NIL);
assert(ret != next_map.nodes_nb_alloc);
- for (i = 0; i < L2_SIZE; ++i) {
+ for (i = 0; i < P_L2_SIZE; ++i) {
next_map.nodes[ret][i].is_leaf = 0;
next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
}
@@ -168,13 +176,13 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
{
PhysPageEntry *p;
int i;
- hwaddr step = (hwaddr)1 << (level * L2_BITS);
+ hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
lp->ptr = phys_map_node_alloc();
p = next_map.nodes[lp->ptr];
if (level == 0) {
- for (i = 0; i < L2_SIZE; i++) {
+ for (i = 0; i < P_L2_SIZE; i++) {
p[i].is_leaf = 1;
p[i].ptr = PHYS_SECTION_UNASSIGNED;
}
@@ -182,9 +190,9 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
} else {
p = next_map.nodes[lp->ptr];
}
- lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
+ lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
- while (*nb && lp < &p[L2_SIZE]) {
+ while (*nb && lp < &p[P_L2_SIZE]) {
if ((*index & (step - 1)) == 0 && *nb >= step) {
lp->is_leaf = true;
lp->ptr = leaf;
@@ -218,7 +226,7 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
return &sections[PHYS_SECTION_UNASSIGNED];
}
p = nodes[lp.ptr];
- lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
+ lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
}
return &sections[lp.ptr];
}
@@ -1742,11 +1750,11 @@ static void memory_map_init(void)
{
system_memory = g_malloc(sizeof(*system_memory));
- assert(TARGET_PHYS_ADDR_SPACE_BITS <= 64);
+ assert(ADDR_SPACE_BITS <= 64);
memory_region_init(system_memory, NULL, "system",
- TARGET_PHYS_ADDR_SPACE_BITS == 64 ?
- UINT64_MAX : (0x1ULL << TARGET_PHYS_ADDR_SPACE_BITS));
+ ADDR_SPACE_BITS == 64 ?
+ UINT64_MAX : (0x1ULL << ADDR_SPACE_BITS));
address_space_init(&address_space_memory, system_memory, "memory");
system_io = g_malloc(sizeof(*system_io));
diff --git a/translate-all.c b/translate-all.c
index aeda54d..1c63d78 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -96,12 +96,16 @@ typedef struct PageDesc {
# define L1_MAP_ADDR_SPACE_BITS TARGET_VIRT_ADDR_SPACE_BITS
#endif
+/* Size of the L2 (and L3, etc) page tables. */
+#define V_L2_BITS 10
+#define V_L2_SIZE (1 << V_L2_BITS)
+
/* The bits remaining after N lower levels of page tables. */
#define V_L1_BITS_REM \
- ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
+ ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS)
#if V_L1_BITS_REM < 4
-#define V_L1_BITS (V_L1_BITS_REM + L2_BITS)
+#define V_L1_BITS (V_L1_BITS_REM + V_L2_BITS)
#else
#define V_L1_BITS V_L1_BITS_REM
#endif
@@ -395,18 +399,18 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1));
/* Level 2..N-1. */
- for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
+ for (i = V_L1_SHIFT / V_L2_BITS - 1; i > 0; i--) {
void **p = *lp;
if (p == NULL) {
if (!alloc) {
return NULL;
}
- ALLOC(p, sizeof(void *) * L2_SIZE);
+ ALLOC(p, sizeof(void *) * V_L2_SIZE);
*lp = p;
}
- lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
+ lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
}
pd = *lp;
@@ -414,13 +418,13 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
if (!alloc) {
return NULL;
}
- ALLOC(pd, sizeof(PageDesc) * L2_SIZE);
+ ALLOC(pd, sizeof(PageDesc) * V_L2_SIZE);
*lp = pd;
}
#undef ALLOC
- return pd + (index & (L2_SIZE - 1));
+ return pd + (index & (V_L2_SIZE - 1));
}
static inline PageDesc *page_find(tb_page_addr_t index)
@@ -655,14 +659,14 @@ static void page_flush_tb_1(int level, void **lp)
if (level == 0) {
PageDesc *pd = *lp;
- for (i = 0; i < L2_SIZE; ++i) {
+ for (i = 0; i < V_L2_SIZE; ++i) {
pd[i].first_tb = NULL;
invalidate_page_bitmap(pd + i);
}
} else {
void **pp = *lp;
- for (i = 0; i < L2_SIZE; ++i) {
+ for (i = 0; i < V_L2_SIZE; ++i) {
page_flush_tb_1(level - 1, pp + i);
}
}
@@ -673,7 +677,7 @@ static void page_flush_tb(void)
int i;
for (i = 0; i < V_L1_SIZE; i++) {
- page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+ page_flush_tb_1(V_L1_SHIFT / V_L2_BITS - 1, l1_map + i);
}
}
@@ -1600,7 +1604,7 @@ static int walk_memory_regions_1(struct walk_memory_regions_data *data,
if (level == 0) {
PageDesc *pd = *lp;
- for (i = 0; i < L2_SIZE; ++i) {
+ for (i = 0; i < V_L2_SIZE; ++i) {
int prot = pd[i].flags;
pa = base | (i << TARGET_PAGE_BITS);
@@ -1614,9 +1618,9 @@ static int walk_memory_regions_1(struct walk_memory_regions_data *data,
} else {
void **pp = *lp;
- for (i = 0; i < L2_SIZE; ++i) {
+ for (i = 0; i < V_L2_SIZE; ++i) {
pa = base | ((abi_ulong)i <<
- (TARGET_PAGE_BITS + L2_BITS * level));
+ (TARGET_PAGE_BITS + V_L2_BITS * level));
rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
if (rc != 0) {
return rc;
@@ -1639,7 +1643,7 @@ int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
for (i = 0; i < V_L1_SIZE; i++) {
int rc = walk_memory_regions_1(&data, (abi_ulong)i << V_L1_SHIFT,
- V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+ V_L1_SHIFT / V_L2_BITS - 1, l1_map + i);
if (rc != 0) {
return rc;
diff --git a/translate-all.h b/translate-all.h
index 5c38819..f7e5932 100644
--- a/translate-all.h
+++ b/translate-all.h
@@ -19,13 +19,6 @@
#ifndef TRANSLATE_ALL_H
#define TRANSLATE_ALL_H
-/* Size of the L2 (and L3, etc) page tables. */
-#define L2_BITS 10
-#define L2_SIZE (1 << L2_BITS)
-
-#define P_L2_LEVELS \
- (((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / L2_BITS) + 1)
-
/* translate-all.c */
void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
void cpu_unlink_tb(CPUState *cpu);
--
1.8.4.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 2/2] exec: make address spaces 64-bit wide
2013-11-07 16:14 [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes Paolo Bonzini
2013-11-07 16:14 ` [Qemu-devel] [PATCH 1/2] split definitions for exec.c and translate-all.c radix trees Paolo Bonzini
@ 2013-11-07 16:14 ` Paolo Bonzini
2013-11-10 10:31 ` Michael S. Tsirkin
2013-11-07 16:21 ` [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes Michael S. Tsirkin
2 siblings, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-07 16:14 UTC (permalink / raw)
To: qemu-devel; +Cc: marcel.a, lcapitulino, mst
As an alternative to commit 818f86b (exec: limit system memory
size, 2013-11-04) let's just make all address spaces 64-bit wide.
This eliminates problems with phys_page_find ignoring bits above
TARGET_PHYS_ADDR_SPACE_BITS and address_space_translate_internal
consequently messing up the computations.
In Luiz's reported crash, at startup gdb attempts to read from address
0xffffffffffffffe6 to 0xffffffffffffffff inclusive. The region it gets
is the newly introduced master abort region, which is as big as the PCI
address space (see pci_bus_init). Due to a typo that's only 2^63-1,
not 2^64. But we get it anyway because phys_page_find ignores the upper
bits of the physical address. In address_space_translate_internal then
diff = int128_sub(section->mr->size, int128_make64(addr));
*plen = int128_get64(int128_min(diff, int128_make64(*plen)));
diff becomes negative, and int128_get64 booms.
The size of the PCI address space region should be fixed anyway.
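As a self-contained illustration of the failure mode (an editorial sketch
only, not part of the patch; the Int128 helpers are the real ones from
qemu/int128.h used in the snippet above, and the constants are simply the
values from Luiz's report and the mis-sized master abort region):

    /* Sketch: the master abort region is 2^63 - 1 bytes instead of 2^64,
     * while the offset derived from gdb's read is close to 2^64.          */
    Int128 size = int128_make64(0x7fffffffffffffffULL);        /* 2^63 - 1 */
    Int128 diff = int128_sub(size, int128_make64(0xffffffffffffffe6ULL));
    /* diff is negative, so it no longer fits in an unsigned 64-bit value
     * and the assertion inside int128_get64() fires -- the boom above.    */
    uint64_t len = int128_get64(diff);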
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
exec.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/exec.c b/exec.c
index 9e2fc4b..d5ce3da 100644
--- a/exec.c
+++ b/exec.c
@@ -89,7 +89,7 @@ struct PhysPageEntry {
};
/* Size of the L2 (and L3, etc) page tables. */
-#define ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
+#define ADDR_SPACE_BITS 64
#define P_L2_BITS 10
#define P_L2_SIZE (1 << P_L2_BITS)
@@ -1750,11 +1750,7 @@ static void memory_map_init(void)
{
system_memory = g_malloc(sizeof(*system_memory));
- assert(ADDR_SPACE_BITS <= 64);
-
- memory_region_init(system_memory, NULL, "system",
- ADDR_SPACE_BITS == 64 ?
- UINT64_MAX : (0x1ULL << ADDR_SPACE_BITS));
+ memory_region_init(system_memory, NULL, "system", UINT64_MAX);
address_space_init(&address_space_memory, system_memory, "memory");
system_io = g_malloc(sizeof(*system_io));
--
1.8.4.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 16:14 [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes Paolo Bonzini
2013-11-07 16:14 ` [Qemu-devel] [PATCH 1/2] split definitions for exec.c and translate-all.c radix trees Paolo Bonzini
2013-11-07 16:14 ` [Qemu-devel] [PATCH 2/2] exec: make address spaces 64-bit wide Paolo Bonzini
@ 2013-11-07 16:21 ` Michael S. Tsirkin
2013-11-07 16:29 ` Paolo Bonzini
2 siblings, 1 reply; 13+ messages in thread
From: Michael S. Tsirkin @ 2013-11-07 16:21 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: marcel.a, qemu-devel, lcapitulino
On Thu, Nov 07, 2013 at 05:14:35PM +0100, Paolo Bonzini wrote:
> This fixes the problems with the misalignment of the master abort region.
> See patch 2 for details, patch 1 is just a preparatory search-and-replace
> patch.
>
> Paolo Bonzini (2):
> split definitions for exec.c and translate-all.c radix trees
> exec: make address spaces 64-bit wide
Can you please share info on testing you did?
> exec.c | 28 ++++++++++++++++------------
> translate-all.c | 32 ++++++++++++++++++--------------
> translate-all.h | 7 -------
> 3 files changed, 34 insertions(+), 33 deletions(-)
>
> --
> 1.8.4.2
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 16:21 ` [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes Michael S. Tsirkin
@ 2013-11-07 16:29 ` Paolo Bonzini
2013-11-07 16:47 ` Michael S. Tsirkin
0 siblings, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-07 16:29 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: marcel.a, qemu-devel, lcapitulino
On 07/11/2013 17:21, Michael S. Tsirkin wrote:
>> > This fixes the problems with the misalignment of the master abort region.
>> > See patch 2 for details, patch 1 is just a preparatory search-and-replace
>> > patch.
>> >
>> > Paolo Bonzini (2):
>> > split definitions for exec.c and translate-all.c radix trees
>> > exec: make address spaces 64-bit wide
> Can you please share info on testing you did?
>
"make check", booting a RHEL guest with both KVM and TCG, Luiz's gdb
crash. I also ran vmexit.flat from kvm-unit-tests and checked that
there was no measurable slowdown.
Paolo
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 16:29 ` Paolo Bonzini
@ 2013-11-07 16:47 ` Michael S. Tsirkin
2013-11-07 17:29 ` Paolo Bonzini
0 siblings, 1 reply; 13+ messages in thread
From: Michael S. Tsirkin @ 2013-11-07 16:47 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: marcel.a, qemu-devel, lcapitulino
On Thu, Nov 07, 2013 at 05:29:15PM +0100, Paolo Bonzini wrote:
> On 07/11/2013 17:21, Michael S. Tsirkin wrote:
> >> > This fixes the problems with the misalignment of the master abort region.
> >> > See patch 2 for details, patch 1 is just a preparatory search-and-replace
> >> > patch.
> >> >
> >> > Paolo Bonzini (2):
> >> > split definitions for exec.c and translate-all.c radix trees
> >> > exec: make address spaces 64-bit wide
> > Can you please share info on testing you did?
> >
>
> "make check", booting a RHEL guest with both KVM and TCG, Luiz's gdb
> crash. I also ran vmexit.flat from kvm-unit-tests and checked that
> there was no measurable slowdown.
>
> Paolo
That's on kvm with 52 bit address.
But where I would be concerned is systems with e.g. 36 bit address
space where we are doubling the cost of the lookup.
E.g. try i386 and not x86_64.
--
MST
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 16:47 ` Michael S. Tsirkin
@ 2013-11-07 17:29 ` Paolo Bonzini
2013-11-07 18:54 ` Michael S. Tsirkin
2013-11-11 16:43 ` Michael S. Tsirkin
0 siblings, 2 replies; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-07 17:29 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: marcel.a, qemu-devel, lcapitulino
On 07/11/2013 17:47, Michael S. Tsirkin wrote:
> That's on kvm with 52 bit address.
> But where I would be concerned is systems with e.g. 36 bit address
> space where we are doubling the cost of the lookup.
> E.g. try i386 and not x86_64.
Tried now...
  P_L2_LEVELS        pre-patch   post-patch
  i386                    3            6
  x86_64                  4            6
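These counts follow directly from the P_L2_LEVELS macro added in patch 1;
as a quick sanity check (assuming TARGET_PAGE_BITS = 12, P_L2_BITS = 10, and
the address-space widths mentioned in this thread -- 36 bits for i386, 52
for x86_64):

    P_L2_LEVELS = ((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1
    i386,   pre-patch:  ((36 - 12 - 1) / 10) + 1 = 3
    x86_64, pre-patch:  ((52 - 12 - 1) / 10) + 1 = 4
    either, post-patch: ((64 - 12 - 1) / 10) + 1 = 6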
I timed the inl_from_qemu test of vmexit.flat with both KVM and TCG. With
TCG there's indeed a visible penalty of 20 cycles for i386 and 10 for x86_64
(you can extrapolate to 30 cycles for TARGET_PHYS_ADDR_SPACE_BITS=32 targets).
These can be more or less entirely ascribed to phys_page_find:
                                     TCG              |          KVM
                             pre-patch  post-patch    |  pre-patch  post-patch
  phys_page_find(i386)           13%        25%       |    0.6%        1%
  inl_from_qemu cycles(i386)     153        173       |   ~12000     ~12000
  phys_page_find(x86_64)         18%        25%       |    0.8%        1%
  inl_from_qemu cycles(x86_64)   163        173       |   ~12000     ~12000
Thus this patch costs 0.4% in the worst case for KVM, 12% in the worst case
for TCG. The cycle breakdown is:
60 phys_page_find
28 access_with_adjusted_size
24 address_space_translate_internal
20 address_space_rw
13 io_mem_read
11 address_space_translate
9 memory_region_read_accessor
6 memory_region_access_valid
4 helper_inl
4 memory_access_size
3 cpu_inl
(This run reported 177 cycles per access; the total is 182 due to rounding).
It is probably possible to shave at least 10 cycles from the functions below,
or to make the depth of the tree dynamic so that you would save even more
compared to 1.6.0.
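To make "dynamic depth" concrete, one possible shape of the idea -- purely a
hypothetical sketch, not code from this series -- is to replace the is_leaf
flag with a skip count, so that runs of levels with a single populated child
collapse into one entry and the walk only descends through levels that
actually discriminate between sections:

    /* Hypothetical sketch; names follow exec.c, but 'skip' and the wider
     * bitfields do not exist in this series.                              */
    struct PhysPageEntry {
        uint32_t skip : 6;   /* levels collapsed into this entry, 0 = leaf */
        uint32_t ptr  : 26;  /* node index, or a section index for a leaf  */
    };

    static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
                                               Node *nodes,
                                               MemoryRegionSection *sections)
    {
        PhysPageEntry *p;
        int i;

        for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
            if (lp.ptr == PHYS_MAP_NODE_NIL) {
                return &sections[PHYS_SECTION_UNASSIGNED];
            }
            p = nodes[lp.ptr];
            lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
        }
        return &sections[lp.ptr];
    }

A fully populated 64-bit space would still pay for six levels, but a sparse
guest (the common case) would collapse back to roughly the pre-patch depth.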
Also, compiling with "-fstack-protector" instead of "-fstack-protector-all",
as suggested a while ago by rth, is already giving a savings of 20 cycles.
And of course, if this were a realistic test, KVM's 60x penalty would
be a severe problem---but it isn't, because this is not a realistic setting.
Paolo
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 17:29 ` Paolo Bonzini
@ 2013-11-07 18:54 ` Michael S. Tsirkin
2013-11-07 19:12 ` Paolo Bonzini
2013-11-11 16:43 ` Michael S. Tsirkin
1 sibling, 1 reply; 13+ messages in thread
From: Michael S. Tsirkin @ 2013-11-07 18:54 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: marcel.a, qemu-devel, lcapitulino
On Thu, Nov 07, 2013 at 06:29:40PM +0100, Paolo Bonzini wrote:
> On 07/11/2013 17:47, Michael S. Tsirkin wrote:
> > That's on kvm with 52 bit address.
> > But where I would be concerned is systems with e.g. 36 bit address
> > space where we are doubling the cost of the lookup.
> > E.g. try i386 and not x86_64.
>
> Tried now...
>
> P_L2_LEVELS pre-patch post-patch
> i386 3 6
> x86_64 4 6
>
> I timed the inl_from_qemu test of vmexit.flat with both KVM and TCG. With
> TCG there's indeed a visible penalty of 20 cycles for i386 and 10 for x86_64
> (you can extrapolate to 30 cycles for TARGET_PHYS_ADDR_SPACE_BITS=32 targets).
> These can be more or less entirely ascribed to phys_page_find:
>
> TCG | KVM
> pre-patch post-patch | pre-patch post-patch
> phys_page_find(i386) 13% 25% | 0.6% 1%
> inl_from_qemu cycles(i386) 153 173 | ~12000 ~12000
I'm a bit confused by the numbers above. The % of phys_page_find has
grown from 13% to 25% (almost double, which is kind of expected
given we have twice the # of levels). But overhead in # of cycles only went from 153 to
173? Maybe the test is a bit wrong for tcg - how about unrolling the
loop in kvm unit test?
diff --git a/x86/vmexit.c b/x86/vmexit.c
index 957d0cc..405d545 100644
--- a/x86/vmexit.c
+++ b/x86/vmexit.c
@@ -40,6 +40,15 @@ static unsigned int inl(unsigned short port)
{
unsigned int val;
asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
+ asm volatile("inl %w1, %0" : "=a"(val) : "Nd"(port));
return val;
}
Then you have to divide the reported result by 10.
> phys_page_find(x86_64) 18% 25% | 0.8% 1%
> inl_from_qemu cycles(x86_64) 163 173 | ~12000 ~12000
>
> Thus this patch costs 0.4% in the worst case for KVM, 12% in the worst case
> for TCG. The cycle breakdown is:
>
> 60 phys_page_find
> 28 access_with_adjusted_size
> 24 address_space_translate_internal
> 20 address_space_rw
> 13 io_mem_read
> 11 address_space_translate
> 9 memory_region_read_accessor
> 6 memory_region_access_valid
> 4 helper_inl
> 4 memory_access_size
> 3 cpu_inl
>
> (This run reported 177 cycles per access; the total is 182 due to rounding).
> It is probably possible to shave at least 10 cycles from the functions below,
> or to make the depth of the tree dynamic so that you would save even more
> compared to 1.6.0.
>
> Also, compiling with "-fstack-protector" instead of "-fstack-protector-all",
> as suggested a while ago by rth, is already giving a savings of 20 cycles.
>
Is it true that with TCG this affects more than just MMIO
as phys_page_find will also sometimes run on CPU accesses to memory?
> And of course, if this were a realistic test, KVM's 60x penalty would
> be a severe problem---but it isn't, because this is not a realistic setting.
>
> Paolo
Well, for this argument to carry the day we'd need to design
a realistic test which isn't easy :)
--
MST
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 18:54 ` Michael S. Tsirkin
@ 2013-11-07 19:12 ` Paolo Bonzini
0 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-07 19:12 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: lcapitulino, qemu-devel, marcel.a
On 07/11/2013 19:54, Michael S. Tsirkin wrote:
> On Thu, Nov 07, 2013 at 06:29:40PM +0100, Paolo Bonzini wrote:
>> On 07/11/2013 17:47, Michael S. Tsirkin wrote:
>>> That's on kvm with 52 bit address.
>>> But where I would be concerned is systems with e.g. 36 bit address
>>> space where we are doubling the cost of the lookup.
>>> E.g. try i386 and not x86_64.
>>
>> Tried now...
>>
>> P_L2_LEVELS pre-patch post-patch
>> i386 3 6
>> x86_64 4 6
>>
>> I timed the inl_from_qemu test of vmexit.flat with both KVM and TCG. With
>> TCG there's indeed a visible penalty of 20 cycles for i386 and 10 for x86_64
>> (you can extrapolate to 30 cycles for TARGET_PHYS_ADDR_SPACE_BITS=32 targets).
>> These can be more or less entirely ascribed to phys_page_find:
>>
>> TCG | KVM
>> pre-patch post-patch | pre-patch post-patch
>> phys_page_find(i386) 13% 25% | 0.6% 1%
>> inl_from_qemu cycles(i386) 153 173 | ~12000 ~12000
>
> I'm a bit confused by the numbers above. The % of phys_page_find has
> grown from 13% to 25% (almost double, which is kind of expected
> give we have twice the # of levels).
Yes.
> But overhead in # of cycles only went from 153 to
> 173?
new cycles / old cycles = 173 / 153 = 113%
% outside phys_page_find + % in phys_page_find*2 = 87% + 13%*2 = 113%
> Maybe the test is a bit wrong for tcg - how about unrolling the
> loop in kvm unit test?
Done that already. :)
>> Also, compiling with "-fstack-protector" instead of "-fstack-protector-all",
>> as suggested a while ago by rth, is already giving a savings of 20 cycles.
>
> Is it true that with TCG this affects more than just MMIO
> as phys_page_find will also sometimes run on CPU accesses to memory?
Yes. I tried benchmarking with perf the boot of a RHEL guest, which has:

                            TCG             |          KVM
                  pre-patch  post-patch     |  pre-patch  post-patch
  phys_page_find      3%        5.8%        |    0.9%       1.7%

This is actually higher than usual for KVM because there are many VGA
accesses during GRUB.
>> And of course, if this were a realistic test, KVM's 60x penalty would
>> be a severe problem---but it isn't, because this is not a realistic setting.
>
> Well, for this argument to carry the day we'd need to design
> a realistic test which isn't easy :)
Yes, I guess the number that matters is the extra 2% penalty for TCG
(the part that doesn't come from MMIO).
Paolo
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 2/2] exec: make address spaces 64-bit wide
2013-11-07 16:14 ` [Qemu-devel] [PATCH 2/2] exec: make address spaces 64-bit wide Paolo Bonzini
@ 2013-11-10 10:31 ` Michael S. Tsirkin
2013-11-11 10:15 ` Paolo Bonzini
0 siblings, 1 reply; 13+ messages in thread
From: Michael S. Tsirkin @ 2013-11-10 10:31 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: lcapitulino, qemu-devel, marcel.a
On Thu, Nov 07, 2013 at 05:14:37PM +0100, Paolo Bonzini wrote:
> As an alternative to commit 818f86b (exec: limit system memory
> size, 2013-11-04) let's just make all address spaces 64-bit wide.
> This eliminates problems with phys_page_find ignoring bits above
> TARGET_PHYS_ADDR_SPACE_BITS and address_space_translate_internal
> consequently messing up the computations.
>
> In Luiz's reported crash, at startup gdb attempts to read from address
> 0xffffffffffffffe6 to 0xffffffffffffffff inclusive. The region it gets
> is the newly introduced master abort region, which is as big as the PCI
> address space (see pci_bus_init). Due to a typo that's only 2^63-1,
> not 2^64. But we get it anyway because phys_page_find ignores the upper
> bits of the physical address. In address_space_translate_internal then
>
> diff = int128_sub(section->mr->size, int128_make64(addr));
> *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
>
> diff becomes negative, and int128_get64 booms.
>
> The size of the PCI address space region should be fixed anyway.
>
> Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
So this causes a 12% performance regression on some TCG
tests; I think we should look into a smarter
data structure to solve the issues.
> ---
> exec.c | 8 ++------
> 1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index 9e2fc4b..d5ce3da 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -89,7 +89,7 @@ struct PhysPageEntry {
> };
>
> /* Size of the L2 (and L3, etc) page tables. */
> -#define ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
> +#define ADDR_SPACE_BITS 64
>
> #define P_L2_BITS 10
> #define P_L2_SIZE (1 << P_L2_BITS)
> @@ -1750,11 +1750,7 @@ static void memory_map_init(void)
> {
> system_memory = g_malloc(sizeof(*system_memory));
>
> - assert(ADDR_SPACE_BITS <= 64);
> -
> - memory_region_init(system_memory, NULL, "system",
> - ADDR_SPACE_BITS == 64 ?
> - UINT64_MAX : (0x1ULL << ADDR_SPACE_BITS));
> + memory_region_init(system_memory, NULL, "system", UINT64_MAX);
> address_space_init(&address_space_memory, system_memory, "memory");
>
> system_io = g_malloc(sizeof(*system_io));
> --
> 1.8.4.2
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 2/2] exec: make address spaces 64-bit wide
2013-11-10 10:31 ` Michael S. Tsirkin
@ 2013-11-11 10:15 ` Paolo Bonzini
0 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-11 10:15 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: lcapitulino, qemu-devel, marcel.a
On 10/11/2013 11:31, Michael S. Tsirkin wrote:
> > Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>
> So this causes a 12% performance regression on some TCG
> tests, I think we should look into a smarter
> datastructure to solve the issues.
It causes a 12% performance regression in a single testcase where KVM
has a 150x performance regression. This says a lot about the relevance
of the testcase.
In any case, I have patches to avoid the regression. For 1.7 we can
just revert the patches, for 1.8 we can apply this patch together with
the optimizations that avoid introducing a regression.
Paolo
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-07 17:29 ` Paolo Bonzini
2013-11-07 18:54 ` Michael S. Tsirkin
@ 2013-11-11 16:43 ` Michael S. Tsirkin
2013-11-11 16:57 ` Paolo Bonzini
1 sibling, 1 reply; 13+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:43 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: marcel.a, qemu-devel, lcapitulino
On Thu, Nov 07, 2013 at 06:29:40PM +0100, Paolo Bonzini wrote:
> On 07/11/2013 17:47, Michael S. Tsirkin wrote:
> > That's on kvm with 52 bit address.
> > But where I would be concerned is systems with e.g. 36 bit address
> > space where we are doubling the cost of the lookup.
> > E.g. try i386 and not x86_64.
>
> Tried now...
>
> P_L2_LEVELS pre-patch post-patch
> i386 3 6
> x86_64 4 6
>
> I timed the inl_from_qemu test of vmexit.flat with both KVM and TCG. With
> TCG there's indeed a visible penalty of 20 cycles for i386 and 10 for x86_64
> (you can extrapolate to 30 cycles for TARGET_PHYS_ADDR_SPACE_BITS=32 targets).
So how did you measure this exactly?
> These can be more or less entirely ascribed to phys_page_find:
>
> TCG | KVM
> pre-patch post-patch | pre-patch post-patch
> phys_page_find(i386) 13% 25% | 0.6% 1%
> inl_from_qemu cycles(i386) 153 173 | ~12000 ~12000
> phys_page_find(x86_64) 18% 25% | 0.8% 1%
> inl_from_qemu cycles(x86_64) 163 173 | ~12000 ~12000
>
> Thus this patch costs 0.4% in the worst case for KVM, 12% in the worst case
> for TCG. The cycle breakdown is:
>
> 60 phys_page_find
> 28 access_with_adjusted_size
> 24 address_space_translate_internal
> 20 address_space_rw
> 13 io_mem_read
> 11 address_space_translate
> 9 memory_region_read_accessor
> 6 memory_region_access_valid
> 4 helper_inl
> 4 memory_access_size
> 3 cpu_inl
>
> (This run reported 177 cycles per access; the total is 182 due to rounding).
> It is probably possible to shave at least 10 cycles from the functions below,
> or to make the depth of the tree dynamic so that you would save even more
> compared to 1.6.0.
>
> Also, compiling with "-fstack-protector" instead of "-fstack-protector-all",
> as suggested a while ago by rth, is already giving a savings of 20 cycles.
>
> And of course, if this were a realistic test, KVM's 60x penalty would
> be a severe problem---but it isn't, because this is not a realistic setting.
>
> Paolo
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] exec: alternative fix for master abort woes
2013-11-11 16:43 ` Michael S. Tsirkin
@ 2013-11-11 16:57 ` Paolo Bonzini
0 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2013-11-11 16:57 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: marcel.a, qemu-devel, lcapitulino
On 11/11/2013 17:43, Michael S. Tsirkin wrote:
> On Thu, Nov 07, 2013 at 06:29:40PM +0100, Paolo Bonzini wrote:
>> On 07/11/2013 17:47, Michael S. Tsirkin wrote:
>>> That's on kvm with 52 bit address.
>>> But where I would be concerned is systems with e.g. 36 bit address
>>> space where we are doubling the cost of the lookup.
>>> E.g. try i386 and not x86_64.
>>
>> Tried now...
>>
>> P_L2_LEVELS pre-patch post-patch
>> i386 3 6
>> x86_64 4 6
>>
>> I timed the inl_from_qemu test of vmexit.flat with both KVM and TCG. With
>> TCG there's indeed a visible penalty of 20 cycles for i386 and 10 for x86_64
>> (you can extrapolate to 30 cycles for TARGET_PHYS_ADDR_SPACE_BITS=32 targets).
>
> So how did you measure this exactly?
I mention extrapolation because x86 is TARGET_PHYS_ADDR_SPACE_BITS=36,
not 32.
Paolo
^ permalink raw reply [flat|nested] 13+ messages in thread