* [Qemu-devel] [PATCH][RFC] Optimize ld[bwlq]_phys and st[bwlq]_phys
@ 2008-10-14 19:59 Anthony Liguori
  2008-10-20 16:36 ` Blue Swirl
  0 siblings, 1 reply; 3+ messages in thread
From: Anthony Liguori @ 2008-10-14 19:59 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori

This patch optimizes the ld and st functions that operate on physical addresses.
Right now, a number of them fall back to cpu_physical_memory_{read,write}, which
is very slow.  As long as the operations are aligned, it is safe to just look up
the page and read/write the data directly via the ld*_p/st*_p helpers.

This patch introduces a common helper, since all of these functions are roughly
the same.  I've tested x86 and sparc with Linux and Windows guests.

I'm pretty confident that this new code is functionally equivalent, but I wanted
to have someone else confirm this.
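
For reference, a minimal standalone sketch (not part of the patch; ldl_p_sketch
is a made-up name, and target byte-swapping is omitted) of what the RAM fast
path reduces to once the page has been resolved to a host pointer:

#include <stdint.h>
#include <string.h>

/* Once phys_page_find() has resolved the physical page to a host
   pointer into guest RAM, the ld*_p helpers amount to a direct
   host-side load.  memcpy keeps the access safe even if the host
   pointer happens to be unaligned, and compiles down to a single
   load on common targets. */
static inline uint32_t ldl_p_sketch(const void *ptr)
{
    uint32_t v;
    memcpy(&v, ptr, 4);
    return v;
}

By contrast, cpu_physical_memory_rw() re-resolves the page mapping and
dispatches on MMIO vs. RAM for every call, which is what makes the current
byte/word fallbacks slow.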

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/exec.c b/exec.c
index f1fcec8..71336b5 100644
--- a/exec.c
+++ b/exec.c
@@ -2911,43 +2911,12 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
     }
 }
 
-
-/* warning: addr must be aligned */
-uint32_t ldl_phys(target_phys_addr_t addr)
-{
-    int io_index;
-    uint8_t *ptr;
-    uint32_t val;
-    unsigned long pd;
-    PhysPageDesc *p;
-
-    p = phys_page_find(addr >> TARGET_PAGE_BITS);
-    if (!p) {
-        pd = IO_MEM_UNASSIGNED;
-    } else {
-        pd = p->phys_offset;
-    }
-
-    if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
-        !(pd & IO_MEM_ROMD)) {
-        /* I/O case */
-        io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
-        val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr);
-    } else {
-        /* RAM case */
-        ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
-            (addr & ~TARGET_PAGE_MASK);
-        val = ldl_p(ptr);
-    }
-    return val;
-}
-
 /* warning: addr must be aligned */
-uint64_t ldq_phys(target_phys_addr_t addr)
+static uint64_t ld_phys(target_phys_addr_t addr, int size)
 {
     int io_index;
     uint8_t *ptr;
-    uint64_t val;
+    uint64_t val = 0;
     unsigned long pd;
     PhysPageDesc *p;
 
@@ -2962,76 +2931,69 @@ uint64_t ldq_phys(target_phys_addr_t addr)
         !(pd & IO_MEM_ROMD)) {
         /* I/O case */
         io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
+        switch (size) {
+        case 1:
+            val = io_mem_read[io_index][0](io_mem_opaque[io_index], addr);
+            break;
+        case 2:
+            val = io_mem_read[io_index][1](io_mem_opaque[io_index], addr);
+            break;
+        case 4:
+            val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr);
+            break;
+        case 8:
 #ifdef TARGET_WORDS_BIGENDIAN
-        val = (uint64_t)io_mem_read[io_index][2](io_mem_opaque[io_index], addr) << 32;
-        val |= io_mem_read[io_index][2](io_mem_opaque[io_index], addr + 4);
+            val = (uint64_t)io_mem_read[io_index][2](io_mem_opaque[io_index], addr) << 32;
+            val |= io_mem_read[io_index][2](io_mem_opaque[io_index], addr + 4);
 #else
-        val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr);
-        val |= (uint64_t)io_mem_read[io_index][2](io_mem_opaque[io_index], addr + 4) << 32;
+            val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr);
+            val |= (uint64_t)io_mem_read[io_index][2](io_mem_opaque[io_index], addr + 4) << 32;
 #endif
+            break;
+        }
     } else {
         /* RAM case */
         ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
             (addr & ~TARGET_PAGE_MASK);
-        val = ldq_p(ptr);
+        switch (size) {
+        case 1: val = ldub_p(ptr); break;
+        case 2: val = lduw_p(ptr); break;
+        case 4: val = ldl_p(ptr); break;
+        case 8: val = ldq_p(ptr); break;
+        }
     }
     return val;
 }
 
-/* XXX: optimize */
-uint32_t ldub_phys(target_phys_addr_t addr)
+/* warning: addr must be aligned */
+uint32_t ldl_phys(target_phys_addr_t addr)
 {
-    uint8_t val;
-    cpu_physical_memory_read(addr, &val, 1);
-    return val;
+    return ld_phys(addr, 4);
 }
 
-/* XXX: optimize */
-uint32_t lduw_phys(target_phys_addr_t addr)
+/* warning: addr must be aligned */
+uint64_t ldq_phys(target_phys_addr_t addr)
 {
-    uint16_t val;
-    cpu_physical_memory_read(addr, (uint8_t *)&val, 2);
-    return tswap16(val);
+    return ld_phys(addr, 8);
 }
 
-/* warning: addr must be aligned. The ram page is not masked as dirty
-   and the code inside is not invalidated. It is useful if the dirty
-   bits are used to track modified PTEs */
-void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val)
+uint32_t ldub_phys(target_phys_addr_t addr)
 {
-    int io_index;
-    uint8_t *ptr;
-    unsigned long pd;
-    PhysPageDesc *p;
-
-    p = phys_page_find(addr >> TARGET_PAGE_BITS);
-    if (!p) {
-        pd = IO_MEM_UNASSIGNED;
-    } else {
-        pd = p->phys_offset;
-    }
+    return ld_phys(addr, 1);
+}
 
-    if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
-        io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
-        io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
-    } else {
-        unsigned long addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
-        ptr = phys_ram_base + addr1;
-        stl_p(ptr, val);
-
-        if (unlikely(in_migration)) {
-            if (!cpu_physical_memory_is_dirty(addr1)) {
-                /* invalidate code */
-                tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
-                /* set dirty bit */
-                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
-                    (0xff & ~CODE_DIRTY_FLAG);
-            }
-        }
-    }
+uint32_t lduw_phys(target_phys_addr_t addr)
+{
+    if (unlikely(addr & 1)) {
+        uint16_t val;
+        cpu_physical_memory_read(addr, (uint8_t *)&val, 2);
+        return tswap16(val);
+    } else
+        return ld_phys(addr, 2);
 }
 
-void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val)
+/* warning: addr must be aligned */
+static void st_phys(target_phys_addr_t addr, int size, uint64_t val, int dirty)
 {
     int io_index;
     uint8_t *ptr;
@@ -3047,47 +3009,41 @@ void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val)
 
     if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
         io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
+        switch (size) {
+        case 1:
+            io_mem_write[io_index][0](io_mem_opaque[io_index], addr, val);
+            break;
+        case 2:
+            io_mem_write[io_index][1](io_mem_opaque[io_index], addr, val);
+            break;
+        case 4:
+            io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
+            break;
+        case 8:
 #ifdef TARGET_WORDS_BIGENDIAN
-        io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val >> 32);
-        io_mem_write[io_index][2](io_mem_opaque[io_index], addr + 4, val);
+            io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val >> 32);
+            io_mem_write[io_index][2](io_mem_opaque[io_index], addr + 4, val);
 #else
-        io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
-        io_mem_write[io_index][2](io_mem_opaque[io_index], addr + 4, val >> 32);
+            io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
+            io_mem_write[io_index][2](io_mem_opaque[io_index], addr + 4, val >> 32);
 #endif
-    } else {
-        ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
-            (addr & ~TARGET_PAGE_MASK);
-        stq_p(ptr, val);
-    }
-}
-
-/* warning: addr must be aligned */
-void stl_phys(target_phys_addr_t addr, uint32_t val)
-{
-    int io_index;
-    uint8_t *ptr;
-    unsigned long pd;
-    PhysPageDesc *p;
-
-    p = phys_page_find(addr >> TARGET_PAGE_BITS);
-    if (!p) {
-        pd = IO_MEM_UNASSIGNED;
-    } else {
-        pd = p->phys_offset;
-    }
-
-    if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
-        io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
-        io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
+            break;
+        }
     } else {
         unsigned long addr1;
+
         addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
         /* RAM case */
-        ptr = phys_ram_base + addr1;
-        stl_p(ptr, val);
-        if (!cpu_physical_memory_is_dirty(addr1)) {
+        ptr = phys_ram_base + addr1;
+        switch (size) {
+        case 1: stb_p(ptr, val); break;
+        case 2: stw_p(ptr, val); break;
+        case 4: stl_p(ptr, val); break;
+        case 8: stq_p(ptr, val); break;
+        }
+        if (dirty && !cpu_physical_memory_is_dirty(addr1)) {
             /* invalidate code */
-            tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
+            tb_invalidate_phys_page_range(addr1, addr1 + size, 0);
             /* set dirty bit */
             phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
                 (0xff & ~CODE_DIRTY_FLAG);
@@ -3095,25 +3051,44 @@ void stl_phys(target_phys_addr_t addr, uint32_t val)
     }
 }
 
-/* XXX: optimize */
+/* warning: addr must be aligned. The ram page is not marked as dirty
+   and the code inside is not invalidated. It is useful if the dirty
+   bits are used to track modified PTEs */
+void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val)
+{
+    st_phys(addr, 4, val, 0);
+}
+
+/* warning: addr must be aligned */
+void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val)
+{
+    st_phys(addr, 8, val, 0);
+}
+
+/* warning: addr must be aligned */
+void stl_phys(target_phys_addr_t addr, uint32_t val)
+{
+    st_phys(addr, 4, val, 1);
+}
+
 void stb_phys(target_phys_addr_t addr, uint32_t val)
 {
-    uint8_t v = val;
-    cpu_physical_memory_write(addr, &v, 1);
+    st_phys(addr, 1, val, 1);
 }
 
-/* XXX: optimize */
 void stw_phys(target_phys_addr_t addr, uint32_t val)
 {
-    uint16_t v = tswap16(val);
-    cpu_physical_memory_write(addr, (const uint8_t *)&v, 2);
+    if (unlikely(addr & 1)) {
+        uint16_t v = tswap16(val);
+        cpu_physical_memory_write(addr, (const uint8_t *)&v, 2);
+    } else
+        st_phys(addr, 2, val, 1);
 }
 
-/* XXX: optimize */
+/* warning: addr must be aligned */
 void stq_phys(target_phys_addr_t addr, uint64_t val)
 {
-    val = tswap64(val);
-    cpu_physical_memory_write(addr, (const uint8_t *)&val, 8);
+    st_phys(addr, 8, val, 1);
 }
 
 #endif


* Re: [Qemu-devel] [PATCH][RFC] Optimize ld[bwlq]_phys and st[bwlq]_phys
  2008-10-14 19:59 [Qemu-devel] [PATCH][RFC] Optimize ld[bwlq]_phys and st[bwlq]_phys Anthony Liguori
@ 2008-10-20 16:36 ` Blue Swirl
  2008-10-21 13:37   ` Anthony Liguori
  0 siblings, 1 reply; 3+ messages in thread
From: Blue Swirl @ 2008-10-20 16:36 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori

On 10/14/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch optimizes the ld and st functions that operate on physical addresses.
>  Right now, a number of them fall back to cpu_physical_memory_{read,write}, which
>  is very slow.  As long as the operations are aligned, it is safe to just look up
>  the page and read/write the data directly via the ld*_p/st*_p helpers.
>
>  This patch introduces a common helper, since all of these functions are roughly
>  the same.  I've tested x86 and sparc with Linux and Windows guests.
>
>  I'm pretty confident that this new code is functionally equivalent, but I wanted
>  to have someone else confirm this.

Why are there special cases for lduw and stw?

I'd add 'inline' to the common functions, otherwise looks OK.


* Re: [Qemu-devel] [PATCH][RFC] Optimize ld[bwlq]_phys and st[bwlq]_phys
  2008-10-20 16:36 ` Blue Swirl
@ 2008-10-21 13:37   ` Anthony Liguori
  0 siblings, 0 replies; 3+ messages in thread
From: Anthony Liguori @ 2008-10-21 13:37 UTC (permalink / raw)
  To: qemu-devel

Blue Swirl wrote:
> On 10/14/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>   
>> This patch optimizes the ld and st functions that operate on physical addresses.
>>  Right now, a number of them fall back to cpu_physical_memory_{read,write}, which
>>  is very slow.  As long as the operations are aligned, it is safe to just look up
>>  the page and read/write the data directly via the ld*_p/st*_p helpers.
>>
>>  This patch introduces a common helper, since all of these functions are roughly
>>  the same.  I've tested x86 and sparc with Linux and Windows guests.
>>
>>  I'm pretty confident that this new code is functionally equivalent, but I wanted
>>  to have someone else confirm this.
>>     
>
> Why there are special cases for lduw and stw?
>   

I figured the 'u' in lduw stood for unaligned.  The "optimization" only 
works if the physical address is aligned, because it doesn't handle the 
case where a word would cross a physical page boundary.  This is why it 
falls back to cpu_physical_memory_rw, which can handle that case.

ldub is the same, but since it's a single byte, it can't cross a page 
boundary so I don't think the special casing is necessary (or really 
possible :-)).

I wasn't sure about stw.  The current code doesn't assume alignment, and 
the name doesn't imply that alignment is required, so I added a fallback 
just to be on the safe side in case any caller depends on unaligned 
stores today.
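
To make the boundary case concrete, here is a small standalone sketch
(illustrative only; crosses_page is a made-up helper, and 4 KB target pages
are assumed) of why an aligned 2-byte access can always take the fast path
while an unaligned one cannot:

#include <stdio.h>
#include <stdint.h>

#define TARGET_PAGE_BITS 12
#define TARGET_PAGE_SIZE (1 << TARGET_PAGE_BITS)

/* An access straddles a page boundary when its offset within the page
   plus its size runs past the end of the page. */
static int crosses_page(uint64_t addr, int size)
{
    return ((addr & (TARGET_PAGE_SIZE - 1)) + size) > TARGET_PAGE_SIZE;
}

int main(void)
{
    /* Aligned 2-byte access: the offset is even, so offset + 2 can
       never exceed TARGET_PAGE_SIZE.  Safe for the fast path. */
    printf("%d\n", crosses_page(0x1ffe, 2)); /* prints 0 */

    /* Unaligned 2-byte access at the last byte of a page spans two
       physical pages that may map to unrelated host memory or MMIO,
       so only cpu_physical_memory_rw can handle it. */
    printf("%d\n", crosses_page(0x1fff, 2)); /* prints 1 */
    return 0;
}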

FWIW, I'm still not sure this optimization really makes anything faster.

> I'd add 'inline' to the common functions, otherwise looks OK.
>   

In general, I tend to avoid inline unless it's required for correctness 
(like in a header file in combination with static).  The compiler can 
usually make better decisions about inlining than a person can.  I'm not 
at all interested in starting a long thread about the merits of 'inline', 
but suffice it to say that this is not an uncommon view.

Regards,

Anthony Liguori

