* [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-27 12:56 UTC
To: qemu-devel

Hi,

I changed Slirp output to use vectored IO to avoid the slowdown from
memcpy (see the attached work-in-progress patch; it gives a small
performance improvement). But then I got the idea that using AIO would
be nice at the outgoing end of the network IO processing. In fact, a
vectored AIO model could even be used for generic DMA! The benefit is
that no buffering or copying should be needed. Instead of

void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                            int len, int is_write);

and its device variant, we'd have something like

int qemu_lio_listio(int mode, struct GenericAIOcb *list[],
                    unsigned int nent, IOCompletionFunc *cb);

Each stage would translate the IO list and callback as needed, and only
the final stage would perform the IO or memcpy. This would be used at
each stage of the chain memory <-> IOMMU <-> device <-> Slirp <-> host
network device. Of course, some kind of host support for vectored AIO
for these devices is required. On the target side, devices that can do
scatter/gather DMA would benefit most.

For the specific Sparc32 case, Lance bus byte swapping unfortunately
makes buffering necessary at that stage, unless we can make N vectors
of a single byte each faster than memcpy + bswap of a memory block of
size N.

Comments?

[-- Attachment: slirp_iov.diff --]

Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2007-10-27 07:06:30.000000000 +0000
+++ qemu/vl.c	2007-10-27 11:06:09.000000000 +0000
@@ -1540,8 +1540,46 @@
 }
 
 /***********************************************************/
-/* character device */
+/* Helpers for vectored IO */
 
+static void qemu_readv_with_read(void *opaque, IOReadHandler *fd_read,
+                                 const struct qemu_iovec *vector, int count)
+{
+#if 1
+    int i, currlen = 0;
+    char buf[8192];
+
+    if (fd_read) {
+        for (i = 0; i < count; i++) {
+            if (currlen + vector[i].iov_len < sizeof(buf))
+                memcpy(&buf[currlen], vector[i].iov_base,
+                       vector[i].iov_len);
+            else
+                fprintf(stderr, "bad currlen %d iov.len %ld\n", currlen,
+                        vector[i].iov_len);
+            currlen += vector[i].iov_len;
+        }
+        fd_read(opaque, buf, currlen);
+    }
+#else
+    int i;
+
+    if (fd_read)
+        for (i = 0; i < count; i++)
+            fd_read(opaque, vector[i].iov_base, vector[i].iov_len);
+#endif
+}
+
+static void qemu_read_with_readv(void *opaque, IOReadvHandler *fd_readv,
+                                 const uint8_t *buf, int size)
+{
+    struct qemu_iovec iov;
+
+    iov.iov_base = buf;
+    iov.iov_len = size;
+    if (fd_readv)
+        fd_readv(opaque, &iov, 1);
+}
+
+/***********************************************************/
+/* character device */
 static void qemu_chr_event(CharDriverState *s, int event)
 {
     if (!s->chr_event)
@@ -3573,6 +3611,18 @@
     return vc;
 }
 
+VLANClientState *qemu_new_vlan_client_iov(VLANState *vlan,
+                                          IOReadvHandler *fd_readv,
+                                          IOCanRWHandler *fd_can_read,
+                                          void *opaque)
+{
+    VLANClientState *vc;
+
+    vc = qemu_new_vlan_client(vlan, NULL, fd_can_read, opaque);
+    vc->fd_readv = fd_readv;
+    return vc;
+}
+
 int qemu_can_send_packet(VLANClientState *vc1)
 {
     VLANState *vlan = vc1->vlan;
@@ -3598,7 +3648,26 @@
 #endif
     for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
         if (vc != vc1) {
-            vc->fd_read(vc->opaque, buf, size);
+            if (vc->fd_read)
+                vc->fd_read(vc->opaque, buf, size);
+            else if (vc->fd_readv)
+                qemu_read_with_readv(vc->opaque, vc->fd_readv, buf, size);
+        }
+    }
+}
+
+void qemu_send_packet_iov(VLANClientState *vc1,
+                          const struct qemu_iovec *vector, int count)
+{
+    VLANState *vlan = vc1->vlan;
+    VLANClientState *vc;
+
+    for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
+        if (vc != vc1) {
+            if (vc->fd_readv)
+                vc->fd_readv(vc->opaque, vector, count);
+            else if (vc->fd_read)
+                qemu_readv_with_read(vc->opaque, vc->fd_read, vector, count);
         }
     }
 }
@@ -3626,6 +3695,13 @@
     qemu_send_packet(slirp_vc, pkt, pkt_len);
 }
 
+void slirp_output_iov(const struct qemu_iovec *vector, int count)
+{
+    if (!slirp_vc)
+        return;
+    qemu_send_packet_iov(slirp_vc, vector, count);
+}
+
 static void slirp_receive(void *opaque, const uint8_t *buf, int size)
 {
 #if 0
@@ -4944,13 +5020,12 @@
 
 static IOHandlerRecord *first_io_handler;
 
-/* XXX: fd_read_poll should be suppressed, but an API change is
-   necessary in the character devices to suppress fd_can_read(). */
-int qemu_set_fd_handler2(int fd,
-                         IOCanRWHandler *fd_read_poll,
-                         IOHandler *fd_read,
-                         IOHandler *fd_write,
-                         void *opaque)
+static IOHandlerRecord *
+qemu_set_fd_handler3(int fd,
+                     IOCanRWHandler *fd_read_poll,
+                     IOHandler *fd_read,
+                     IOHandler *fd_write,
+                     void *opaque)
 {
     IOHandlerRecord **pioh, *ioh;
 
@@ -4973,17 +5048,38 @@
         }
         ioh = qemu_mallocz(sizeof(IOHandlerRecord));
         if (!ioh)
-            return -1;
+            return NULL;
         ioh->next = first_io_handler;
         first_io_handler = ioh;
     found:
         ioh->fd = fd;
         ioh->fd_read_poll = fd_read_poll;
         ioh->fd_read = fd_read;
+#if 0
+        if (!fd_read)
+            ioh->fd_readv = NULL;
+#endif
         ioh->fd_write = fd_write;
         ioh->opaque = opaque;
         ioh->deleted = 0;
     }
+    return ioh;
+}
+
+/* XXX: fd_read_poll should be suppressed, but an API change is
+   necessary in the character devices to suppress fd_can_read(). */
+int qemu_set_fd_handler2(int fd,
+                         IOCanRWHandler *fd_read_poll,
+                         IOHandler *fd_read,
+                         IOHandler *fd_write,
+                         void *opaque)
+{
+    IOHandlerRecord *ioh;
+
+    ioh = qemu_set_fd_handler3(fd, NULL, fd_read, fd_write, opaque);
+    if (!ioh)
+        return -1;
     return 0;
 }
 
@@ -4995,6 +5091,25 @@
     return qemu_set_fd_handler2(fd, NULL, fd_read, fd_write, opaque);
 }
 
+#if 0
+int qemu_set_fd_handler_iov(int fd,
+                            IOHandler *fd_readv,
+                            IOHandler *fd_writev,
+                            void *opaque)
+{
+    IOHandlerRecord *ioh;
+
+    ioh = qemu_set_fd_handler3(fd, NULL, NULL, NULL, opaque);
+    if (!ioh)
+        return -1;
+
+    ioh->fd_readv = fd_readv;
+    ioh->fd_writev = fd_writev;
+
+    return 0;
+}
+#endif
+
 /***********************************************************/
 /* Polling handling */
 
Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-27 07:11:15.000000000 +0000
+++ qemu/vl.h	2007-10-27 08:03:00.000000000 +0000
@@ -263,6 +263,12 @@
 
 /* async I/O support */
 
 typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
+struct qemu_iovec {
+    void *iov_base;
+    size_t iov_len;
+};
+typedef void IOReadvHandler(void *opaque, const struct qemu_iovec *vector,
+                            int count);
 typedef int IOCanRWHandler(void *opaque);
 typedef void IOHandler(void *opaque);
 
@@ -275,6 +281,10 @@
                          IOHandler *fd_read,
                          IOHandler *fd_write,
                          void *opaque);
+int qemu_set_fd_handler_iov(int fd,
+                            IOHandler *fd_readv,
+                            IOHandler *fd_writev,
+                            void *opaque);
 
 /* Polling handling */
 
@@ -396,6 +406,7 @@
 struct VLANClientState {
     IOReadHandler *fd_read;
+    IOReadvHandler *fd_readv;
     /* Packets may still be sent if this returns zero. It's used to
        rate-limit the slirp code. */
     IOCanRWHandler *fd_can_read;
@@ -417,8 +428,14 @@
                                       IOReadHandler *fd_read,
                                       IOCanRWHandler *fd_can_read,
                                       void *opaque);
+VLANClientState *qemu_new_vlan_client_iov(VLANState *vlan,
+                                          IOReadvHandler *fd_readv,
+                                          IOCanRWHandler *fd_can_read,
+                                          void *opaque);
 int qemu_can_send_packet(VLANClientState *vc);
 void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
+void qemu_send_packet_iov(VLANClientState *vc,
+                          const struct qemu_iovec *vector, int count);
 void qemu_handler_true(void *opaque);
 
 void do_info_network(void);
 
Index: qemu/slirp/libslirp.h
===================================================================
--- qemu.orig/slirp/libslirp.h	2007-10-27 07:06:13.000000000 +0000
+++ qemu/slirp/libslirp.h	2007-10-27 08:18:10.000000000 +0000
@@ -17,6 +17,8 @@
 /* you must provide the following functions: */
 int slirp_can_output(void);
 void slirp_output(const uint8_t *pkt, int pkt_len);
+struct qemu_iovec;
+void slirp_output_iov(const struct qemu_iovec *vector, int count);
 
 int slirp_redir(int is_udp, int host_port,
                 struct in_addr guest_addr, int guest_port);
 
Index: qemu/slirp/slirp.c
===================================================================
--- qemu.orig/slirp/slirp.c	2007-10-27 07:06:19.000000000 +0000
+++ qemu/slirp/slirp.c	2007-10-27 10:44:18.000000000 +0000
@@ -636,19 +636,22 @@
 /* output the IP packet to the ethernet device */
 void if_encap(const uint8_t *ip_data, int ip_data_len)
 {
-    uint8_t buf[1600];
-    struct ethhdr *eh = (struct ethhdr *)buf;
-
-    if (ip_data_len + ETH_HLEN > sizeof(buf))
-        return;
+    struct ethhdr buf, *eh = &buf;
+    struct {
+        void *data;
+        size_t len;
+    } iov[2];
 
     memcpy(eh->h_dest, client_ethaddr, ETH_ALEN);
     memcpy(eh->h_source, special_ethaddr, ETH_ALEN - 1);
     /* XXX: not correct */
     eh->h_source[5] = CTL_ALIAS;
     eh->h_proto = htons(ETH_P_IP);
-    memcpy(buf + sizeof(struct ethhdr), ip_data, ip_data_len);
-    slirp_output(buf, ip_data_len + ETH_HLEN);
+    iov[0].data = &buf;
+    iov[0].len = sizeof(struct ethhdr);
+    iov[1].data = ip_data;
+    iov[1].len = ip_data_len;
+    slirp_output_iov(iov, 2);
 }
 
 int slirp_redir(int is_udp, int host_port,
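For reference, the proposed qemu_lio_listio() above is shaped after the
POSIX lio_listio(3) host interface. A minimal, self-contained host-side
example of that interface follows; it uses only standard POSIX AIO, the
file name and block sizes are arbitrary, and on Linux it links with -lrt:

/* Submit two vectored async writes with lio_listio() and wait for both. */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    char a[512] = "first block", b[512] = "second block";
    struct aiocb cb[2];
    struct aiocb *list[2] = { &cb[0], &cb[1] };
    int fd = open("scratch.bin", O_CREAT | O_WRONLY | O_TRUNC, 0644);

    if (fd < 0)
        return 1;
    memset(cb, 0, sizeof(cb));
    cb[0].aio_fildes = fd;
    cb[0].aio_buf = a;
    cb[0].aio_nbytes = sizeof(a);
    cb[0].aio_offset = 0;
    cb[0].aio_lio_opcode = LIO_WRITE;
    cb[1] = cb[0];
    cb[1].aio_buf = b;
    cb[1].aio_offset = sizeof(a);

    /* LIO_WAIT blocks until both requests complete; LIO_NOWAIT plus a
       sigevent would give the completion-callback style discussed above. */
    if (lio_listio(LIO_WAIT, list, 2, NULL) < 0) {
        perror("lio_listio");
        return 1;
    }
    printf("wrote %zd and %zd bytes\n",
           aio_return(&cb[0]), aio_return(&cb[1]));
    close(fd);
    return 0;
}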
* [Qemu-devel] Re: Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-27 16:53 UTC
To: qemu-devel

On 10/27/07, Blue Swirl <blauwirbel@gmail.com> wrote:
> I changed Slirp output to use vectored IO to avoid the slowdown from
> memcpy (see the attached work-in-progress patch; it gives a small
> performance improvement). But then I got the idea that using AIO would
> be nice at the outgoing end of the network IO processing. In fact, a
> vectored AIO model could even be used for generic DMA! The benefit is
> that no buffering or copying should be needed.

I made a sketch of the API, please have a look at the patch.

> Each stage would translate the IO list and callback as needed, and only
> the final stage would perform the IO or memcpy. This would be used at
> each stage of the chain memory <-> IOMMU <-> device <-> Slirp <-> host
> network device. Of course, some kind of host support for vectored AIO
> for these devices is required. On the target side, devices that can do
> scatter/gather DMA would benefit most.

Inside Qemu the vectors would use target physical addresses (struct
qemu_iovec), but at some point the addresses would change to host
pointers suitable for real AIO.

[-- Attachment: gdma_aiov.diff --]

Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-27 14:58:09.000000000 +0000
+++ qemu/vl.h	2007-10-27 16:51:40.000000000 +0000
@@ -746,6 +746,85 @@
 
 #include "hw/irq.h"
 
+/* Generic DMA API */
+
+typedef void DMADriverCompletionFunc(void *opaque, int ret);
+
+struct qemu_iovec {
+    target_phys_addr_t iov_base;
+    size_t iov_len;
+};
+
+typedef struct qemu_bus qemu_bus;
+
+typedef struct DMADriverAIOCB {
+    void *opaque;
+    int type;
+    int nent;
+    struct aiocb **aiocb;
+    DMADriverCompletionFunc *cb;
+    struct DMADriverAIOCB *next;
+} DMADriverAIOCB;
+
+typedef void DMARWHandler(void *opaque,
+                          const struct qemu_iovec *dst_vector,
+                          int dst_count,
+                          const struct qemu_iovec *src_vector,
+                          int src_count);
+
+qemu_bus *bus_init(unsigned int bus_bits, DMARWHandler north_handler,
+                   void *north_handler_opaque, DMARWHandler south_handler,
+                   void *south_handler_opaque);
+
+/* Direction CPU->bridge->device/memory */
+void bus_rw_south(qemu_bus *bus,
+                  const struct qemu_iovec *dst_vector,
+                  int dst_count,
+                  const struct qemu_iovec *src_vector,
+                  int src_count,
+                  int is_write);
+
+static inline void bus_read_south(qemu_bus *bus,
+                                  const struct qemu_iovec *dst_vector,
+                                  int dst_count,
+                                  const struct qemu_iovec *src_vector,
+                                  int src_count)
+{
+    bus_rw_south(bus, dst_vector, dst_count, src_vector, src_count, 0);
+}
+
+static inline void bus_write_south(qemu_bus *bus,
+                                   const struct qemu_iovec *dst_vector,
+                                   int dst_count,
+                                   const struct qemu_iovec *src_vector,
+                                   int src_count)
+{
+    bus_rw_south(bus, dst_vector, dst_count, src_vector, src_count, 1);
+}
+
+/* From device towards CPU/memory (DMA) */
+void bus_rw_north(qemu_bus *bus,
+                  const struct qemu_iovec *dst_vector,
+                  int dst_count,
+                  const struct qemu_iovec *src_vector,
+                  int src_count,
+                  int is_write);
+
+static inline void bus_read_north(qemu_bus *bus,
+                                  const struct qemu_iovec *dst_vector,
+                                  int dst_count,
+                                  const struct qemu_iovec *src_vector,
+                                  int src_count)
+{
+    bus_rw_north(bus, dst_vector, dst_count, src_vector, src_count, 0);
+}
+
+static inline void bus_write_north(qemu_bus *bus,
+                                   const struct qemu_iovec *dst_vector,
+                                   int dst_count,
+                                   const struct qemu_iovec *src_vector,
+                                   int src_count)
+{
+    bus_rw_north(bus, dst_vector, dst_count, src_vector, src_count, 1);
+}
+
 /* ISA bus */
 extern target_phys_addr_t isa_mem_base;
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Paul Brook @ 2007-10-28 1:29 UTC
To: qemu-devel; Cc: Blue Swirl

> I changed Slirp output to use vectored IO to avoid the slowdown from
> memcpy (see the attached work-in-progress patch; it gives a small
> performance improvement). But then I got the idea that using AIO would
> be nice at the outgoing end of the network IO processing. In fact, a
> vectored AIO model could even be used for generic DMA! The benefit is
> that no buffering or copying should be needed.

An interesting idea; however, I don't want to underestimate the
difficulty of implementing this correctly. I suspect that to get real
benefits you need to support zero-copy async operation all the way
through. Things get really hairy if you allow some operations to
complete synchronously and some to be deferred.

I've done async operation for SCSI and USB. The latter is really not
pretty, and the former has some notable warts. A generic IO/DMA
framework needs to make sure it covers these requirements without
making things worse. Hopefully it'll also help fix the things that are
wrong with them.

> For the specific Sparc32 case, Lance bus byte swapping unfortunately
> makes buffering necessary at that stage, unless we can make N vectors
> of a single byte each faster than memcpy + bswap of a memory block of
> size N.

We really want to be dealing with largeish blocks. A {ptr,size} vector
element is 64 or 128 bits, so the overhead on blocks < 64 bytes is
going to be really brutal. Also, the time taken to do address
translation will be O(number of vectors).

> Inside Qemu the vectors would use target physical addresses (struct
> qemu_iovec), but at some point the addresses would change to host
> pointers suitable for real AIO.

Phrases like "at some point" worry me :-)

I think it would be good to get a top-down description of what each
different entity (initiating device, host endpoint, bus translation,
memory) is responsible for, and how they all fit together.

I have some ideas, but without more detailed investigation I can't tell
whether they will actually work in practice, or whether they fit into
the code fragments you've posted. My suspicion is that they don't, as I
can't make head or tail of how your gdma_aiov.diff patch would be used
in practice.

Paul
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-28 9:09 UTC
To: Paul Brook; Cc: qemu-devel

On 10/28/07, Paul Brook <paul@codesourcery.com> wrote:
> > I changed Slirp output to use vectored IO to avoid the slowdown from
> > memcpy (see the attached work-in-progress patch; it gives a small
> > performance improvement). But then I got the idea that using AIO
> > would be nice at the outgoing end of the network IO processing. In
> > fact, a vectored AIO model could even be used for generic DMA! The
> > benefit is that no buffering or copying should be needed.
>
> An interesting idea; however, I don't want to underestimate the
> difficulty of implementing this correctly. I suspect that to get real
> benefits you need to support zero-copy async operation all the way
> through. Things get really hairy if you allow some operations to
> complete synchronously and some to be deferred.

Zero-copy can be the first goal, async may come later. I hope we can do
the change in stages, perhaps introducing temporary conversion helpers
as needed.

> I've done async operation for SCSI and USB. The latter is really not
> pretty, and the former has some notable warts. A generic IO/DMA
> framework needs to make sure it covers these requirements without
> making things worse. Hopefully it'll also help fix the things that are
> wrong with them.
>
> > For the specific Sparc32 case, Lance bus byte swapping unfortunately
> > makes buffering necessary at that stage, unless we can make N
> > vectors of a single byte each faster than memcpy + bswap of a memory
> > block of size N.
>
> We really want to be dealing with largeish blocks. A {ptr,size} vector
> element is 64 or 128 bits, so the overhead on blocks < 64 bytes is
> going to be really brutal. Also, the time taken to do address
> translation will be O(number of vectors).

That's what I suspected as well.

> > Inside Qemu the vectors would use target physical addresses (struct
> > qemu_iovec), but at some point the addresses would change to host
> > pointers suitable for real AIO.
>
> Phrases like "at some point" worry me :-)
>
> I think it would be good to get a top-down description of what each
> different entity (initiating device, host endpoint, bus translation,
> memory) is responsible for, and how they all fit together.
>
> I have some ideas, but without more detailed investigation I can't
> tell whether they will actually work in practice, or whether they fit
> into the code fragments you've posted. My suspicion is that they
> don't, as I can't make head or tail of how your gdma_aiov.diff patch
> would be used in practice.

Ok, I'll try a mental exercise with this chain:
SCSI -> ESP -> ESPDMA -> IOMMU -> memory write.

Scenario: a SCSI read is issued, 8k size. I'll track the (address,
size) vectors at each stage.

scsi-disk uses host memory addresses. ESP uses addresses ranging from 0
to the end of the request. ESPDMA forces the MS byte to 0xfc. The IOMMU
translates page 0xfc000000 to 0x1000 and page 0xfc001000 to 0x4000.
Memory translates 0x1000 to phys_ram_base + 0x1000, and likewise for
0x4000. From this point on, we are using host memory addresses again.
Each stage may change the callback if needed.

Currently scsi-disk provides a buffer. For true zero copy, this needs
to be changed so that the buffer is instead provided by the caller at
each stage until we reach host memory. But I'll use the scsi-disk
buffer for now.

Initially, the (address, size) vector provided by scsi-disk is:
src_vector = (&SCSIDevice->SCSIRequest->dma_buf[0], 8192). What's the
destination vector, (NULL, 0)?

scsi-disk calls bus_write_north, which transfers control to ESP.
ESP changes the vectors to
src_vector = (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst_vector = (0, 8192), and calls bus_write_north -> ESPDMA.
ESPDMA: src (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst (0xfc000000, 8192) -> IOMMU.
After IOMMU: src (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst ((0x1000, 4096), (0x4000, 4096)).
After memory: src (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst ((phys_ram_base + 0x1000, 4096), (phys_ram_base + 0x4000, 4096)).

scsi-disk or memory (which?) can now perform the memcpy. But now we
also have the information to perform the disk read without copying.
Do we need the source vectors at all?

Let's try the other direction, a SCSI write; the other parameters are
unchanged. Now the destination is the scsi-disk buffer, and the source
vector is what gets translated:
src = (NULL, 0), dst = (&SCSIDevice->SCSIRequest->dma_buf[0], 8192).

scsi-disk calls bus_read_north, which transfers control to ESP.
ESP: src = (0, 8192), dst unchanged.
ESPDMA: src = (0xfc000000, 8192), dst unchanged.
IOMMU: src ((0x1000, 4096), (0x4000, 4096)).
Memory: src ((phys_ram_base + 0x1000, 4096),
(phys_ram_base + 0x4000, 4096)).

Having made this exercise, I think we only need a translation function:
it changes the addresses and adds a callback to handle the intermediate
buffers.

Let's try this improved model in a more complex scenario:
SLIRP TCP socket (host) -> SLIRP IP -> SLIRP interface -> Lance ->
LEDMA -> IOMMU -> memory.

SLIRP IP adds the IP headers, and the SLIRP Ethernet link adds the
Ethernet headers. LEDMA must buffer the data to perform byte swapping,
and the MS byte is forced to 0xfc. For the IOMMU we reuse the disk
parameters.

We need to give a buffer to the host recvmsg(). How can we determine
the buffer size? We really know that only after the packet has been
received. Let's pick a packet size of 4096. Writing s() for sizeof():

TCP: (0, 4096)
IP adds iphdr: ((0, s(iphdr)), (s(iphdr), 4096))
Link adds the Ethernet header: ((0, s(ethhdr)),
(s(ethhdr), s(iphdr)), (s(ethhdr) + s(iphdr), 4096))

Lance searches the receive descriptors for a buffer. We need to set a
bit to indicate that the buffer is in use, so a callback is needed.
The translated vectors are: ((0x1000, s(ethhdr)),
(0x1000 + s(ethhdr), s(iphdr)),
(0x1000 + s(ethhdr) + s(iphdr), 4096)), callback lance_rmd_store.

LEDMA provides a bswap translation buffer. So, for TCP the final
vectors are:
((&DMAState->lebuffer[0], s(ethhdr)),
(&DMAState->lebuffer[s(ethhdr)], s(iphdr)),
(&DMAState->lebuffer[s(ethhdr) + s(iphdr)], 4096)),
callbacks (le_bswap_buffer, lance_rmd_store).

TCP recv writes to lebuffer with AIO, and the callback le_bswap_buffer
is issued. le_bswap_buffer performs the bswap and wants to copy the
buffer to target memory. The new vector is (0xfc000000, 4k + headers)
(can we merge the vectors at this point?), callback lance_rmd_store.
IOMMU: ((0x1000, 4096), (0x4000, headers))
Memory: ((phys_ram_base + 0x1000, 4096),
(phys_ram_base + 0x4000, headers)).

lance_rmd_store is called: it memcpys from lebuffer to the destination,
updates the descriptor (another translation), raises the IRQ, etc.

I think this should work.
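A standalone sketch of the read-direction translation walk above: the
extent type and the stage functions are invented for illustration (none
of this is QEMU API), and the page mappings, request size and host base
are the ones assumed in the example (0xfc000000 -> 0x1000, 0xfc001000
-> 0x4000, an 8k request):

/* Walk one (address, size) extent list through ESPDMA -> IOMMU -> memory. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define MAX_EXT   16

typedef struct {
    uint64_t base;
    uint64_t len;
} extent;

/* ESPDMA: force the most significant byte of the 32-bit address to 0xfc. */
static void espdma_translate(extent *v, int n)
{
    int i;

    for (i = 0; i < n; i++)
        v[i].base = 0xfc000000ULL | (v[i].base & 0x00ffffffULL);
}

/* IOMMU: the example's two-entry page table. */
static uint64_t iommu_map_page(uint64_t page)
{
    if (page == 0xfc000000ULL)
        return 0x1000ULL;
    if (page == 0xfc001000ULL)
        return 0x4000ULL;
    return (uint64_t)-1;                 /* fault: unmapped page */
}

/* IOMMU: split each extent at page boundaries, then map page by page. */
static int iommu_translate(const extent *in, int n, extent *out)
{
    int i, m = 0;

    for (i = 0; i < n; i++) {
        uint64_t addr = in[i].base, left = in[i].len;

        while (left > 0 && m < MAX_EXT) {
            uint64_t page = addr & ~(PAGE_SIZE - 1);
            uint64_t chunk = page + PAGE_SIZE - addr;

            if (chunk > left)
                chunk = left;
            out[m].base = iommu_map_page(page) + (addr - page);
            out[m].len = chunk;
            m++;
            addr += chunk;
            left -= chunk;
        }
    }
    return m;
}

int main(void)
{
    uint64_t phys_ram_base = 0x2b8e6f820000ULL;  /* arbitrary host base */
    extent src[MAX_EXT] = { { 0, 8192 } };       /* ESP view: (0, 8192) */
    extent dst[MAX_EXT];
    int i, n;

    espdma_translate(src, 1);          /* -> (0xfc000000, 8192)           */
    n = iommu_translate(src, 1, dst);  /* -> (0x1000, 4k), (0x4000, 4k)   */
    for (i = 0; i < n; i++)            /* memory stage: add the host base */
        printf("host %#llx len %llu\n",
               (unsigned long long)(phys_ram_base + dst[i].base),
               (unsigned long long)dst[i].len);
    return 0;
}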
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Jamie Lokier @ 2007-10-28 19:10 UTC
To: qemu-devel; Cc: Paul Brook

Blue Swirl wrote:
> Currently scsi-disk provides a buffer. For true zero copy, this needs
> to be changed so that the buffer is instead provided by the caller at
> each stage until we reach host memory. But I'll use the scsi-disk
> buffer for now.

This might actually work in Qemu.

But in general, a zero-copy I/O interface needs to allow for the
possibility that either the source of the data or the sink might need
to be in charge of buffer allocation for a particular sequence.
Otherwise you get situations where the data has to be copied to meet a
technical constraint of a source or a sink, and the copy could have
been avoided if the addresses had been allocated to meet that
constraint in the first place. The most common technical constraint is
probably the need for large contiguous blocks.

I deal with this in my own program by having an I/O call from source to
sink for requesting memory (through a chain of sources/sinks like your
example, if necessary), but only when the source is preparing to do an
I/O and hasn't yet prepared the data (see the sketch below). If the
data is already prepared before setting up the I/O for a write, then
there's no point asking the sink to allocate memory, and if the sink
has to anyway (e.g. because it needs a large contiguous block), that's
an unavoidable copy.

A couple of examples of sinks with constraints:

 - Can't use writev(). E.g. you're using a slightly old Linux kernel,
   want to do AIO, and it doesn't have async writev(), only async
   write().

 - Writing to a sound card through a memory-mapped ring buffer. The
   sink is the code which opens /dev/dsp, and it can provide buffers
   for zero-copy only if it picks the address where the data will be
   prepared.

 - Async I/O using "database writer"-style separate processes which
   actually do the writes synchronously, with the data passed to them
   through shared memory. Here the sink is the code which sends a
   request to one of the writer processes, and it must use a buffer
   that lives in the mapped shared memory.

-- Jamie
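A rough sketch of the "source asks the sink for memory" call described
above; all names are invented for illustration, and this is not an
existing QEMU or Slirp interface:

/* Sink-driven buffer allocation, so a copy is avoided whenever the sink
   has a placement constraint (contiguous block, mmap'd ring, ...). */
#include <stddef.h>
#include <stdlib.h>

typedef struct Sink Sink;
struct Sink {
    /* Returns a buffer satisfying the sink's constraints, possibly by
       forwarding the request down a chain of sinks; NULL means the sink
       has no preference. */
    void *(*get_buffer)(Sink *s, size_t len);
    /* Consume a filled buffer; zero-copy when it came from get_buffer(). */
    int (*submit)(Sink *s, void *buf, size_t len);
};

/* Source side: ask before preparing the data, not after. */
static int source_send(Sink *s, size_t len)
{
    void *buf = s->get_buffer ? s->get_buffer(s, len) : NULL;

    if (!buf)
        buf = malloc(len);   /* no sink preference: use our own buffer */
    /* ... prepare len bytes of data directly into buf ... */
    return s->submit(s, buf, len);
}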
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-29 19:33 UTC
To: qemu-devel; Cc: Paul Brook

On 10/28/07, Jamie Lokier <jamie@shareable.org> wrote:
> But in general, a zero-copy I/O interface needs to allow for the
> possibility that either the source of the data or the sink might need
> to be in charge of buffer allocation for a particular sequence.
> Otherwise you get situations where the data has to be copied to meet
> a technical constraint of a source or a sink, and the copy could have
> been avoided if the addresses had been allocated to meet that
> constraint in the first place. The most common technical constraint
> is probably the need for large contiguous blocks.
>
> I deal with this in my own program by having an I/O call from source
> to sink for requesting memory (through a chain of sources/sinks like
> your example, if necessary), but only when the source is preparing to
> do an I/O and hasn't yet prepared the data. If the data is already
> prepared before setting up the I/O for a write, then there's no point
> asking the sink to allocate memory, and if the sink has to anyway
> (e.g. because it needs a large contiguous block), that's an
> unavoidable copy.
>
> A couple of examples of sinks with constraints:
>
>  - Can't use writev(). E.g. you're using a slightly old Linux kernel,
>    want to do AIO, and it doesn't have async writev(), only async
>    write().
>
>  - Writing to a sound card through a memory-mapped ring buffer. The
>    sink is the code which opens /dev/dsp, and it can provide buffers
>    for zero-copy only if it picks the address where the data will be
>    prepared.
>
>  - Async I/O using "database writer"-style separate processes which
>    actually do the writes synchronously, with the data passed to them
>    through shared memory. Here the sink is the code which sends a
>    request to one of the writer processes, and it must use a buffer
>    that lives in the mapped shared memory.

I think this also shows that the system may become quite complex. Some
kind of hooks may be needed before and after the transfer.

We could cache the resolved addresses to overcome the additional setup
overhead. Each stage should install cache-invalidation callbacks, or a
method to call for recalculation of the addresses. For example, IOMMU
or ESPDMA mappings change very often.

An IO-vector-based API seems to be hard to use, so a simple list should
be better. The vectors may not be compatible with the host anyway. I'll
make a new version.

It's good to get some feedback. Designing a high-performance IO
framework suitable for all use cases seems to be very challenging.
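One possible shape for the per-stage cache-invalidation callbacks
mentioned above; the names are invented for illustration, and plain
calloc stands in for qemu_mallocz:

/* Consumers that cache resolved addresses register a callback with each
   remapping stage (IOMMU, ESPDMA, ...) they depend on. */
#include <stdlib.h>

typedef void InvalidateFunc(void *opaque);

typedef struct Invalidator {
    InvalidateFunc *func;
    void *opaque;
    struct Invalidator *next;
} Invalidator;

/* Embedded in each stage whose mappings can change. */
typedef struct {
    Invalidator *invalidators;
} MapStage;

static void map_stage_register(MapStage *s, InvalidateFunc *func,
                               void *opaque)
{
    Invalidator *i = calloc(1, sizeof(*i));

    i->func = func;
    i->opaque = opaque;
    i->next = s->invalidators;
    s->invalidators = i;
}

/* Called when the stage's mappings change (IOMMU TLB flush, DMA base
   register write): every cached, already-resolved vector is dropped and
   gets recalculated on next use. */
static void map_stage_mappings_changed(MapStage *s)
{
    Invalidator *i;

    for (i = s->invalidators; i != NULL; i = i->next)
        i->func(i->opaque);
}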
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-30 20:09 UTC
To: qemu-devel; Cc: Paul Brook

On 10/29/07, Blue Swirl <blauwirbel@gmail.com> wrote:
> We could cache the resolved addresses to overcome the additional setup
> overhead. Each stage should install cache-invalidation callbacks, or a
> method to call for recalculation of the addresses. For example, IOMMU
> or ESPDMA mappings change very often.

I meant to write "don't change very often".

This version actually resolves the host memory address, so that
scsi-disk could (with some additional plumbing) write directly to the
final destination. I think both pre- and post-processing hooks may be
needed, but those are not implemented yet.

What about error handling? For example, the first page is OK but the
second is not. Truncate all further blocks and install a
post-processing hook that raises a bus error?

Example output:

esp
DMADriverAIOCB 0x27433f0
IO ranges:
base 0000000000000000 len 0000000000000800
Prehooks:
Posthooks:
espdma
DMADriverAIOCB 0x27433f0
IO ranges:
base 00000000fe00000a len 0000000000000800
Prehooks:
Posthooks:
iommu
DMADriverAIOCB 0x27433f0
IO ranges:
base 0000000007fe100a len 0000000000000800
Prehooks:
Posthooks:
physical
DMADriverAIOCB 0x27433f0
IO ranges:
base 00002b8e6f82200a len 0000000000000800
Prehooks:
Posthooks:

[-- Attachment: gdma_aiov.diff --]

Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-29 16:59:37.000000000 +0000
+++ qemu/vl.h	2007-10-30 19:08:35.000000000 +0000
@@ -746,6 +746,109 @@
 
 #include "hw/irq.h"
 
+/* Generic DMA API */
+
+typedef void DMADriverCompletionFunc(void *opaque, int ret);
+
+typedef struct qemu_iolist {
+    target_phys_addr_t iov_base;
+    target_phys_addr_t iov_len;
+    struct qemu_iolist *next;
+} qemu_iolist;
+
+typedef struct DMADriverAIOCB DMADriverAIOCB;
+
+typedef DMADriverAIOCB *
+DMATranslationHandler(void *opaque, DMADriverAIOCB *request, int is_write);
+
+typedef struct DMACompletionEntry {
+    DMATranslationHandler *func;
+    void *opaque;
+    struct DMACompletionEntry *next;
+} DMACompletionEntry;
+
+struct DMADriverAIOCB {
+    qemu_iolist *iolist;
+    DMACompletionEntry *prehook;
+    DMACompletionEntry *posthook;
+    struct DMADriverAIOCB *next;
+};
+
+typedef struct qemu_bus {
+    unsigned int bus_bits;
+    DMATranslationHandler *north_handler;
+    void *north_handler_opaque;
+    DMATranslationHandler *south_handler;
+    void *south_handler_opaque;
+} qemu_bus;
+
+static inline qemu_bus *
+bus_init(unsigned int bus_bits,
+         DMATranslationHandler north_handler,
+         void *north_handler_opaque,
+         DMATranslationHandler south_handler,
+         void *south_handler_opaque)
+{
+    qemu_bus *bus;
+
+    bus = qemu_mallocz(sizeof(qemu_bus));
+    bus->bus_bits = bus_bits;
+    bus->north_handler = north_handler;
+    bus->north_handler_opaque = north_handler_opaque;
+    bus->south_handler = south_handler;
+    bus->south_handler_opaque = south_handler_opaque;
+    return bus;
+}
+
+/* Direction CPU->bridge->device/memory */
+static inline DMADriverAIOCB *
+bus_translate_south(qemu_bus *bus, DMADriverAIOCB *request, int is_write)
+{
+    return bus->south_handler(bus->south_handler_opaque, request, is_write);
+}
+
+/* From device towards CPU/memory (DMA) */
+static inline DMADriverAIOCB *
+bus_translate_north(qemu_bus *bus, DMADriverAIOCB *request, int is_write)
+{
+    return bus->north_handler(bus->north_handler_opaque, request, is_write);
+}
+
+static inline DMADriverAIOCB *
+bus_build_aiocb(target_phys_addr_t addr, target_phys_addr_t len)
+{
+    DMADriverAIOCB *d;
+
+    d = qemu_mallocz(sizeof(DMADriverAIOCB));
+    d->iolist = qemu_mallocz(sizeof(qemu_iolist));
+    d->iolist->iov_base = addr;
+    d->iolist->iov_len = len;
+    return d;
+}
+
+#if 1 || DEBUG_GDMA
+static inline void
+bus_dump_aiocb(DMADriverAIOCB *d)
+{
+    qemu_iolist *io;
+    DMACompletionEntry *e;
+
+    fprintf(stderr, "DMADriverAIOCB %p\nIO ranges:\n", d);
+    for (io = d->iolist; io != NULL; io = io->next) {
+        fprintf(stderr, "base " TARGET_FMT_plx " len " TARGET_FMT_plx "\n",
+                io->iov_base, io->iov_len);
+    }
+    fprintf(stderr, "Prehooks:\n");
+    for (e = d->prehook; e != NULL; e = e->next) {
+        fprintf(stderr, "func %p opaque %p\n", e->func, e->opaque);
+    }
+    fprintf(stderr, "Posthooks:\n");
+    for (e = d->posthook; e != NULL; e = e->next) {
+        fprintf(stderr, "func %p opaque %p\n", e->func, e->opaque);
+    }
+}
+#endif
+
 /* ISA bus */
 extern target_phys_addr_t isa_mem_base;
@@ -1253,7 +1356,8 @@
 extern QEMUMachine ss5_machine, ss10_machine;
 
 /* iommu.c */
-void *iommu_init(target_phys_addr_t addr);
+void *iommu_init(target_phys_addr_t addr, qemu_bus *parent_bus,
+                 qemu_bus **bus);
 void sparc_iommu_memory_rw(void *opaque, target_phys_addr_t addr,
                            uint8_t *buf, int len, int is_write);
 static inline void sparc_iommu_memory_read(void *opaque,
@@ -1308,11 +1412,13 @@
 /* esp.c */
 void esp_scsi_attach(void *opaque, BlockDriverState *bd, int id);
 void *esp_init(BlockDriverState **bd, target_phys_addr_t espaddr,
-               void *dma_opaque, qemu_irq irq, qemu_irq *reset);
+               void *dma_opaque, qemu_irq irq, qemu_irq *reset,
+               qemu_bus *parent_bus, qemu_bus **bus);
 
 /* sparc32_dma.c */
 void *sparc32_dma_init(target_phys_addr_t daddr, qemu_irq parent_irq,
-                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset);
+                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset,
+                       qemu_bus *parent_bus, qemu_bus **bus);
 void ledma_memory_read(void *opaque, target_phys_addr_t addr,
                        uint8_t *buf, int len, int do_bswap);
 void ledma_memory_write(void *opaque, target_phys_addr_t addr,
@@ -1428,6 +1534,8 @@
    scsi_{read,write}_data.  */
 void scsi_read_data(SCSIDevice *s, uint32_t tag);
 int scsi_write_data(SCSIDevice *s, uint32_t tag);
+void scsi_read_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d);
+int scsi_write_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d);
 void scsi_cancel_io(SCSIDevice *s, uint32_t tag);
 uint8_t *scsi_get_buf(SCSIDevice *s, uint32_t tag);
 
Index: qemu/hw/sun4m.c
===================================================================
--- qemu.orig/hw/sun4m.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/sun4m.c	2007-10-30 19:09:12.000000000 +0000
@@ -306,6 +306,32 @@
     env->halted = 1;
 }
 
+static DMADriverAIOCB *
+physical_memory_bus_translate_north(void *opaque, DMADriverAIOCB *request,
+                                    int is_write)
+{
+    qemu_iolist *io;
+
+    for (io = request->iolist; io != NULL; io = io->next) {
+        if (io->iov_base < phys_ram_size)
+            io->iov_base += (unsigned long)phys_ram_base;
+        else
+            io->iov_len = 0;
+    }
+    fprintf(stderr, "physical\n");
+    bus_dump_aiocb(request);
+    return request;
+}
+
+static DMADriverAIOCB *
+physical_memory_bus_translate_south(void *opaque,
+                                    DMADriverAIOCB *request,
+                                    int is_write)
+{
+    // Does not exist?
+    return request;
+}
+
 static void *sun4m_hw_init(const struct hwdef *hwdef, int RAM_size,
                            DisplayState *ds, const char *cpu_model)
 
@@ -317,6 +343,8 @@
     qemu_irq *cpu_irqs[MAX_CPUS], *slavio_irq, *slavio_cpu_irq,
         *espdma_irq, *ledma_irq;
     qemu_irq *esp_reset, *le_reset;
+    qemu_bus *memory_bus, *iommu_bus, *espdma_bus, *ledma_bus, *esp_bus,
+        *scsi_bus;
 
     /* init CPUs */
     sparc_find_by_name(cpu_model, &def);
@@ -345,7 +373,12 @@
     /* allocate RAM */
     cpu_register_physical_memory(0, RAM_size, 0);
 
-    iommu = iommu_init(hwdef->iommu_base);
+    memory_bus = bus_init(TARGET_PHYS_ADDR_BITS,
+                          physical_memory_bus_translate_north,
+                          NULL,
+                          physical_memory_bus_translate_south,
+                          NULL);
+    iommu = iommu_init(hwdef->iommu_base, memory_bus, &iommu_bus);
     slavio_intctl = slavio_intctl_init(hwdef->intctl_base,
                                        hwdef->intctl_base + 0x10000ULL,
                                        &hwdef->intbit_to_level[0],
@@ -354,11 +387,12 @@
                                        hwdef->clock_irq);
 
     espdma = sparc32_dma_init(hwdef->dma_base, slavio_irq[hwdef->esp_irq],
-                              iommu, &espdma_irq, &esp_reset);
+                              iommu, &espdma_irq, &esp_reset, iommu_bus,
+                              &espdma_bus);
 
     ledma = sparc32_dma_init(hwdef->dma_base + 16ULL,
                              slavio_irq[hwdef->le_irq], iommu, &ledma_irq,
-                             &le_reset);
+                             &le_reset, iommu_bus, &ledma_bus);
 
     if (graphic_depth != 8 && graphic_depth != 24) {
         fprintf(stderr, "qemu: Unsupported depth: %d\n", graphic_depth);
@@ -392,7 +426,7 @@
     fdctrl_init(slavio_irq[hwdef->fd_irq], 0, 1, hwdef->fd_base, fd_table);
 
     main_esp = esp_init(bs_table, hwdef->esp_base, espdma, *espdma_irq,
-                        esp_reset);
+                        esp_reset, espdma_bus, &esp_bus);
 
     for (i = 0; i < MAX_DISKS; i++) {
         if (bs_table[i]) {
 
Index: qemu/hw/iommu.c
===================================================================
--- qemu.orig/hw/iommu.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/iommu.c	2007-10-30 19:40:15.000000000 +0000
@@ -104,6 +104,7 @@
     target_phys_addr_t addr;
     uint32_t regs[IOMMU_NREGS];
     target_phys_addr_t iostart;
+    qemu_bus *bus;
 } IOMMUState;
 
 static uint32_t iommu_mem_readw(void *opaque, target_phys_addr_t addr)
@@ -244,6 +245,64 @@
     s->regs[IOMMU_AFAR] = addr;
 }
 
+static DMADriverAIOCB *
+iommu_bus_translate_north(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    IOMMUState *s = opaque;
+    qemu_iolist *io, *new_io, *next_io;
+    uint32_t flags;
+    int l, first;
+    target_phys_addr_t addr, prev_addr, len, page, phys_addr;
+
+    for (io = request->iolist; io != NULL; io = next_io) {
+        addr = io->iov_base;
+        len = io->iov_len;
+        next_io = io->next;
+        first = 1;
+        while (len > 0) {
+            page = addr & TARGET_PAGE_MASK;
+            l = (page + TARGET_PAGE_SIZE) - addr;
+            if (l > len)
+                l = len;
+            flags = iommu_page_get_flags(s, page);
+            if (!(flags & IOPTE_VALID)) {
+                return NULL;
+            }
+            if (is_write) {
+                if (!(flags & IOPTE_WRITE)) {
+                    return NULL;
+                }
+            }
+            phys_addr = iommu_translate_pa(s, addr, flags);
+            if (!first && addr != (prev_addr + TARGET_PAGE_SIZE)) {
+                new_io = qemu_mallocz(sizeof(qemu_iolist));
+                new_io->iov_base = phys_addr;
+                new_io->iov_len = len;
+                new_io->next = io->next;
+                io->next = new_io;
+            } else {
+                io->iov_base = phys_addr;
+                io->iov_len = len;
+            }
+            prev_addr = page;
+            len -= l;
+            addr += l;
+            first = 0;
+        }
+    }
+    fprintf(stderr, "iommu\n");
+    bus_dump_aiocb(request);
+    bus_translate_north(s->bus, request, is_write);
+    return request;
+}
+
+static DMADriverAIOCB *
+iommu_bus_translate_south(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void sparc_iommu_memory_rw(void *opaque, target_phys_addr_t addr,
                            uint8_t *buf, int len, int is_write)
 {
@@ -311,7 +370,7 @@
     s->regs[IOMMU_CTRL] = IOMMU_VERSION;
 }
 
-void *iommu_init(target_phys_addr_t addr)
+void *iommu_init(target_phys_addr_t addr, qemu_bus *parent_bus, qemu_bus **bus)
 {
     IOMMUState *s;
     int iommu_io_memory;
@@ -321,9 +380,12 @@
         return NULL;
 
     s->addr = addr;
+    s->bus = parent_bus;
 
     iommu_io_memory = cpu_register_io_memory(0, iommu_mem_read,
                                              iommu_mem_write, s);
     cpu_register_physical_memory(addr, IOMMU_NREGS * 4, iommu_io_memory);
+    *bus = bus_init(32, iommu_bus_translate_north, s,
+                    iommu_bus_translate_south, s);
 
     register_savevm("iommu", addr, 2, iommu_save, iommu_load, s);
     qemu_register_reset(iommu_reset, s);
 
Index: qemu/hw/sparc32_dma.c
===================================================================
--- qemu.orig/hw/sparc32_dma.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/sparc32_dma.c	2007-10-30 19:29:06.000000000 +0000
@@ -60,6 +60,7 @@
     qemu_irq irq;
     void *iommu;
     qemu_irq dev_reset;
+    qemu_bus *bus;
 };
 
 /* Note: on sparc, the lance 16 bit bus is swapped */
@@ -128,6 +129,27 @@
     }
 }
 
+static DMADriverAIOCB *
+espdma_bus_translate_north(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    DMAState *s = opaque;
+    qemu_iolist *io;
+
+    for (io = request->iolist; io != NULL; io = io->next)
+        io->iov_base = (target_phys_addr_t)s->dmaregs[1];
+    fprintf(stderr, "espdma\n");
+    bus_dump_aiocb(request);
+    bus_translate_north(s->bus, request, is_write);
+    return request;
+}
+
+static DMADriverAIOCB *
+espdma_bus_translate_south(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void espdma_memory_read(void *opaque, uint8_t *buf, int len)
 {
     DMAState *s = opaque;
@@ -238,7 +260,8 @@
 }
 
 void *sparc32_dma_init(target_phys_addr_t daddr, qemu_irq parent_irq,
-                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset)
+                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset,
+                       qemu_bus *parent_bus, qemu_bus **bus)
 {
     DMAState *s;
     int dma_io_memory;
@@ -258,6 +281,9 @@
     *dev_irq = qemu_allocate_irqs(dma_set_irq, s, 1);
     *reset = &s->dev_reset;
+    s->bus = parent_bus;
+    *bus = bus_init(32, espdma_bus_translate_north, s,
+                    espdma_bus_translate_south, s);
 
     return s;
 }
 
Index: qemu/hw/esp.c
===================================================================
--- qemu.orig/hw/esp.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/esp.c	2007-10-30 19:27:46.000000000 +0000
@@ -74,6 +74,7 @@
     uint8_t *async_buf;
     uint32_t async_len;
     void *dma_opaque;
+    qemu_bus *bus;
 };
 
 #define STAT_DO 0x00
@@ -144,15 +145,25 @@
     datalen = scsi_send_command(s->current_dev, 0, &buf[1], lun);
     s->ti_size = datalen;
     if (datalen != 0) {
+        DMADriverAIOCB *d;
+
         s->rregs[4] = STAT_IN | STAT_TC;
         s->dma_left = 0;
        s->dma_counter = 0;
        if (datalen > 0) {
+            d = bus_build_aiocb(0, datalen);
+            fprintf(stderr, "esp\n");
+            bus_dump_aiocb(d);
+            bus_translate_north(s->bus, d, 1);
            s->rregs[4] |= STAT_DI;
-            scsi_read_data(s->current_dev, 0);
+            scsi_read_data_aio(s->current_dev, 0, d);
        } else {
+            d = bus_build_aiocb(0, -datalen);
+            fprintf(stderr, "esp\n");
+            bus_dump_aiocb(d);
+            bus_translate_north(s->bus, d, 0);
            s->rregs[4] |= STAT_DO;
-            scsi_write_data(s->current_dev, 0);
+            scsi_write_data_aio(s->current_dev, 0, d);
        }
    }
    s->rregs[5] = INTR_BS | INTR_FC;
@@ -330,6 +341,22 @@
     }
 }
 
+static DMADriverAIOCB *
+esp_bus_translate_north(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping?
+    fprintf(stderr, "esp\n");
+    bus_dump_aiocb(request);
+    return request;
+}
+
+static DMADriverAIOCB *
+esp_bus_translate_south(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping
+    return request;
+}
+
 static void esp_reset(void *opaque)
 {
     ESPState *s = opaque;
@@ -575,7 +602,8 @@
 }
 
 void *esp_init(BlockDriverState **bd, target_phys_addr_t espaddr,
-               void *dma_opaque, qemu_irq irq, qemu_irq *reset)
+               void *dma_opaque, qemu_irq irq, qemu_irq *reset,
+               qemu_bus *parent_bus, qemu_bus **bus)
 {
     ESPState *s;
     int esp_io_memory;
@@ -587,9 +615,11 @@
     s->bd = bd;
     s->irq = irq;
     s->dma_opaque = dma_opaque;
-
+    s->bus = parent_bus;
     esp_io_memory = cpu_register_io_memory(0, esp_mem_read, esp_mem_write, s);
     cpu_register_physical_memory(espaddr, ESP_SIZE, esp_io_memory);
+    *bus = bus_init(32, esp_bus_translate_north, s,
+                    esp_bus_translate_south, s);
 
     esp_reset(s);
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/scsi-disk.c	2007-10-30 18:55:19.000000000 +0000
@@ -199,6 +199,16 @@
     r->sector_count -= n;
 }
 
+void scsi_read_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d)
+{
+    scsi_read_data(s, tag);
+}
+
+int scsi_write_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d)
+{
+    return scsi_write_data(s, tag);
+}
+
 static void scsi_write_complete(void * opaque, int ret)
 {
     SCSIRequest *r = (SCSIRequest *)opaque;
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-28 20:55 UTC
To: Paul Brook; Cc: qemu-devel

I made a new patch sketching the system. It doesn't even compile, but
it should give a view of how this would be put to work. On the down
side, new memory needs to be allocated to generate new vectors from
the previous ones, which may kill some of the performance. Also,
supporting DMA to MMIO registers can't be done with pure translation
only.

I'm not too happy with this model anymore; maybe the model I proposed
earlier is better. But that one can't reach zero copy.

[-- Attachment: gdma_aiov.diff --]

Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-28 13:26:35.000000000 +0000
+++ qemu/vl.h	2007-10-28 20:25:27.000000000 +0000
@@ -746,6 +746,78 @@
 
 #include "hw/irq.h"
 
+/* Generic DMA API */
+
+typedef void DMADriverCompletionFunc(void *opaque, int ret);
+
+struct qemu_iovec {
+    target_phys_addr_t iov_base;
+    size_t iov_len;
+};
+
+typedef struct DMADriverAIOCB DMADriverAIOCB;
+
+typedef DMADriverAIOCB *
+DMATranslationHandler(void *opaque, DMADriverAIOCB *request);
+
+typedef struct DMACompletionEntry {
+    DMATranslationHandler *func;
+    void *opaque;
+    struct DMACompletionEntry *next;
+} DMACompletionEntry;
+
+struct DMADriverAIOCB {
+    unsigned int nent;
+    struct qemu_iovec *vector;
+    DMACompletionEntry *cb;
+    struct DMADriverAIOCB *next;
+};
+
+typedef struct qemu_bus {
+    unsigned int bus_bits;
+    DMATranslationHandler *north_handler;
+    void *north_handler_opaque;
+    DMATranslationHandler *south_handler;
+    void *south_handler_opaque;
+} qemu_bus;
+
+qemu_bus *
+bus_init(unsigned int bus_bits,
+         DMATranslationHandler north_handler,
+         void *north_handler_opaque,
+         DMATranslationHandler south_handler,
+         void *south_handler_opaque);
+
+/* Direction CPU->bridge->device/memory */
+static inline DMADriverAIOCB *
+bus_translate_south(qemu_bus *bus, DMADriverAIOCB *request)
+{
+    return bus->south_handler(bus->south_handler_opaque, request);
+}
+
+/* From device towards CPU/memory (DMA) */
+static inline DMADriverAIOCB *
+bus_translate_north(qemu_bus *bus, DMADriverAIOCB *request)
+{
+    return bus->north_handler(bus->north_handler_opaque, request);
+}
+
+static inline DMADriverAIOCB *
+bus_build_aiocb(const struct qemu_iovec *vector, unsigned int count)
+{
+    DMADriverAIOCB *d;
+
+    d = qemu_mallocz(sizeof(DMADriverAIOCB));
+    d->nent = count;
+    d->vector = qemu_mallocz(count * sizeof(struct qemu_iovec));
+    memcpy(d->vector, vector, count * sizeof(struct qemu_iovec));
+    return d;
+}
+
+DMADriverAIOCB *physical_memory_bus_translate_north(void *opaque,
+                                                    DMADriverAIOCB *request);
+DMADriverAIOCB *physical_memory_bus_translate_south(void *opaque,
+                                                    DMADriverAIOCB *request);
+
 /* ISA bus */
 extern target_phys_addr_t isa_mem_base;
 
Index: qemu/hw/sun4m.c
===================================================================
--- qemu.orig/hw/sun4m.c	2007-10-28 19:12:59.000000000 +0000
+++ qemu/hw/sun4m.c	2007-10-28 20:24:59.000000000 +0000
@@ -306,6 +306,27 @@
     env->halted = 1;
 }
 
+DMADriverAIOCB *physical_memory_bus_translate_north(void *opaque,
+                                                    DMADriverAIOCB *request)
+{
+    unsigned int i;
+
+    for (i = 0; i < request->nent; i++) {
+        if (request->vector[i].iov_base < phys_ram_size)
+            request->vector[i].iov_base += (unsigned long)phys_ram_base;
+        else
+            request->vector[i].iov_len = 0;
+    }
+    return request;
+}
+
+DMADriverAIOCB *physical_memory_bus_translate_south(void *opaque,
+                                                    DMADriverAIOCB *request)
+{
+    // Does not exist?
+    return request;
+}
+
 static void *sun4m_hw_init(const struct hwdef *hwdef, int RAM_size,
                            DisplayState *ds, const char *cpu_model)
 
Index: qemu/hw/iommu.c
===================================================================
--- qemu.orig/hw/iommu.c	2007-10-28 19:12:35.000000000 +0000
+++ qemu/hw/iommu.c	2007-10-28 20:29:23.000000000 +0000
@@ -244,6 +244,24 @@
     s->regs[IOMMU_AFAR] = addr;
 }
 
+DMADriverAIOCB *iommu_bus_translate_north(void *opaque,
+                                          DMADriverAIOCB *request)
+{
+    unsigned int i;
+
+    // alloc a new vector
+    for (i = 0; i < request->nent; i++) {
+        //translate_vector(request->vector[i]);
+    }
+    return request;
+}
+
+DMADriverAIOCB *iommu_bus_translate_south(void *opaque,
+                                          DMADriverAIOCB *request)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void sparc_iommu_memory_rw(void *opaque, target_phys_addr_t addr,
                            uint8_t *buf, int len, int is_write)
 {
 
Index: qemu/hw/sparc32_dma.c
===================================================================
--- qemu.orig/hw/sparc32_dma.c	2007-10-28 19:12:54.000000000 +0000
+++ qemu/hw/sparc32_dma.c	2007-10-28 20:30:45.000000000 +0000
@@ -128,6 +128,25 @@
     }
 }
 
+DMADriverAIOCB *espdma_bus_translate_north(void *opaque,
+                                           DMADriverAIOCB *request)
+{
+    DMAState *s = opaque;
+    unsigned int i;
+
+    for (i = 0; i < request->nent; i++) {
+        request->vector[i].iov_base |= s->dmaregs[1];
+    }
+    return request;
+}
+
+DMADriverAIOCB *espdma_bus_translate_south(void *opaque,
+                                           DMADriverAIOCB *request)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void espdma_memory_read(void *opaque, uint8_t *buf, int len)
 {
     DMAState *s = opaque;
 
Index: qemu/hw/esp.c
===================================================================
--- qemu.orig/hw/esp.c	2007-10-28 18:57:27.000000000 +0000
+++ qemu/hw/esp.c	2007-10-28 20:37:57.000000000 +0000
@@ -144,15 +144,25 @@
     datalen = scsi_send_command(s->current_dev, 0, &buf[1], lun);
     s->ti_size = datalen;
     if (datalen != 0) {
+        DMADriverAIOCB *d;
+        struct qemu_iovec iov;
+
         s->rregs[4] = STAT_IN | STAT_TC;
         s->dma_left = 0;
         s->dma_counter = 0;
+        iov.iov_base = 0;
         if (datalen > 0) {
+            iov.iov_len = datalen;
+            d = bus_build_aiocb(&iov, 1);
+            bus_translate_north(s->bus, d);
             s->rregs[4] |= STAT_DI;
-            scsi_read_data(s->current_dev, 0);
+            scsi_read_data(s->current_dev, 0, d);
         } else {
+            iov.iov_len = -datalen;
+            d = bus_build_aiocb(&iov, 1);
+            bus_translate_north(s->bus, d);
             s->rregs[4] |= STAT_DO;
-            scsi_write_data(s->current_dev, 0);
+            scsi_write_data(s->current_dev, 0, d);
         }
     }
     s->rregs[5] = INTR_BS | INTR_FC;
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2007-10-28 20:30:35.000000000 +0000
+++ qemu/hw/scsi-disk.c	2007-10-28 20:38:49.000000000 +0000
@@ -162,7 +162,7 @@
 }
 
 /* Read more data from scsi device into buffer.  */
-void scsi_read_data(SCSIDevice *s, uint32_t tag)
+void scsi_read_data(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *request)
 {
     SCSIRequest *r;
     uint32_t n;
@@ -191,7 +191,7 @@
 
     n = SCSI_DMA_BUF_SIZE / 512;
     r->buf_len = n * 512;
-    r->aiocb = bdrv_aio_read(s->bdrv, r->sector, r->dma_buf, n,
+    r->aiocb = bdrv_aio_read(s->bdrv, r->sector, request, n,
                              scsi_read_complete, r);
     if (r->aiocb == NULL)
         scsi_command_complete(r, SENSE_HARDWARE_ERROR);