* [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-27 12:56 UTC
To: qemu-devel

Hi,

I changed Slirp output to use vectored IO to avoid the slowdown from
memcpy (see the attached work-in-progress patch; it gives a small
performance improvement). But then I got the idea that using AIO would
be nice at the outgoing end of the network IO processing. In fact, a
vectored AIO model could even be used for generic DMA! The benefit is
that no buffering or copying should be needed. Instead of

void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                            int len, int is_write);

and its device variant, we'd have something like

int qemu_lio_listio(int mode, struct GenericAIOcb *list[],
                    unsigned int nent, IOCompletionFunc *cb);

Each stage would translate the IO list and callback as needed, and only
the final stage would perform the IO or memcpy. This would be used at
each stage of the chain memory <-> IOMMU <-> device <-> Slirp <-> host
network device. Of course, some kind of host support for vectored AIO
for these devices is required. On the target side, devices that can do
scatter/gather DMA would benefit most.

For the specific Sparc32 case, Lance bus byte swapping unfortunately
makes buffering necessary at that stage, unless we can make N vectors
of a single byte each faster than memcpy + bswap of a memory block of
size N.

Comments?

[-- Attachment: slirp_iov.diff --]

Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2007-10-27 07:06:30.000000000 +0000
+++ qemu/vl.c	2007-10-27 11:06:09.000000000 +0000
@@ -1540,8 +1540,46 @@
 }
 
 /***********************************************************/
-/* character device */
+/* Helpers for vectored IO */
 
+static void qemu_readv_with_read(void *opaque, IOReadHandler *fd_read,
+                                 const struct qemu_iovec *vector, int count)
+{
+#if 1
+    int i, currlen = 0;
+    char buf[8192];
+
+    if (fd_read) {
+        for (i = 0; i < count; i++) {
+            if (currlen + vector[i].iov_len < sizeof(buf))
+                memcpy(&buf[currlen], vector[i].iov_base,
+                       vector[i].iov_len);
+            else
+                fprintf(stderr, "bad currlen %d iov.len %ld\n", currlen,
+                        vector[i].iov_len);
+            currlen += vector[i].iov_len;
+        }
+        fd_read(opaque, buf, currlen);
+    }
+#else
+    int i;
+
+    if (fd_read)
+        for (i = 0; i < count; i++)
+            fd_read(opaque, vector[i].iov_base, vector[i].iov_len);
+#endif
+}
+
+static void qemu_read_with_readv(void *opaque, IOReadvHandler *fd_readv,
+                                 const uint8_t *buf, int size)
+{
+    struct qemu_iovec iov;
+
+    iov.iov_base = buf;
+    iov.iov_len = size;
+    if (fd_readv)
+        fd_readv(opaque, &iov, 1);
+}
+
+/***********************************************************/
+/* character device */
 static void qemu_chr_event(CharDriverState *s, int event)
 {
     if (!s->chr_event)
@@ -3573,6 +3611,18 @@
     return vc;
 }
 
+VLANClientState *qemu_new_vlan_client_iov(VLANState *vlan,
+                                          IOReadvHandler *fd_readv,
+                                          IOCanRWHandler *fd_can_read,
+                                          void *opaque)
+{
+    VLANClientState *vc;
+
+    vc = qemu_new_vlan_client(vlan, NULL, fd_can_read, opaque);
+    vc->fd_readv = fd_readv;
+    return vc;
+}
+
 int qemu_can_send_packet(VLANClientState *vc1)
 {
     VLANState *vlan = vc1->vlan;
@@ -3598,7 +3648,26 @@
 #endif
     for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
         if (vc != vc1) {
-            vc->fd_read(vc->opaque, buf, size);
+            if (vc->fd_read)
+                vc->fd_read(vc->opaque, buf, size);
+            else if (vc->fd_readv)
+                qemu_read_with_readv(vc->opaque, vc->fd_readv, buf, size);
+        }
+    }
+}
+
+void qemu_send_packet_iov(VLANClientState *vc1,
+                          const struct qemu_iovec *vector, int count)
+{
+    VLANState *vlan = vc1->vlan;
+    VLANClientState *vc;
+
+    for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
+        if (vc != vc1) {
+            if (vc->fd_readv)
+                vc->fd_readv(vc->opaque, vector, count);
+            else if (vc->fd_read)
+                qemu_readv_with_read(vc->opaque, vc->fd_read, vector, count);
         }
     }
 }
@@ -3626,6 +3695,13 @@
     qemu_send_packet(slirp_vc, pkt, pkt_len);
 }
 
+void slirp_output_iov(const struct qemu_iovec *vector, int count)
+{
+    if (!slirp_vc)
+        return;
+    qemu_send_packet_iov(slirp_vc, vector, count);
+}
+
 static void slirp_receive(void *opaque, const uint8_t *buf, int size)
 {
 #if 0
@@ -4944,13 +5020,12 @@
 
 static IOHandlerRecord *first_io_handler;
 
-/* XXX: fd_read_poll should be suppressed, but an API change is
-   necessary in the character devices to suppress fd_can_read(). */
-int qemu_set_fd_handler2(int fd,
-                         IOCanRWHandler *fd_read_poll,
-                         IOHandler *fd_read,
-                         IOHandler *fd_write,
-                         void *opaque)
+static IOHandlerRecord *
+qemu_set_fd_handler3(int fd,
+                     IOCanRWHandler *fd_read_poll,
+                     IOHandler *fd_read,
+                     IOHandler *fd_write,
+                     void *opaque)
 {
     IOHandlerRecord **pioh, *ioh;
 
@@ -4973,17 +5048,38 @@
         }
         ioh = qemu_mallocz(sizeof(IOHandlerRecord));
         if (!ioh)
-            return -1;
+            return NULL;
         ioh->next = first_io_handler;
         first_io_handler = ioh;
     found:
         ioh->fd = fd;
         ioh->fd_read_poll = fd_read_poll;
         ioh->fd_read = fd_read;
+#if 0
+        if (!fd_read)
+            ioh->fd_readv = NULL;
+#endif
         ioh->fd_write = fd_write;
         ioh->opaque = opaque;
         ioh->deleted = 0;
     }
+    return ioh;
+}
+
+/* XXX: fd_read_poll should be suppressed, but an API change is
+   necessary in the character devices to suppress fd_can_read(). */
+int qemu_set_fd_handler2(int fd,
+                         IOCanRWHandler *fd_read_poll,
+                         IOHandler *fd_read,
+                         IOHandler *fd_write,
+                         void *opaque)
+{
+    IOHandlerRecord *ioh;
+
+    ioh = qemu_set_fd_handler3(fd, NULL, fd_read, fd_write, opaque);
+    if (!ioh)
+        return -1;
     return 0;
 }
 
@@ -4995,6 +5091,25 @@
     return qemu_set_fd_handler2(fd, NULL, fd_read, fd_write, opaque);
 }
 
+#if 0
+int qemu_set_fd_handler_iov(int fd,
+                            IOHandler *fd_readv,
+                            IOHandler *fd_writev,
+                            void *opaque)
+{
+    IOHandlerRecord *ioh;
+
+    ioh = qemu_set_fd_handler3(fd, NULL, NULL, NULL, opaque);
+    if (!ioh)
+        return -1;
+
+    ioh->fd_readv = fd_readv;
+    ioh->fd_writev = fd_writev;
+
+    return 0;
+}
+#endif
+
 /***********************************************************/
 /* Polling handling */
 
Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-27 07:11:15.000000000 +0000
+++ qemu/vl.h	2007-10-27 08:03:00.000000000 +0000
@@ -263,6 +263,12 @@
 
 /* async I/O support */
 
 typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
+struct qemu_iovec {
+    void *iov_base;
+    size_t iov_len;
+};
+typedef void IOReadvHandler(void *opaque, const struct qemu_iovec *vector,
+                            int count);
 typedef int IOCanRWHandler(void *opaque);
 typedef void IOHandler(void *opaque);
 
@@ -275,6 +281,10 @@
                          IOHandler *fd_read,
                          IOHandler *fd_write,
                          void *opaque);
+int qemu_set_fd_handler_iov(int fd,
+                            IOHandler *fd_readv,
+                            IOHandler *fd_writev,
+                            void *opaque);
 
 /* Polling handling */
 
@@ -396,6 +406,7 @@
 struct VLANClientState {
     IOReadHandler *fd_read;
+    IOReadvHandler *fd_readv;
     /* Packets may still be sent if this returns zero. It's used to
        rate-limit the slirp code. */
     IOCanRWHandler *fd_can_read;
@@ -417,8 +428,14 @@
                                       IOReadHandler *fd_read,
                                       IOCanRWHandler *fd_can_read,
                                       void *opaque);
+VLANClientState *qemu_new_vlan_client_iov(VLANState *vlan,
+                                          IOReadvHandler *fd_readv,
+                                          IOCanRWHandler *fd_can_read,
+                                          void *opaque);
 int qemu_can_send_packet(VLANClientState *vc);
 void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
+void qemu_send_packet_iov(VLANClientState *vc,
+                          const struct qemu_iovec *vector, int count);
 void qemu_handler_true(void *opaque);
 
 void do_info_network(void);
 
Index: qemu/slirp/libslirp.h
===================================================================
--- qemu.orig/slirp/libslirp.h	2007-10-27 07:06:13.000000000 +0000
+++ qemu/slirp/libslirp.h	2007-10-27 08:18:10.000000000 +0000
@@ -17,6 +17,8 @@
 /* you must provide the following functions: */
 int slirp_can_output(void);
 void slirp_output(const uint8_t *pkt, int pkt_len);
+struct qemu_iovec;
+void slirp_output_iov(const struct qemu_iovec *vector, int count);
 
 int slirp_redir(int is_udp, int host_port,
                 struct in_addr guest_addr, int guest_port);
 
Index: qemu/slirp/slirp.c
===================================================================
--- qemu.orig/slirp/slirp.c	2007-10-27 07:06:19.000000000 +0000
+++ qemu/slirp/slirp.c	2007-10-27 10:44:18.000000000 +0000
@@ -636,19 +636,22 @@
 /* output the IP packet to the ethernet device */
 void if_encap(const uint8_t *ip_data, int ip_data_len)
 {
-    uint8_t buf[1600];
-    struct ethhdr *eh = (struct ethhdr *)buf;
-
-    if (ip_data_len + ETH_HLEN > sizeof(buf))
-        return;
+    struct ethhdr buf, *eh = &buf;
+    struct {
+        void *data;
+        size_t len;
+    } iov[2];
 
     memcpy(eh->h_dest, client_ethaddr, ETH_ALEN);
     memcpy(eh->h_source, special_ethaddr, ETH_ALEN - 1);
     /* XXX: not correct */
     eh->h_source[5] = CTL_ALIAS;
     eh->h_proto = htons(ETH_P_IP);
-    memcpy(buf + sizeof(struct ethhdr), ip_data, ip_data_len);
-    slirp_output(buf, ip_data_len + ETH_HLEN);
+    iov[0].data = &buf;
+    iov[0].len = sizeof(struct ethhdr);
+    iov[1].data = ip_data;
+    iov[1].len = ip_data_len;
+    slirp_output_iov(iov, 2);
 }
 
 int slirp_redir(int is_udp, int host_port,
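For reference, the proposed qemu_lio_listio() above is shaped after the
POSIX lio_listio(3) host interface. A minimal, self-contained host-side
example of that interface follows; it uses only standard POSIX AIO, the
file name and block sizes are arbitrary, and on Linux it links with -lrt:

/* Submit two vectored async writes with lio_listio() and wait for both. */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    char a[512] = "first block", b[512] = "second block";
    struct aiocb cb[2];
    struct aiocb *list[2] = { &cb[0], &cb[1] };
    int fd = open("scratch.bin", O_CREAT | O_WRONLY | O_TRUNC, 0644);

    if (fd < 0)
        return 1;
    memset(cb, 0, sizeof(cb));
    cb[0].aio_fildes = fd;
    cb[0].aio_buf = a;
    cb[0].aio_nbytes = sizeof(a);
    cb[0].aio_offset = 0;
    cb[0].aio_lio_opcode = LIO_WRITE;
    cb[1] = cb[0];
    cb[1].aio_buf = b;
    cb[1].aio_offset = sizeof(a);

    /* LIO_WAIT blocks until both requests complete; LIO_NOWAIT plus a
       sigevent would give the completion-callback style discussed above. */
    if (lio_listio(LIO_WAIT, list, 2, NULL) < 0) {
        perror("lio_listio");
        return 1;
    }
    printf("wrote %zd and %zd bytes\n",
           aio_return(&cb[0]), aio_return(&cb[1]));
    close(fd);
    return 0;
}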
* [Qemu-devel] Re: Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-27 16:53 UTC
To: qemu-devel

On 10/27/07, Blue Swirl <blauwirbel@gmail.com> wrote:
> I changed Slirp output to use vectored IO to avoid the slowdown from
> memcpy (see the attached work-in-progress patch; it gives a small
> performance improvement). But then I got the idea that using AIO would
> be nice at the outgoing end of the network IO processing. In fact, a
> vectored AIO model could even be used for generic DMA! The benefit is
> that no buffering or copying should be needed.

I made a sketch of the API, please have a look at the patch.

> Each stage would translate the IO list and callback as needed, and only
> the final stage would perform the IO or memcpy. This would be used at
> each stage of the chain memory <-> IOMMU <-> device <-> Slirp <-> host
> network device. Of course, some kind of host support for vectored AIO
> for these devices is required. On the target side, devices that can do
> scatter/gather DMA would benefit most.

Inside Qemu the vectors would use target physical addresses (struct
qemu_iovec), but at some point the addresses would change to host
pointers suitable for real AIO.

[-- Attachment: gdma_aiov.diff --]

Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-27 14:58:09.000000000 +0000
+++ qemu/vl.h	2007-10-27 16:51:40.000000000 +0000
@@ -746,6 +746,85 @@
 
 #include "hw/irq.h"
 
+/* Generic DMA API */
+
+typedef void DMADriverCompletionFunc(void *opaque, int ret);
+
+struct qemu_iovec {
+    target_phys_addr_t iov_base;
+    size_t iov_len;
+};
+
+typedef struct qemu_bus qemu_bus;
+
+typedef struct DMADriverAIOCB {
+    void *opaque;
+    int type;
+    int nent;
+    struct aiocb **aiocb;
+    DMADriverCompletionFunc *cb;
+    struct DMADriverAIOCB *next;
+} DMADriverAIOCB;
+
+typedef void DMARWHandler(void *opaque,
+                          const struct qemu_iovec *dst_vector,
+                          int dst_count,
+                          const struct qemu_iovec *src_vector,
+                          int src_count);
+
+qemu_bus *bus_init(unsigned int bus_bits, DMARWHandler north_handler,
+                   void *north_handler_opaque, DMARWHandler south_handler,
+                   void *south_handler_opaque);
+
+/* Direction CPU->bridge->device/memory */
+void bus_rw_south(qemu_bus *bus,
+                  const struct qemu_iovec *dst_vector,
+                  int dst_count,
+                  const struct qemu_iovec *src_vector,
+                  int src_count,
+                  int is_write);
+
+static inline void bus_read_south(qemu_bus *bus,
+                                  const struct qemu_iovec *dst_vector,
+                                  int dst_count,
+                                  const struct qemu_iovec *src_vector,
+                                  int src_count)
+{
+    bus_rw_south(bus, dst_vector, dst_count, src_vector, src_count, 0);
+}
+
+static inline void bus_write_south(qemu_bus *bus,
+                                   const struct qemu_iovec *dst_vector,
+                                   int dst_count,
+                                   const struct qemu_iovec *src_vector,
+                                   int src_count)
+{
+    bus_rw_south(bus, dst_vector, dst_count, src_vector, src_count, 1);
+}
+
+/* From device towards CPU/memory (DMA) */
+void bus_rw_north(qemu_bus *bus,
+                  const struct qemu_iovec *dst_vector,
+                  int dst_count,
+                  const struct qemu_iovec *src_vector,
+                  int src_count,
+                  int is_write);
+
+static inline void bus_read_north(qemu_bus *bus,
+                                  const struct qemu_iovec *dst_vector,
+                                  int dst_count,
+                                  const struct qemu_iovec *src_vector,
+                                  int src_count)
+{
+    bus_rw_north(bus, dst_vector, dst_count, src_vector, src_count, 0);
+}
+
+static inline void bus_write_north(qemu_bus *bus,
+                                   const struct qemu_iovec *dst_vector,
+                                   int dst_count,
+                                   const struct qemu_iovec *src_vector,
+                                   int src_count)
+{
+    bus_rw_north(bus, dst_vector, dst_count, src_vector, src_count, 1);
+}
+
 /* ISA bus */
 extern target_phys_addr_t isa_mem_base;
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Paul Brook @ 2007-10-28 1:29 UTC
To: qemu-devel; Cc: Blue Swirl

> I changed Slirp output to use vectored IO to avoid the slowdown from
> memcpy (see the attached work-in-progress patch; it gives a small
> performance improvement). But then I got the idea that using AIO would
> be nice at the outgoing end of the network IO processing. In fact, a
> vectored AIO model could even be used for generic DMA! The benefit is
> that no buffering or copying should be needed.

An interesting idea; however, I don't want to underestimate the
difficulty of implementing this correctly. I suspect that to get real
benefits you need to support zero-copy async operation all the way
through. Things get really hairy if you allow some operations to
complete synchronously and some to be deferred.

I've done async operation for SCSI and USB. The latter is really not
pretty, and the former has some notable warts. A generic IO/DMA
framework needs to make sure it covers these requirements without
making things worse. Hopefully it'll also help fix the things that are
wrong with them.

> For the specific Sparc32 case, Lance bus byte swapping unfortunately
> makes buffering necessary at that stage, unless we can make N vectors
> of a single byte each faster than memcpy + bswap of a memory block of
> size N.

We really want to be dealing with largeish blocks. A {ptr,size} vector
element is 64 or 128 bits, so the overhead on blocks < 64 bytes is
going to be really brutal. Also, the time taken to do address
translation will be O(number of vectors).

> Inside Qemu the vectors would use target physical addresses (struct
> qemu_iovec), but at some point the addresses would change to host
> pointers suitable for real AIO.

Phrases like "at some point" worry me :-)

I think it would be good to get a top-down description of what each
different entity (initiating device, host endpoint, bus translation,
memory) is responsible for, and how they all fit together.

I have some ideas, but without more detailed investigation I can't tell
whether they will actually work in practice, or whether they fit into
the code fragments you've posted. My suspicion is that they don't, as I
can't make head or tail of how your gdma_aiov.diff patch would be used
in practice.

Paul
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-28 9:09 UTC
To: Paul Brook; Cc: qemu-devel

On 10/28/07, Paul Brook <paul@codesourcery.com> wrote:
> > I changed Slirp output to use vectored IO to avoid the slowdown from
> > memcpy (see the attached work-in-progress patch; it gives a small
> > performance improvement). But then I got the idea that using AIO
> > would be nice at the outgoing end of the network IO processing. In
> > fact, a vectored AIO model could even be used for generic DMA! The
> > benefit is that no buffering or copying should be needed.
>
> An interesting idea; however, I don't want to underestimate the
> difficulty of implementing this correctly. I suspect that to get real
> benefits you need to support zero-copy async operation all the way
> through. Things get really hairy if you allow some operations to
> complete synchronously and some to be deferred.

Zero-copy can be the first goal, async may come later. I hope we can do
the change in stages, perhaps introducing temporary conversion helpers
as needed.

> I've done async operation for SCSI and USB. The latter is really not
> pretty, and the former has some notable warts. A generic IO/DMA
> framework needs to make sure it covers these requirements without
> making things worse. Hopefully it'll also help fix the things that are
> wrong with them.
>
> > For the specific Sparc32 case, Lance bus byte swapping unfortunately
> > makes buffering necessary at that stage, unless we can make N
> > vectors of a single byte each faster than memcpy + bswap of a memory
> > block of size N.
>
> We really want to be dealing with largeish blocks. A {ptr,size} vector
> element is 64 or 128 bits, so the overhead on blocks < 64 bytes is
> going to be really brutal. Also, the time taken to do address
> translation will be O(number of vectors).

That's what I suspected as well.

> > Inside Qemu the vectors would use target physical addresses (struct
> > qemu_iovec), but at some point the addresses would change to host
> > pointers suitable for real AIO.
>
> Phrases like "at some point" worry me :-)
>
> I think it would be good to get a top-down description of what each
> different entity (initiating device, host endpoint, bus translation,
> memory) is responsible for, and how they all fit together.
>
> I have some ideas, but without more detailed investigation I can't
> tell whether they will actually work in practice, or whether they fit
> into the code fragments you've posted. My suspicion is that they
> don't, as I can't make head or tail of how your gdma_aiov.diff patch
> would be used in practice.

Ok, I'll try a mental exercise with this chain:
SCSI -> ESP -> ESPDMA -> IOMMU -> memory write.

Scenario: a SCSI read is issued, 8k size. I'll track the (address,
size) vectors at each stage.

scsi-disk uses host memory addresses. ESP uses addresses ranging from 0
to the end of the request. ESPDMA forces the MS byte to 0xfc. The IOMMU
translates page 0xfc000000 to 0x1000 and page 0xfc001000 to 0x4000.
Memory translates 0x1000 to phys_ram_base + 0x1000, and likewise for
0x4000. From this point on, we are using host memory addresses again.
Each stage may change the callback if needed.

Currently scsi-disk provides a buffer. For true zero copy, this needs
to be changed so that the buffer is instead provided by the caller at
each stage until we reach host memory. But I'll use the scsi-disk
buffer for now.

Initially, the (address, size) vector provided by scsi-disk is:
src_vector = (&SCSIDevice->SCSIRequest->dma_buf[0], 8192). What's the
destination vector, (NULL, 0)?

scsi-disk calls bus_write_north, which transfers control to ESP.
ESP changes the vectors to
src_vector = (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst_vector = (0, 8192), and calls bus_write_north -> ESPDMA.
ESPDMA: src (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst (0xfc000000, 8192) -> IOMMU.
After IOMMU: src (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst ((0x1000, 4096), (0x4000, 4096)).
After memory: src (&SCSIDevice->SCSIRequest->dma_buf[0], 8192),
dst ((phys_ram_base + 0x1000, 4096), (phys_ram_base + 0x4000, 4096)).

scsi-disk or memory (which?) can now perform the memcpy. But now we
also have the information to perform the disk read without copying.
Do we need the source vectors at all?

Let's try the other direction, a SCSI write; the other parameters are
unchanged. Now the destination is the scsi-disk buffer, and the source
vector is what gets translated:
src = (NULL, 0), dst = (&SCSIDevice->SCSIRequest->dma_buf[0], 8192).

scsi-disk calls bus_read_north, which transfers control to ESP.
ESP: src = (0, 8192), dst unchanged.
ESPDMA: src = (0xfc000000, 8192), dst unchanged.
IOMMU: src ((0x1000, 4096), (0x4000, 4096)).
Memory: src ((phys_ram_base + 0x1000, 4096),
(phys_ram_base + 0x4000, 4096)).

Having made this exercise, I think we only need a translation function:
it changes the addresses and adds a callback to handle the intermediate
buffers.

Let's try this improved model in a more complex scenario:
SLIRP TCP socket (host) -> SLIRP IP -> SLIRP interface -> Lance ->
LEDMA -> IOMMU -> memory.

SLIRP IP adds the IP headers, and the SLIRP Ethernet link adds the
Ethernet headers. LEDMA must buffer the data to perform byte swapping,
and the MS byte is forced to 0xfc. For the IOMMU we reuse the disk
parameters.

We need to give a buffer to the host recvmsg(). How can we determine
the buffer size? We really know that only after the packet has been
received. Let's pick a packet size of 4096. Writing s() for sizeof():

TCP: (0, 4096)
IP adds iphdr: ((0, s(iphdr)), (s(iphdr), 4096))
Link adds the Ethernet header: ((0, s(ethhdr)),
(s(ethhdr), s(iphdr)), (s(ethhdr) + s(iphdr), 4096))

Lance searches the receive descriptors for a buffer. We need to set a
bit to indicate that the buffer is in use, so a callback is needed.
The translated vectors are: ((0x1000, s(ethhdr)),
(0x1000 + s(ethhdr), s(iphdr)),
(0x1000 + s(ethhdr) + s(iphdr), 4096)), callback lance_rmd_store.

LEDMA provides a bswap translation buffer. So, for TCP the final
vectors are:
((&DMAState->lebuffer[0], s(ethhdr)),
(&DMAState->lebuffer[s(ethhdr)], s(iphdr)),
(&DMAState->lebuffer[s(ethhdr) + s(iphdr)], 4096)),
callbacks (le_bswap_buffer, lance_rmd_store).

TCP recv writes to lebuffer with AIO, and the callback le_bswap_buffer
is issued. le_bswap_buffer performs the bswap and wants to copy the
buffer to target memory. The new vector is (0xfc000000, 4k + headers)
(can we merge the vectors at this point?), callback lance_rmd_store.
IOMMU: ((0x1000, 4096), (0x4000, headers))
Memory: ((phys_ram_base + 0x1000, 4096),
(phys_ram_base + 0x4000, headers)).

lance_rmd_store is called: it memcpys from lebuffer to the destination,
updates the descriptor (another translation), raises the IRQ, etc.

I think this should work.
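A standalone sketch of the read-direction translation walk above: the
extent type and the stage functions are invented for illustration (none
of this is QEMU API), and the page mappings, request size and host base
are the ones assumed in the example (0xfc000000 -> 0x1000, 0xfc001000
-> 0x4000, an 8k request):

/* Walk one (address, size) extent list through ESPDMA -> IOMMU -> memory. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define MAX_EXT   16

typedef struct {
    uint64_t base;
    uint64_t len;
} extent;

/* ESPDMA: force the most significant byte of the 32-bit address to 0xfc. */
static void espdma_translate(extent *v, int n)
{
    int i;

    for (i = 0; i < n; i++)
        v[i].base = 0xfc000000ULL | (v[i].base & 0x00ffffffULL);
}

/* IOMMU: the example's two-entry page table. */
static uint64_t iommu_map_page(uint64_t page)
{
    if (page == 0xfc000000ULL)
        return 0x1000ULL;
    if (page == 0xfc001000ULL)
        return 0x4000ULL;
    return (uint64_t)-1;                 /* fault: unmapped page */
}

/* IOMMU: split each extent at page boundaries, then map page by page. */
static int iommu_translate(const extent *in, int n, extent *out)
{
    int i, m = 0;

    for (i = 0; i < n; i++) {
        uint64_t addr = in[i].base, left = in[i].len;

        while (left > 0 && m < MAX_EXT) {
            uint64_t page = addr & ~(PAGE_SIZE - 1);
            uint64_t chunk = page + PAGE_SIZE - addr;

            if (chunk > left)
                chunk = left;
            out[m].base = iommu_map_page(page) + (addr - page);
            out[m].len = chunk;
            m++;
            addr += chunk;
            left -= chunk;
        }
    }
    return m;
}

int main(void)
{
    uint64_t phys_ram_base = 0x2b8e6f820000ULL;  /* arbitrary host base */
    extent src[MAX_EXT] = { { 0, 8192 } };       /* ESP view: (0, 8192) */
    extent dst[MAX_EXT];
    int i, n;

    espdma_translate(src, 1);          /* -> (0xfc000000, 8192)           */
    n = iommu_translate(src, 1, dst);  /* -> (0x1000, 4k), (0x4000, 4k)   */
    for (i = 0; i < n; i++)            /* memory stage: add the host base */
        printf("host %#llx len %llu\n",
               (unsigned long long)(phys_ram_base + dst[i].base),
               (unsigned long long)dst[i].len);
    return 0;
}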
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Jamie Lokier @ 2007-10-28 19:10 UTC
To: qemu-devel; Cc: Paul Brook

Blue Swirl wrote:
> Currently scsi-disk provides a buffer. For true zero copy, this needs
> to be changed so that the buffer is instead provided by the caller at
> each stage until we reach host memory. But I'll use the scsi-disk
> buffer for now.

This might actually work in Qemu.

But in general, a zero-copy I/O interface needs to allow for the
possibility that either the source of the data or the sink might need
to be in charge of buffer allocation for a particular sequence.
Otherwise you get situations where the data has to be copied to meet a
technical constraint of a source or a sink, and the copy could have
been avoided if the addresses had been allocated to meet that
constraint in the first place. The most common technical constraint is
probably the need for large contiguous blocks.

I deal with this in my own program by having an I/O call from source to
sink for requesting memory (through a chain of sources/sinks like your
example, if necessary), but only when the source is preparing to do an
I/O and hasn't yet prepared the data (see the sketch below). If the
data is already prepared before setting up the I/O for a write, then
there's no point asking the sink to allocate memory, and if the sink
has to anyway (e.g. because it needs a large contiguous block), that's
an unavoidable copy.

A couple of examples of sinks with constraints:

 - Can't use writev(). E.g. you're using a slightly old Linux kernel,
   want to do AIO, and it doesn't have async writev(), only async
   write().

 - Writing to a sound card through a memory-mapped ring buffer. The
   sink is the code which opens /dev/dsp, and it can provide buffers
   for zero-copy only if it picks the address where the data will be
   prepared.

 - Async I/O using "database writer"-style separate processes which
   actually do the writes synchronously, with the data passed to them
   through shared memory. Here the sink is the code which sends a
   request to one of the writer processes, and it must use a buffer
   that lives in the mapped shared memory.

-- Jamie
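A rough sketch of the "source asks the sink for memory" call described
above; all names are invented for illustration, and this is not an
existing QEMU or Slirp interface:

/* Sink-driven buffer allocation, so a copy is avoided whenever the sink
   has a placement constraint (contiguous block, mmap'd ring, ...). */
#include <stddef.h>
#include <stdlib.h>

typedef struct Sink Sink;
struct Sink {
    /* Returns a buffer satisfying the sink's constraints, possibly by
       forwarding the request down a chain of sinks; NULL means the sink
       has no preference. */
    void *(*get_buffer)(Sink *s, size_t len);
    /* Consume a filled buffer; zero-copy when it came from get_buffer(). */
    int (*submit)(Sink *s, void *buf, size_t len);
};

/* Source side: ask before preparing the data, not after. */
static int source_send(Sink *s, size_t len)
{
    void *buf = s->get_buffer ? s->get_buffer(s, len) : NULL;

    if (!buf)
        buf = malloc(len);   /* no sink preference: use our own buffer */
    /* ... prepare len bytes of data directly into buf ... */
    return s->submit(s, buf, len);
}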
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-29 19:33 UTC
To: qemu-devel; Cc: Paul Brook

On 10/28/07, Jamie Lokier <jamie@shareable.org> wrote:
> But in general, a zero-copy I/O interface needs to allow for the
> possibility that either the source of the data or the sink might need
> to be in charge of buffer allocation for a particular sequence.
> Otherwise you get situations where the data has to be copied to meet
> a technical constraint of a source or a sink, and the copy could have
> been avoided if the addresses had been allocated to meet that
> constraint in the first place. The most common technical constraint
> is probably the need for large contiguous blocks.
>
> I deal with this in my own program by having an I/O call from source
> to sink for requesting memory (through a chain of sources/sinks like
> your example, if necessary), but only when the source is preparing to
> do an I/O and hasn't yet prepared the data. If the data is already
> prepared before setting up the I/O for a write, then there's no point
> asking the sink to allocate memory, and if the sink has to anyway
> (e.g. because it needs a large contiguous block), that's an
> unavoidable copy.
>
> A couple of examples of sinks with constraints:
>
>  - Can't use writev(). E.g. you're using a slightly old Linux kernel,
>    want to do AIO, and it doesn't have async writev(), only async
>    write().
>
>  - Writing to a sound card through a memory-mapped ring buffer. The
>    sink is the code which opens /dev/dsp, and it can provide buffers
>    for zero-copy only if it picks the address where the data will be
>    prepared.
>
>  - Async I/O using "database writer"-style separate processes which
>    actually do the writes synchronously, with the data passed to them
>    through shared memory. Here the sink is the code which sends a
>    request to one of the writer processes, and it must use a buffer
>    that lives in the mapped shared memory.

I think this also shows that the system may become quite complex. Some
kind of hooks may be needed before and after the transfer.

We could cache the resolved addresses to overcome the additional setup
overhead. Each stage should install cache-invalidation callbacks, or a
method to call for recalculation of the addresses. For example, IOMMU
or ESPDMA mappings change very often.

An IO-vector-based API seems to be hard to use, so a simple list should
be better. The vectors may not be compatible with the host anyway. I'll
make a new version.

It's good to get some feedback. Designing a high-performance IO
framework suitable for all use cases seems to be very challenging.
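One possible shape for the per-stage cache-invalidation callbacks
mentioned above; the names are invented for illustration, and plain
calloc stands in for qemu_mallocz:

/* Consumers that cache resolved addresses register a callback with each
   remapping stage (IOMMU, ESPDMA, ...) they depend on. */
#include <stdlib.h>

typedef void InvalidateFunc(void *opaque);

typedef struct Invalidator {
    InvalidateFunc *func;
    void *opaque;
    struct Invalidator *next;
} Invalidator;

/* Embedded in each stage whose mappings can change. */
typedef struct {
    Invalidator *invalidators;
} MapStage;

static void map_stage_register(MapStage *s, InvalidateFunc *func,
                               void *opaque)
{
    Invalidator *i = calloc(1, sizeof(*i));

    i->func = func;
    i->opaque = opaque;
    i->next = s->invalidators;
    s->invalidators = i;
}

/* Called when the stage's mappings change (IOMMU TLB flush, DMA base
   register write): every cached, already-resolved vector is dropped and
   gets recalculated on next use. */
static void map_stage_mappings_changed(MapStage *s)
{
    Invalidator *i;

    for (i = s->invalidators; i != NULL; i = i->next)
        i->func(i->opaque);
}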
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-30 20:09 UTC
To: qemu-devel; Cc: Paul Brook

On 10/29/07, Blue Swirl <blauwirbel@gmail.com> wrote:
> We could cache the resolved addresses to overcome the additional setup
> overhead. Each stage should install cache-invalidation callbacks, or a
> method to call for recalculation of the addresses. For example, IOMMU
> or ESPDMA mappings change very often.

I meant to write "don't change very often".

This version actually resolves the host memory address, so that
scsi-disk could (with some additional plumbing) write directly to the
final destination. I think both pre- and post-processing hooks may be
needed, but those are not implemented yet.

What about error handling? For example, the first page is OK but the
second is not. Truncate all further blocks and install a
post-processing hook that raises a bus error?

Example output:

esp
DMADriverAIOCB 0x27433f0
IO ranges:
base 0000000000000000 len 0000000000000800
Prehooks:
Posthooks:
espdma
DMADriverAIOCB 0x27433f0
IO ranges:
base 00000000fe00000a len 0000000000000800
Prehooks:
Posthooks:
iommu
DMADriverAIOCB 0x27433f0
IO ranges:
base 0000000007fe100a len 0000000000000800
Prehooks:
Posthooks:
physical
DMADriverAIOCB 0x27433f0
IO ranges:
base 00002b8e6f82200a len 0000000000000800
Prehooks:
Posthooks:

[-- Attachment: gdma_aiov.diff --]

Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-29 16:59:37.000000000 +0000
+++ qemu/vl.h	2007-10-30 19:08:35.000000000 +0000
@@ -746,6 +746,109 @@
 
 #include "hw/irq.h"
 
+/* Generic DMA API */
+
+typedef void DMADriverCompletionFunc(void *opaque, int ret);
+
+typedef struct qemu_iolist {
+    target_phys_addr_t iov_base;
+    target_phys_addr_t iov_len;
+    struct qemu_iolist *next;
+} qemu_iolist;
+
+typedef struct DMADriverAIOCB DMADriverAIOCB;
+
+typedef DMADriverAIOCB *
+DMATranslationHandler(void *opaque, DMADriverAIOCB *request, int is_write);
+
+typedef struct DMACompletionEntry {
+    DMATranslationHandler *func;
+    void *opaque;
+    struct DMACompletionEntry *next;
+} DMACompletionEntry;
+
+struct DMADriverAIOCB {
+    qemu_iolist *iolist;
+    DMACompletionEntry *prehook;
+    DMACompletionEntry *posthook;
+    struct DMADriverAIOCB *next;
+};
+
+typedef struct qemu_bus {
+    unsigned int bus_bits;
+    DMATranslationHandler *north_handler;
+    void *north_handler_opaque;
+    DMATranslationHandler *south_handler;
+    void *south_handler_opaque;
+} qemu_bus;
+
+static inline qemu_bus *
+bus_init(unsigned int bus_bits,
+         DMATranslationHandler north_handler,
+         void *north_handler_opaque,
+         DMATranslationHandler south_handler,
+         void *south_handler_opaque)
+{
+    qemu_bus *bus;
+
+    bus = qemu_mallocz(sizeof(qemu_bus));
+    bus->bus_bits = bus_bits;
+    bus->north_handler = north_handler;
+    bus->north_handler_opaque = north_handler_opaque;
+    bus->south_handler = south_handler;
+    bus->south_handler_opaque = south_handler_opaque;
+    return bus;
+}
+
+/* Direction CPU->bridge->device/memory */
+static inline DMADriverAIOCB *
+bus_translate_south(qemu_bus *bus, DMADriverAIOCB *request, int is_write)
+{
+    return bus->south_handler(bus->south_handler_opaque, request, is_write);
+}
+
+/* From device towards CPU/memory (DMA) */
+static inline DMADriverAIOCB *
+bus_translate_north(qemu_bus *bus, DMADriverAIOCB *request, int is_write)
+{
+    return bus->north_handler(bus->north_handler_opaque, request, is_write);
+}
+
+static inline DMADriverAIOCB *
+bus_build_aiocb(target_phys_addr_t addr, target_phys_addr_t len)
+{
+    DMADriverAIOCB *d;
+
+    d = qemu_mallocz(sizeof(DMADriverAIOCB));
+    d->iolist = qemu_mallocz(sizeof(qemu_iolist));
+    d->iolist->iov_base = addr;
+    d->iolist->iov_len = len;
+    return d;
+}
+
+#if 1 || DEBUG_GDMA
+static inline void
+bus_dump_aiocb(DMADriverAIOCB *d)
+{
+    qemu_iolist *io;
+    DMACompletionEntry *e;
+
+    fprintf(stderr, "DMADriverAIOCB %p\nIO ranges:\n", d);
+    for (io = d->iolist; io != NULL; io = io->next) {
+        fprintf(stderr, "base " TARGET_FMT_plx " len " TARGET_FMT_plx "\n",
+                io->iov_base, io->iov_len);
+    }
+    fprintf(stderr, "Prehooks:\n");
+    for (e = d->prehook; e != NULL; e = e->next) {
+        fprintf(stderr, "func %p opaque %p\n", e->func, e->opaque);
+    }
+    fprintf(stderr, "Posthooks:\n");
+    for (e = d->posthook; e != NULL; e = e->next) {
+        fprintf(stderr, "func %p opaque %p\n", e->func, e->opaque);
+    }
+}
+#endif
+
 /* ISA bus */
 extern target_phys_addr_t isa_mem_base;
@@ -1253,7 +1356,8 @@
 extern QEMUMachine ss5_machine, ss10_machine;
 
 /* iommu.c */
-void *iommu_init(target_phys_addr_t addr);
+void *iommu_init(target_phys_addr_t addr, qemu_bus *parent_bus,
+                 qemu_bus **bus);
 void sparc_iommu_memory_rw(void *opaque, target_phys_addr_t addr,
                            uint8_t *buf, int len, int is_write);
 static inline void sparc_iommu_memory_read(void *opaque,
@@ -1308,11 +1412,13 @@
 /* esp.c */
 void esp_scsi_attach(void *opaque, BlockDriverState *bd, int id);
 void *esp_init(BlockDriverState **bd, target_phys_addr_t espaddr,
-               void *dma_opaque, qemu_irq irq, qemu_irq *reset);
+               void *dma_opaque, qemu_irq irq, qemu_irq *reset,
+               qemu_bus *parent_bus, qemu_bus **bus);
 
 /* sparc32_dma.c */
 void *sparc32_dma_init(target_phys_addr_t daddr, qemu_irq parent_irq,
-                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset);
+                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset,
+                       qemu_bus *parent_bus, qemu_bus **bus);
 void ledma_memory_read(void *opaque, target_phys_addr_t addr,
                        uint8_t *buf, int len, int do_bswap);
 void ledma_memory_write(void *opaque, target_phys_addr_t addr,
@@ -1428,6 +1534,8 @@
    scsi_{read,write}_data.  */
 void scsi_read_data(SCSIDevice *s, uint32_t tag);
 int scsi_write_data(SCSIDevice *s, uint32_t tag);
+void scsi_read_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d);
+int scsi_write_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d);
 void scsi_cancel_io(SCSIDevice *s, uint32_t tag);
 uint8_t *scsi_get_buf(SCSIDevice *s, uint32_t tag);
 
Index: qemu/hw/sun4m.c
===================================================================
--- qemu.orig/hw/sun4m.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/sun4m.c	2007-10-30 19:09:12.000000000 +0000
@@ -306,6 +306,32 @@
     env->halted = 1;
 }
 
+static DMADriverAIOCB *
+physical_memory_bus_translate_north(void *opaque, DMADriverAIOCB *request,
+                                    int is_write)
+{
+    qemu_iolist *io;
+
+    for (io = request->iolist; io != NULL; io = io->next) {
+        if (io->iov_base < phys_ram_size)
+            io->iov_base += (unsigned long)phys_ram_base;
+        else
+            io->iov_len = 0;
+    }
+    fprintf(stderr, "physical\n");
+    bus_dump_aiocb(request);
+    return request;
+}
+
+static DMADriverAIOCB *
+physical_memory_bus_translate_south(void *opaque,
+                                    DMADriverAIOCB *request,
+                                    int is_write)
+{
+    // Does not exist?
+    return request;
+}
+
 static void *sun4m_hw_init(const struct hwdef *hwdef, int RAM_size,
                            DisplayState *ds, const char *cpu_model)
 
@@ -317,6 +343,8 @@
     qemu_irq *cpu_irqs[MAX_CPUS], *slavio_irq, *slavio_cpu_irq,
         *espdma_irq, *ledma_irq;
     qemu_irq *esp_reset, *le_reset;
+    qemu_bus *memory_bus, *iommu_bus, *espdma_bus, *ledma_bus, *esp_bus,
+        *scsi_bus;
 
     /* init CPUs */
     sparc_find_by_name(cpu_model, &def);
@@ -345,7 +373,12 @@
     /* allocate RAM */
     cpu_register_physical_memory(0, RAM_size, 0);
 
-    iommu = iommu_init(hwdef->iommu_base);
+    memory_bus = bus_init(TARGET_PHYS_ADDR_BITS,
+                          physical_memory_bus_translate_north,
+                          NULL,
+                          physical_memory_bus_translate_south,
+                          NULL);
+    iommu = iommu_init(hwdef->iommu_base, memory_bus, &iommu_bus);
     slavio_intctl = slavio_intctl_init(hwdef->intctl_base,
                                        hwdef->intctl_base + 0x10000ULL,
                                        &hwdef->intbit_to_level[0],
@@ -354,11 +387,12 @@
                                        hwdef->clock_irq);
 
     espdma = sparc32_dma_init(hwdef->dma_base, slavio_irq[hwdef->esp_irq],
-                              iommu, &espdma_irq, &esp_reset);
+                              iommu, &espdma_irq, &esp_reset, iommu_bus,
+                              &espdma_bus);
 
     ledma = sparc32_dma_init(hwdef->dma_base + 16ULL,
                              slavio_irq[hwdef->le_irq], iommu, &ledma_irq,
-                             &le_reset);
+                             &le_reset, iommu_bus, &ledma_bus);
 
     if (graphic_depth != 8 && graphic_depth != 24) {
         fprintf(stderr, "qemu: Unsupported depth: %d\n", graphic_depth);
@@ -392,7 +426,7 @@
     fdctrl_init(slavio_irq[hwdef->fd_irq], 0, 1, hwdef->fd_base, fd_table);
 
     main_esp = esp_init(bs_table, hwdef->esp_base, espdma, *espdma_irq,
-                        esp_reset);
+                        esp_reset, espdma_bus, &esp_bus);
 
     for (i = 0; i < MAX_DISKS; i++) {
         if (bs_table[i]) {
 
Index: qemu/hw/iommu.c
===================================================================
--- qemu.orig/hw/iommu.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/iommu.c	2007-10-30 19:40:15.000000000 +0000
@@ -104,6 +104,7 @@
     target_phys_addr_t addr;
     uint32_t regs[IOMMU_NREGS];
     target_phys_addr_t iostart;
+    qemu_bus *bus;
 } IOMMUState;
 
 static uint32_t iommu_mem_readw(void *opaque, target_phys_addr_t addr)
@@ -244,6 +245,64 @@
     s->regs[IOMMU_AFAR] = addr;
 }
 
+static DMADriverAIOCB *
+iommu_bus_translate_north(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    IOMMUState *s = opaque;
+    qemu_iolist *io, *new_io, *next_io;
+    uint32_t flags;
+    int l, first;
+    target_phys_addr_t addr, prev_addr, len, page, phys_addr;
+
+    for (io = request->iolist; io != NULL; io = next_io) {
+        addr = io->iov_base;
+        len = io->iov_len;
+        next_io = io->next;
+        first = 1;
+        while (len > 0) {
+            page = addr & TARGET_PAGE_MASK;
+            l = (page + TARGET_PAGE_SIZE) - addr;
+            if (l > len)
+                l = len;
+            flags = iommu_page_get_flags(s, page);
+            if (!(flags & IOPTE_VALID)) {
+                return NULL;
+            }
+            if (is_write) {
+                if (!(flags & IOPTE_WRITE)) {
+                    return NULL;
+                }
+            }
+            phys_addr = iommu_translate_pa(s, addr, flags);
+            if (!first && addr != (prev_addr + TARGET_PAGE_SIZE)) {
+                new_io = qemu_mallocz(sizeof(qemu_iolist));
+                new_io->iov_base = phys_addr;
+                new_io->iov_len = len;
+                new_io->next = io->next;
+                io->next = new_io;
+            } else {
+                io->iov_base = phys_addr;
+                io->iov_len = len;
+            }
+            prev_addr = page;
+            len -= l;
+            addr += l;
+            first = 0;
+        }
+    }
+    fprintf(stderr, "iommu\n");
+    bus_dump_aiocb(request);
+    bus_translate_north(s->bus, request, is_write);
+    return request;
+}
+
+static DMADriverAIOCB *
+iommu_bus_translate_south(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void sparc_iommu_memory_rw(void *opaque, target_phys_addr_t addr,
                            uint8_t *buf, int len, int is_write)
 {
@@ -311,7 +370,7 @@
     s->regs[IOMMU_CTRL] = IOMMU_VERSION;
 }
 
-void *iommu_init(target_phys_addr_t addr)
+void *iommu_init(target_phys_addr_t addr, qemu_bus *parent_bus, qemu_bus **bus)
 {
     IOMMUState *s;
     int iommu_io_memory;
@@ -321,9 +380,12 @@
         return NULL;
 
     s->addr = addr;
+    s->bus = parent_bus;
 
     iommu_io_memory = cpu_register_io_memory(0, iommu_mem_read,
                                              iommu_mem_write, s);
     cpu_register_physical_memory(addr, IOMMU_NREGS * 4, iommu_io_memory);
+    *bus = bus_init(32, iommu_bus_translate_north, s,
+                    iommu_bus_translate_south, s);
 
     register_savevm("iommu", addr, 2, iommu_save, iommu_load, s);
     qemu_register_reset(iommu_reset, s);
 
Index: qemu/hw/sparc32_dma.c
===================================================================
--- qemu.orig/hw/sparc32_dma.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/sparc32_dma.c	2007-10-30 19:29:06.000000000 +0000
@@ -60,6 +60,7 @@
     qemu_irq irq;
     void *iommu;
     qemu_irq dev_reset;
+    qemu_bus *bus;
 };
 
 /* Note: on sparc, the lance 16 bit bus is swapped */
@@ -128,6 +129,27 @@
     }
 }
 
+static DMADriverAIOCB *
+espdma_bus_translate_north(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    DMAState *s = opaque;
+    qemu_iolist *io;
+
+    for (io = request->iolist; io != NULL; io = io->next)
+        io->iov_base = (target_phys_addr_t)s->dmaregs[1];
+    fprintf(stderr, "espdma\n");
+    bus_dump_aiocb(request);
+    bus_translate_north(s->bus, request, is_write);
+    return request;
+}
+
+static DMADriverAIOCB *
+espdma_bus_translate_south(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void espdma_memory_read(void *opaque, uint8_t *buf, int len)
 {
     DMAState *s = opaque;
@@ -238,7 +260,8 @@
 }
 
 void *sparc32_dma_init(target_phys_addr_t daddr, qemu_irq parent_irq,
-                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset)
+                       void *iommu, qemu_irq **dev_irq, qemu_irq **reset,
+                       qemu_bus *parent_bus, qemu_bus **bus)
 {
     DMAState *s;
     int dma_io_memory;
@@ -258,6 +281,9 @@
     *dev_irq = qemu_allocate_irqs(dma_set_irq, s, 1);
     *reset = &s->dev_reset;
+    s->bus = parent_bus;
+    *bus = bus_init(32, espdma_bus_translate_north, s,
+                    espdma_bus_translate_south, s);
 
     return s;
 }
 
Index: qemu/hw/esp.c
===================================================================
--- qemu.orig/hw/esp.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/esp.c	2007-10-30 19:27:46.000000000 +0000
@@ -74,6 +74,7 @@
     uint8_t *async_buf;
     uint32_t async_len;
     void *dma_opaque;
+    qemu_bus *bus;
 };
 
 #define STAT_DO 0x00
@@ -144,15 +145,25 @@
     datalen = scsi_send_command(s->current_dev, 0, &buf[1], lun);
     s->ti_size = datalen;
     if (datalen != 0) {
+        DMADriverAIOCB *d;
+
         s->rregs[4] = STAT_IN | STAT_TC;
         s->dma_left = 0;
        s->dma_counter = 0;
        if (datalen > 0) {
+            d = bus_build_aiocb(0, datalen);
+            fprintf(stderr, "esp\n");
+            bus_dump_aiocb(d);
+            bus_translate_north(s->bus, d, 1);
            s->rregs[4] |= STAT_DI;
-            scsi_read_data(s->current_dev, 0);
+            scsi_read_data_aio(s->current_dev, 0, d);
        } else {
+            d = bus_build_aiocb(0, -datalen);
+            fprintf(stderr, "esp\n");
+            bus_dump_aiocb(d);
+            bus_translate_north(s->bus, d, 0);
            s->rregs[4] |= STAT_DO;
-            scsi_write_data(s->current_dev, 0);
+            scsi_write_data_aio(s->current_dev, 0, d);
        }
    }
    s->rregs[5] = INTR_BS | INTR_FC;
@@ -330,6 +341,22 @@
     }
 }
 
+static DMADriverAIOCB *
+esp_bus_translate_north(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping?
+    fprintf(stderr, "esp\n");
+    bus_dump_aiocb(request);
+    return request;
+}
+
+static DMADriverAIOCB *
+esp_bus_translate_south(void *opaque, DMADriverAIOCB *request, int is_write)
+{
+    // 1:1 mapping
+    return request;
+}
+
 static void esp_reset(void *opaque)
 {
     ESPState *s = opaque;
@@ -575,7 +602,8 @@
 }
 
 void *esp_init(BlockDriverState **bd, target_phys_addr_t espaddr,
-               void *dma_opaque, qemu_irq irq, qemu_irq *reset)
+               void *dma_opaque, qemu_irq irq, qemu_irq *reset,
+               qemu_bus *parent_bus, qemu_bus **bus)
 {
     ESPState *s;
     int esp_io_memory;
@@ -587,9 +615,11 @@
     s->bd = bd;
     s->irq = irq;
     s->dma_opaque = dma_opaque;
-
+    s->bus = parent_bus;
     esp_io_memory = cpu_register_io_memory(0, esp_mem_read, esp_mem_write, s);
     cpu_register_physical_memory(espaddr, ESP_SIZE, esp_io_memory);
+    *bus = bus_init(32, esp_bus_translate_north, s,
+                    esp_bus_translate_south, s);
 
     esp_reset(s);
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2007-10-29 16:59:37.000000000 +0000
+++ qemu/hw/scsi-disk.c	2007-10-30 18:55:19.000000000 +0000
@@ -199,6 +199,16 @@
     r->sector_count -= n;
 }
 
+void scsi_read_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d)
+{
+    scsi_read_data(s, tag);
+}
+
+int scsi_write_data_aio(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *d)
+{
+    return scsi_write_data(s, tag);
+}
+
 static void scsi_write_complete(void * opaque, int ret)
 {
     SCSIRequest *r = (SCSIRequest *)opaque;
* Re: [Qemu-devel] Faster, generic IO/DMA model with vectored AIO?
From: Blue Swirl @ 2007-10-28 20:55 UTC
To: Paul Brook; Cc: qemu-devel

I made a new patch sketching the system. It doesn't even compile, but
it should give a view of how this would be put to work. On the down
side, new memory needs to be allocated to generate new vectors from
the previous ones, which may kill some of the performance. Also,
supporting DMA to MMIO registers can't be done with pure translation
only.

I'm not too happy with this model anymore; maybe the model I proposed
earlier is better. But that one can't reach zero copy.

[-- Attachment: gdma_aiov.diff --]

Index: qemu/vl.h
===================================================================
--- qemu.orig/vl.h	2007-10-28 13:26:35.000000000 +0000
+++ qemu/vl.h	2007-10-28 20:25:27.000000000 +0000
@@ -746,6 +746,78 @@
 
 #include "hw/irq.h"
 
+/* Generic DMA API */
+
+typedef void DMADriverCompletionFunc(void *opaque, int ret);
+
+struct qemu_iovec {
+    target_phys_addr_t iov_base;
+    size_t iov_len;
+};
+
+typedef struct DMADriverAIOCB DMADriverAIOCB;
+
+typedef DMADriverAIOCB *
+DMATranslationHandler(void *opaque, DMADriverAIOCB *request);
+
+typedef struct DMACompletionEntry {
+    DMATranslationHandler *func;
+    void *opaque;
+    struct DMACompletionEntry *next;
+} DMACompletionEntry;
+
+struct DMADriverAIOCB {
+    unsigned int nent;
+    struct qemu_iovec *vector;
+    DMACompletionEntry *cb;
+    struct DMADriverAIOCB *next;
+};
+
+typedef struct qemu_bus {
+    unsigned int bus_bits;
+    DMATranslationHandler *north_handler;
+    void *north_handler_opaque;
+    DMATranslationHandler *south_handler;
+    void *south_handler_opaque;
+} qemu_bus;
+
+qemu_bus *
+bus_init(unsigned int bus_bits,
+         DMATranslationHandler north_handler,
+         void *north_handler_opaque,
+         DMATranslationHandler south_handler,
+         void *south_handler_opaque);
+
+/* Direction CPU->bridge->device/memory */
+static inline DMADriverAIOCB *
+bus_translate_south(qemu_bus *bus, DMADriverAIOCB *request)
+{
+    return bus->south_handler(bus->south_handler_opaque, request);
+}
+
+/* From device towards CPU/memory (DMA) */
+static inline DMADriverAIOCB *
+bus_translate_north(qemu_bus *bus, DMADriverAIOCB *request)
+{
+    return bus->north_handler(bus->north_handler_opaque, request);
+}
+
+static inline DMADriverAIOCB *
+bus_build_aiocb(const struct qemu_iovec *vector, unsigned int count)
+{
+    DMADriverAIOCB *d;
+
+    d = qemu_mallocz(sizeof(DMADriverAIOCB));
+    d->nent = count;
+    d->vector = qemu_mallocz(count * sizeof(struct qemu_iovec));
+    memcpy(d->vector, vector, count * sizeof(struct qemu_iovec));
+    return d;
+}
+
+DMADriverAIOCB *physical_memory_bus_translate_north(void *opaque,
+                                                    DMADriverAIOCB *request);
+DMADriverAIOCB *physical_memory_bus_translate_south(void *opaque,
+                                                    DMADriverAIOCB *request);
+
 /* ISA bus */
 extern target_phys_addr_t isa_mem_base;
 
Index: qemu/hw/sun4m.c
===================================================================
--- qemu.orig/hw/sun4m.c	2007-10-28 19:12:59.000000000 +0000
+++ qemu/hw/sun4m.c	2007-10-28 20:24:59.000000000 +0000
@@ -306,6 +306,27 @@
     env->halted = 1;
 }
 
+DMADriverAIOCB *physical_memory_bus_translate_north(void *opaque,
+                                                    DMADriverAIOCB *request)
+{
+    unsigned int i;
+
+    for (i = 0; i < request->nent; i++) {
+        if (request->vector[i].iov_base < phys_ram_size)
+            request->vector[i].iov_base += (unsigned long)phys_ram_base;
+        else
+            request->vector[i].iov_len = 0;
+    }
+    return request;
+}
+
+DMADriverAIOCB *physical_memory_bus_translate_south(void *opaque,
+                                                    DMADriverAIOCB *request)
+{
+    // Does not exist?
+    return request;
+}
+
 static void *sun4m_hw_init(const struct hwdef *hwdef, int RAM_size,
                            DisplayState *ds, const char *cpu_model)
 
Index: qemu/hw/iommu.c
===================================================================
--- qemu.orig/hw/iommu.c	2007-10-28 19:12:35.000000000 +0000
+++ qemu/hw/iommu.c	2007-10-28 20:29:23.000000000 +0000
@@ -244,6 +244,24 @@
     s->regs[IOMMU_AFAR] = addr;
 }
 
+DMADriverAIOCB *iommu_bus_translate_north(void *opaque,
+                                          DMADriverAIOCB *request)
+{
+    unsigned int i;
+
+    // alloc a new vector
+    for (i = 0; i < request->nent; i++) {
+        //translate_vector(request->vector[i]);
+    }
+    return request;
+}
+
+DMADriverAIOCB *iommu_bus_translate_south(void *opaque,
+                                          DMADriverAIOCB *request)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void sparc_iommu_memory_rw(void *opaque, target_phys_addr_t addr,
                            uint8_t *buf, int len, int is_write)
 {
 
Index: qemu/hw/sparc32_dma.c
===================================================================
--- qemu.orig/hw/sparc32_dma.c	2007-10-28 19:12:54.000000000 +0000
+++ qemu/hw/sparc32_dma.c	2007-10-28 20:30:45.000000000 +0000
@@ -128,6 +128,25 @@
     }
 }
 
+DMADriverAIOCB *espdma_bus_translate_north(void *opaque,
+                                           DMADriverAIOCB *request)
+{
+    DMAState *s = opaque;
+    unsigned int i;
+
+    for (i = 0; i < request->nent; i++) {
+        request->vector[i].iov_base |= s->dmaregs[1];
+    }
+    return request;
+}
+
+DMADriverAIOCB *espdma_bus_translate_south(void *opaque,
+                                           DMADriverAIOCB *request)
+{
+    // 1:1 mapping
+    return request;
+}
+
 void espdma_memory_read(void *opaque, uint8_t *buf, int len)
 {
     DMAState *s = opaque;
 
Index: qemu/hw/esp.c
===================================================================
--- qemu.orig/hw/esp.c	2007-10-28 18:57:27.000000000 +0000
+++ qemu/hw/esp.c	2007-10-28 20:37:57.000000000 +0000
@@ -144,15 +144,25 @@
     datalen = scsi_send_command(s->current_dev, 0, &buf[1], lun);
     s->ti_size = datalen;
     if (datalen != 0) {
+        DMADriverAIOCB *d;
+        struct qemu_iovec iov;
+
         s->rregs[4] = STAT_IN | STAT_TC;
         s->dma_left = 0;
         s->dma_counter = 0;
+        iov.iov_base = 0;
         if (datalen > 0) {
+            iov.iov_len = datalen;
+            d = bus_build_aiocb(&iov, 1);
+            bus_translate_north(s->bus, d);
             s->rregs[4] |= STAT_DI;
-            scsi_read_data(s->current_dev, 0);
+            scsi_read_data(s->current_dev, 0, d);
         } else {
+            iov.iov_len = -datalen;
+            d = bus_build_aiocb(&iov, 1);
+            bus_translate_north(s->bus, d);
             s->rregs[4] |= STAT_DO;
-            scsi_write_data(s->current_dev, 0);
+            scsi_write_data(s->current_dev, 0, d);
         }
     }
     s->rregs[5] = INTR_BS | INTR_FC;
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2007-10-28 20:30:35.000000000 +0000
+++ qemu/hw/scsi-disk.c	2007-10-28 20:38:49.000000000 +0000
@@ -162,7 +162,7 @@
 }
 
 /* Read more data from scsi device into buffer.  */
-void scsi_read_data(SCSIDevice *s, uint32_t tag)
+void scsi_read_data(SCSIDevice *s, uint32_t tag, DMADriverAIOCB *request)
 {
     SCSIRequest *r;
     uint32_t n;
@@ -191,7 +191,7 @@
 
     n = SCSI_DMA_BUF_SIZE / 512;
     r->buf_len = n * 512;
-    r->aiocb = bdrv_aio_read(s->bdrv, r->sector, r->dma_buf, n,
+    r->aiocb = bdrv_aio_read(s->bdrv, r->sector, request, n,
                              scsi_read_complete, r);
     if (r->aiocb == NULL)
         scsi_command_complete(r, SENSE_HARDWARE_ERROR);