qemu-devel.nongnu.org archive mirror
* [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
@ 2008-09-26 15:59 Anthony Liguori
  2008-09-26 17:59 ` Ryan Harper
  2008-10-07 15:43 ` Gerd Hoffmann
  0 siblings, 2 replies; 10+ messages in thread
From: Anthony Liguori @ 2008-09-26 15:59 UTC (permalink / raw)
  To: qemu-devel

Revision: 5323
          http://svn.sv.gnu.org/viewvc/?view=rev&root=qemu&revision=5323
Author:   aliguori
Date:     2008-09-26 15:59:29 +0000 (Fri, 26 Sep 2008)

Log Message:
-----------
Implement an fd pool to get real AIO with posix-aio

This patch implements a simple fd pool to allow many AIO requests with
posix-aio.  The result is significantly improved performance (identical to that
reported for linux-aio) for both cache=on and cache=off.

The fundamental problem with posix-aio is that it limits itself to one thread
per-file descriptor.  I don't know why this is, but this patch provides a simple
mechanism to work around this (duplicating the file descriptor).

This isn't a great solution, but it seems like a reasonable intermediate step
between posix-aio and a custom thread-pool to replace it.

Ryan Harper will be posting some performance analysis he did comparing posix-aio
with fd pooling against linux-aio.  The size of the posix-aio thread pool and
the fd pool were largely determined by him based on this analysis.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

Modified Paths:
--------------
    trunk/block-raw-posix.c

Modified: trunk/block-raw-posix.c
===================================================================
--- trunk/block-raw-posix.c	2008-09-26 15:52:17 UTC (rev 5322)
+++ trunk/block-raw-posix.c	2008-09-26 15:59:29 UTC (rev 5323)
@@ -84,10 +84,16 @@
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
+/* posix-aio doesn't allow multiple outstanding requests to a single file
+ * descriptor.  we implement a pool of dup()'d file descriptors to work
+ * around this */
+#define RAW_FD_POOL_SIZE	64
+
 typedef struct BDRVRawState {
     int fd;
     int type;
     unsigned int lseek_err_cnt;
+    int fd_pool[RAW_FD_POOL_SIZE];
 #if defined(__linux__)
     /* linux floppy specific */
     int fd_open_flags;
@@ -109,6 +115,7 @@
 {
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
+    int i;
 
     posix_aio_init();
 
@@ -138,6 +145,8 @@
         return ret;
     }
     s->fd = fd;
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++)
+        s->fd_pool[i] = -1;
 #if defined(O_DIRECT)
     s->aligned_buf = NULL;
     if (flags & BDRV_O_DIRECT) {
@@ -436,6 +445,7 @@
 
 typedef struct RawAIOCB {
     BlockDriverAIOCB common;
+    int fd;
     struct aiocb aiocb;
     struct RawAIOCB *next;
     int ret;
@@ -447,6 +457,38 @@
     RawAIOCB *first_aio;
 } PosixAioState;
 
+static int raw_fd_pool_get(BDRVRawState *s)
+{
+    int i;
+
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        /* already in use */
+        if (s->fd_pool[i] != -1)
+            continue;
+
+        /* try to dup file descriptor */
+        s->fd_pool[i] = dup(s->fd);
+        if (s->fd_pool[i] != -1)
+            return s->fd_pool[i];
+    }
+
+    /* we couldn't dup the file descriptor so just use the main one */
+    return s->fd;
+}
+
+static void raw_fd_pool_put(RawAIOCB *acb)
+{
+    BDRVRawState *s = acb->common.bs->opaque;
+    int i;
+
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        if (s->fd_pool[i] == acb->fd) {
+            close(s->fd_pool[i]);
+            s->fd_pool[i] = -1;
+        }
+    }
+}
+
 static void posix_aio_read(void *opaque)
 {
     PosixAioState *s = opaque;
@@ -487,6 +529,7 @@
             if (ret == ECANCELED) {
                 /* remove the request */
                 *pacb = acb->next;
+                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
             } else if (ret != EINPROGRESS) {
                 /* end of aio */
@@ -503,6 +546,7 @@
                 *pacb = acb->next;
                 /* call the callback */
                 acb->common.cb(acb->common.opaque, ret);
+                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
                 break;
             } else {
@@ -577,7 +621,8 @@
     acb = qemu_aio_get(bs, cb, opaque);
     if (!acb)
         return NULL;
-    acb->aiocb.aio_fildes = s->fd;
+    acb->fd = raw_fd_pool_get(s);
+    acb->aiocb.aio_fildes = acb->fd;
     acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
     acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
     acb->aiocb.aio_buf = buf;
@@ -684,6 +729,7 @@
             break;
         } else if (*pacb == acb) {
             *pacb = acb->next;
+            raw_fd_pool_put(acb);
             qemu_aio_release(acb);
             break;
         }
@@ -697,6 +743,18 @@
 }
 #endif /* CONFIG_AIO */
 
+static void raw_close_fd_pool(BDRVRawState *s)
+{
+    int i;
+
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        if (s->fd_pool[i] != -1) {
+            close(s->fd_pool[i]);
+            s->fd_pool[i] = -1;
+        }
+    }
+}
+
 static void raw_close(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
@@ -708,6 +766,7 @@
             qemu_free(s->aligned_buf);
 #endif
     }
+    raw_close_fd_pool(s);
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -898,7 +957,7 @@
 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
-    int fd, open_flags, ret;
+    int fd, open_flags, ret, i;
 
     posix_aio_init();
 
@@ -963,6 +1022,8 @@
         return ret;
     }
     s->fd = fd;
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++)
+        s->fd_pool[i] = -1;
 #if defined(__linux__)
     /* close fd so that we can reopen it as needed */
     if (s->type == FTYPE_FD) {
@@ -975,7 +1036,6 @@
 }
 
 #if defined(__linux__)
-
 /* Note: we do not have a reliable method to detect if the floppy is
    present. The current method is to try to open the floppy at every
    I/O and to keep it opened during a few hundreds of ms. */
@@ -991,6 +1051,7 @@
         (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
         close(s->fd);
         s->fd = -1;
+        raw_close_fd_pool(s);
 #ifdef DEBUG_FLOPPY
         printf("Floppy closed\n");
 #endif
@@ -1091,6 +1152,7 @@
             if (s->fd >= 0) {
                 close(s->fd);
                 s->fd = -1;
+                raw_close_fd_pool(s);
             }
             fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
             if (fd >= 0) {


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-09-26 15:59 [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio Anthony Liguori
@ 2008-09-26 17:59 ` Ryan Harper
  2008-09-26 18:35   ` Anthony Liguori
  2008-10-07 15:43 ` Gerd Hoffmann
  1 sibling, 1 reply; 10+ messages in thread
From: Ryan Harper @ 2008-09-26 17:59 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, kvm

* Anthony Liguori <anthony@codemonkey.ws> [2008-09-26 11:03]:
> Revision: 5323
>           http://svn.sv.gnu.org/viewvc/?view=rev&root=qemu&revision=5323
> Author:   aliguori
> Date:     2008-09-26 15:59:29 +0000 (Fri, 26 Sep 2008)
> 
> Log Message:
> -----------
> Implement an fd pool to get real AIO with posix-aio
> 
> This patch implements a simple fd pool to allow many AIO requests with
> posix-aio.  The result is significantly improved performance (identical to that
> reported for linux-aio) for both cache=on and cache=off.
> 
> The fundamental problem with posix-aio is that it limits itself to one thread
> per-file descriptor.  I don't know why this is, but this patch provides a simple
> mechanism to work around this (duplicating the file descriptor).
> 
> This isn't a great solution, but it seems like a reasonable intermediate step
> between posix-aio and a custom thread-pool to replace it.
> 
> Ryan Harper will be posting some performance analysis he did comparing posix-aio
> with fd pooling against linux-aio.  The size of the posix-aio thread pool and
> the fd pool were largely determined by him based on this analysis.

I'll have some more data to post in a bit, but for now, with the fd
pool bumped up to 64 and aio initialized to support a thread per fd, we
mostly match linux-aio performance with a simpler implementation.  For
random writes, fd_pool lags a bit, but I've got other data showing that
in most scenarios fd_pool matches linux-aio performance and does so with
less CPU consumption.
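
For reference, the thread pool inside glibc's posix-aio can be sized
through aio_init(); a minimal sketch of pairing it with a 64-entry fd
pool might look like the code below (the values and the function name
posix_aio_setup are illustrative assumptions, not the exact hunk from
this revision):

/* Sketch only: size glibc's posix-aio thread pool to match a 64-entry
 * fd pool.  Values and function name are illustrative. */
#define _GNU_SOURCE
#include <aio.h>
#include <string.h>

static void posix_aio_setup(void)
{
    struct aioinit ai;

    memset(&ai, 0, sizeof(ai));
    ai.aio_threads = 64;  /* worker threads, roughly one per pooled fd */
    ai.aio_num     = 64;  /* expected number of simultaneous requests */
    aio_init(&ai);
}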

Results:

16k randwrite 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat (ms)
-----------------------------------+------+------------------+------------------
baremetal (O_DIRECT, aka cache=off)| 61.2 |   13.07          |  19.59
kvm: cache=off posix-aio w/o patch |  4.7 | 3467.44          | 254.08
kvm: cache=off linux-aio           | 61.1 |   75.35          |  19.57
kvm: cache=on  posix-aio w/o patch |127.0 |  115.78          |   9.19
kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35          |   9.30
------------ new results ----------+------+------------------+------------------
kvm:cache=off posix-aio fd_pool[16]| 33.5 |   14.28          |  49.19
kvm:cache=off posix-aio fd_pool[64]| 51.1 |   14.86          |  23.66


16k write 1 thread, 74 iodepth     | MB/s | avg sub lat (us) | avg comp lat (ms)
-----------------------------------+------+------------------+------------------
baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90          |   9.45
kvm: cache=off posix-aio w/o patch |  5.1 | 3152.00          | 231.06 
kvm: cache=off linux-aio           |130.0 |   83.83          |   8.99
kvm: cache=on  posix-aio w/o patch |184.0 |   80.46          |   6.35
kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90          |   7.09
------------ new results ----------+------+------------------+------------------
kvm:cache=off posix-aio fd_pool[16]| 78.2 |   58.24          |  15.43
kvm:cache=off posix-aio fd_pool[64]|129.0 |   71.62          |   9.11


-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-09-26 17:59 ` Ryan Harper
@ 2008-09-26 18:35   ` Anthony Liguori
  2008-09-26 18:50     ` Ryan Harper
  0 siblings, 1 reply; 10+ messages in thread
From: Anthony Liguori @ 2008-09-26 18:35 UTC (permalink / raw)
  To: Ryan Harper; +Cc: qemu-devel, kvm

Ryan Harper wrote:
> * Anthony Liguori <anthony@codemonkey.ws> [2008-09-26 11:03]:
>   
>> Revision: 5323
>>           http://svn.sv.gnu.org/viewvc/?view=rev&root=qemu&revision=5323
>> Author:   aliguori
>> Date:     2008-09-26 15:59:29 +0000 (Fri, 26 Sep 2008)
>>
>> Log Message:
>> -----------
>> Implement an fd pool to get real AIO with posix-aio
>>
>> This patch implements a simple fd pool to allow many AIO requests with
>> posix-aio.  The result is significantly improved performance (identical to that
>> reported for linux-aio) for both cache=on and cache=off.
>>
>> The fundamental problem with posix-aio is that it limits itself to one thread
>> per-file descriptor.  I don't know why this is, but this patch provides a simple
>> mechanism to work around this (duplicating the file descriptor).
>>
>> This isn't a great solution, but it seems like a reasonable intermediate step
>> between posix-aio and a custom thread-pool to replace it.
>>
>> Ryan Harper will be posting some performance analysis he did comparing posix-aio
>> with fd pooling against linux-aio.  The size of the posix-aio thread pool and
>> the fd pool were largely determined by him based on this analysis.
>>     
>
> I'll have some more data to post in a bit, but for now, bumping the fd
> pool up to 64 and ensuring we init aio to support a thread per fd, we
> mostly match linux aio performance with a simpler implementation.  For
> randomwrites, fd_pool lags a bit, but I've got other data that shows in
> most scenarios, fd_pool matches linux aio performance and does so with
> less CPU consumption.
>
> Results:
>
> 16k randwrite 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat (ms)
> -----------------------------------+------+------------------+------------------
> baremetal (O_DIRECT, aka cache=off)| 61.2 |   13.07          |  19.59
> kvm: cache=off posix-aio w/o patch |  4.7 | 3467.44          | 254.08
>   

So with posix-aio, once we have many requests, each request is going to 
block until the request completes.  I don't fully understand why the 
average completion latency is so high because in theory, there should be 
no delay between completion and submission.  Maybe it has to do with the 
fact that we spend so much time blocking during submission, that the 
io-thread doesn't get a chance to run.  I bet if we dropped the 
qemu_mutex during submission, the completion latency would drop to a 
very small number.  Not worth actually testing.

> kvm: cache=off linux-aio           | 61.1 |   75.35          |  19.57
>   

The fact that the submission latency is so high confirms what I've been 
saying about linux-aio submissions being very suboptimal.  That is really 
quite high.

> kvm: cache=on  posix-aio w/o patch |127.0 |  115.78          |   9.19
> kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35          |   9.30
>   

It looks like 127 MB/s is pretty close to the optimal cached write rate.  
When using caching, writes can complete almost immediately so it's not 
surprising that submission latency is so low (even though it's blocking 
during submission).

I am surprised that w/patch has a latency that's so high.  I think that 
suggests that requests are queuing up.  I bet increasing the aio_num 
field would reduce this number.

> ------------ new results ----------+------+------------------+------------------
> kvm:cache=off posix-aio fd_pool[16]| 33.5 |   14.28          |  49.19
> kvm:cache=off posix-aio fd_pool[64]| 51.1 |   14.86          |  23.66
>   

I assume you tried to bump from 64 to something higher and couldn't make 
up the lost bandwidth?

> 16k write 1 thread, 74 iodepth     | MB/s | avg sub lat (us) | avg comp lat (ms)
> -----------------------------------+------+------------------+------------------
> baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90          |   9.45
> kvm: cache=off posix-aio w/o patch |  5.1 | 3152.00          | 231.06 
> kvm: cache=off linux-aio           |130.0 |   83.83          |   8.99
> kvm: cache=on  posix-aio w/o patch |184.0 |   80.46          |   6.35
> kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90          |   7.09
> ------------ new results ----------+------+------------------+------------------
> kvm:cache=off posix-aio fd_pool[16]| 78.2 |   58.24          |  15.43
> kvm:cache=off posix-aio fd_pool[64]|129.0 |   71.62          |   9.11
>   

That's a nice result.  We could probably improve the latency by tweaking 
the queue sizes.

Very nice work!  Thanks for doing the thorough analysis.

Regards,

Anthony Liguori
>
>   


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-09-26 18:35   ` Anthony Liguori
@ 2008-09-26 18:50     ` Ryan Harper
  0 siblings, 0 replies; 10+ messages in thread
From: Ryan Harper @ 2008-09-26 18:50 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Ryan Harper, qemu-devel, kvm

* Anthony Liguori <aliguori@us.ibm.com> [2008-09-26 13:37]:
> Ryan Harper wrote:
> >* Anthony Liguori <anthony@codemonkey.ws> [2008-09-26 11:03]:

> >kvm: cache=on  posix-aio w/o patch |127.0 |  115.78          |   9.19
> >kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35          |   9.30
> >  
> 
> It looks like 127mb/s is pretty close to the optimal cached write time.  
> When using caching, writes can complete almost immediately so it's not 
> surprising that submission latency is so low (even though it's blocking 
> during submission).
> 
> I am surprised that w/patch has a latency that's so high.  I think that 
> suggests that requests are queuing up.  I bet increasing the aio_num 
> field would reduce this number.

Yeah, there is plenty of room to twiddle with the threads and number of
outstanding ios, but that'll take quite a bit of time to generate the
data and compare.

> >------------ new results ----------+------+------------------+------------------
> >kvm:cache=off posix-aio fd_pool[16]| 33.5 |   14.28          |  49.19
> >kvm:cache=off posix-aio fd_pool[64]| 51.1 |   14.86          |  23.66
> >  
> 
> I assume you tried to bump from 64 to something higher and couldn't make 
> up the lost bandwidth?

Very slightly, switching to 128 threads/fds gave another 1MB/s. 

> >16k write 1 thread, 74 iodepth     | MB/s | avg sub lat (us) | avg comp lat (ms)
> >-----------------------------------+------+------------------+------------------
> >baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90          |   9.45
> >kvm: cache=off posix-aio w/o patch |  5.1 | 3152.00          | 231.06 
> >kvm: cache=off linux-aio           |130.0 |   83.83          |   8.99
> >kvm: cache=on  posix-aio w/o patch |184.0 |   80.46          |   6.35
> >kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90          |   7.09
> >------------ new results ----------+------+------------------+------------------
> >kvm:cache=off posix-aio fd_pool[16]| 78.2 |   58.24          |  15.43
> >kvm:cache=off posix-aio fd_pool[64]|129.0 |   71.62          |   9.11
> >  
> 
> That's a nice result.  We could probably improve the latency by tweaking 
> the queue sizes.

Yeah, I was quite pleased to see a simpler solution perform so well.
> 
> Very nice work!  Thanks for doing the thorough analysis.

Thanks, very happy to see a significant improvement in IO here.

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-09-26 15:59 [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio Anthony Liguori
  2008-09-26 17:59 ` Ryan Harper
@ 2008-10-07 15:43 ` Gerd Hoffmann
  2008-10-07 16:01   ` Anthony Liguori
  1 sibling, 1 reply; 10+ messages in thread
From: Gerd Hoffmann @ 2008-10-07 15:43 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori wrote:

> The fundamental problem with posix-aio is that it limits itself to one thread
> per-file descriptor.  I don't know why this is, but this patch provides a simple
> mechanism to work around this (duplicating the file descriptor).
> 
> This isn't a great solution, but it seems like a reasonable intermediate step
> between posix-aio and a custom thread-pool to replace it.

Are there plans to support vectored block requests with the thread pool
implementation?

cheers,
  Gerd


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-10-07 15:43 ` Gerd Hoffmann
@ 2008-10-07 16:01   ` Anthony Liguori
  2008-10-07 20:40     ` Gerd Hoffmann
  0 siblings, 1 reply; 10+ messages in thread
From: Anthony Liguori @ 2008-10-07 16:01 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann wrote:
> Anthony Liguori wrote:
>
>   
>> The fundamental problem with posix-aio is that it limits itself to one thread
>> per-file descriptor.  I don't know why this is, but this patch provides a simple
>> mechanism to work around this (duplicating the file descriptor).
>>
>> This isn't a great solution, but it seems like a reasonable intermediate step
>> between posix-aio and a custom thread-pool to replace it.
>>     
>
> Are there plans to support vectored block requests with the thread pool
> implementation?
>   

Yes, that's the primary reason for doing a new thread pool 
implementation.  Of course, we need a zero-copy DMA API before such a 
thing would make sense.
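
As a rough illustration of the idea (not the planned qemu code), a
thread-pool worker could submit a whole scatter/gather list with a
single vectored syscall; the sketch below assumes preadv()/pwritev()
are available on the host and invents the vector_req structure:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

struct vector_req {
    int fd;
    off_t offset;
    struct iovec *iov;
    int iovcnt;
    int is_write;
};

/* Run by a worker thread: one syscall per scatter/gather request instead
 * of one aio request (or one copy) per iovec element. */
static ssize_t handle_vector_req(struct vector_req *req)
{
    if (req->is_write)
        return pwritev(req->fd, req->iov, req->iovcnt, req->offset);
    return preadv(req->fd, req->iov, req->iovcnt, req->offset);
}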

Regards,

Anthony Liguori

> cheers,
>   Gerd
>
>
>   


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-10-07 16:01   ` Anthony Liguori
@ 2008-10-07 20:40     ` Gerd Hoffmann
  2008-10-07 21:21       ` Anthony Liguori
  0 siblings, 1 reply; 10+ messages in thread
From: Gerd Hoffmann @ 2008-10-07 20:40 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori wrote:
>> Are there plans to support vectored block requests with the thread pool
>> implementation?
>>   
> 
> Yes, that's the primary reason for doing a new thread pool
> implementation.

Cool.

> Of course, we need a zero-copy DMA API before such a
> thing would make sense.

Hmm, a quick check of the IDE code indeed shows a copy happening there.
Why is it needed?  My xen disk backend doesn't copy data.  Does that
mean I might have done something wrong?  Does the virtio backend copy
data too?

Disclaimer: I don't know much about ide dma ...

cheers,
  Gerd


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-10-07 20:40     ` Gerd Hoffmann
@ 2008-10-07 21:21       ` Anthony Liguori
  2008-10-07 22:15         ` Gerd Hoffmann
  0 siblings, 1 reply; 10+ messages in thread
From: Anthony Liguori @ 2008-10-07 21:21 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann wrote:
> Anthony Liguori wrote:
>   
>>> Are there plans to support vectored block requests with the thread pool
>>> implementation?
>>>   
>>>       
>> Yes, that's the primary reason for doing a new thread pool
>> implementation.
>>     
>
> Cool.
>
>   
>> Of course, we need a zero-copy DMA API before such a
>> thing would make sense.
>>     
>
> Hmm, quick check of the IDE code indeed shows a copy happening there.
> Why is it needed?  My xen disk backend doesn't copy data.  Does that
> mean might have done something wrong?  Does the virtio backend copy data
> too?
>   

It does now, because the cost of splitting up the AIO request for each 
element of the scatter/gather list was considerably higher than the cost 
of copying the data to a linear buffer.
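
For concreteness, the copy in question amounts to linearizing the
scatter/gather list into one bounce buffer before issuing a single aio
request; a minimal sketch (invented helper name, not the actual virtio
code) is:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Copy a scatter/gather list into one linear bounce buffer so it can be
 * submitted as a single aio request.  The caller frees the buffer. */
static void *linearize_iov(const struct iovec *iov, int iovcnt, size_t *total)
{
    size_t len = 0, off = 0;
    uint8_t *buf;
    int i;

    for (i = 0; i < iovcnt; i++)
        len += iov[i].iov_len;
    buf = malloc(len);
    if (!buf)
        return NULL;
    for (i = 0; i < iovcnt; i++) {
        memcpy(buf + off, iov[i].iov_base, iov[i].iov_len);
        off += iov[i].iov_len;
    }
    *total = len;
    return buf;
}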

Look back in qemu-devel for a thread about a DMA API.

You can only avoid doing a copy if you do something like phys_ram_base + 
PA.  From an architectural perspective, this is not ideal since it 
doesn't allow for things like IOMMU emulation.  What we need is a 
zero-copy API at the PCI level.
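
To make that concrete, a PCI-level zero-copy interface would let a
device ask the bus layer to map a guest bus address range, leaving the
PCI/IOMMU code free to hand back either a direct pointer or a bounce
buffer.  The sketch below is purely hypothetical; every name in it is
invented for illustration:

#include <stddef.h>
#include <stdint.h>

typedef struct PCIDevice PCIDevice;   /* placeholder for qemu's PCI device type */
typedef uint64_t pci_bus_addr_t;

struct pci_dma_mapping {
    void   *host_ptr;   /* directly mapped guest memory, or a bounce buffer */
    size_t  len;        /* length actually mapped (may be shorter than asked) */
    int     is_write;
};

/* Hypothetical API: map a bus address range for DMA; unmap flushes and
 * releases any bounce buffer the layer had to allocate. */
int  pci_dma_map(PCIDevice *dev, pci_bus_addr_t addr, size_t len,
                 int is_write, struct pci_dma_mapping *map);
void pci_dma_unmap(PCIDevice *dev, struct pci_dma_mapping *map);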

Regards,

Anthony Liguori

> Disclaimer: I don't know much about ide dma ...
>
> cheers,
>   Gerd
>
>
>   


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-10-07 21:21       ` Anthony Liguori
@ 2008-10-07 22:15         ` Gerd Hoffmann
  2008-10-07 22:39           ` Anthony Liguori
  0 siblings, 1 reply; 10+ messages in thread
From: Gerd Hoffmann @ 2008-10-07 22:15 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori wrote:
> Gerd Hoffmann wrote:
>> Anthony Liguori wrote:
>>  
>>>> Are there plans to support vectored block requests with the thread pool
>>>> implementation?
>>>>         
>>> Yes, that's the primary reason for doing a new thread pool
>>> implementation.
>>>     
>>
>> Cool.
>>
>>  
>>> Of course, we need a zero-copy DMA API before such a
>>> thing would make sense.
>>>     
>>
>> Hmm, quick check of the IDE code indeed shows a copy happening there.
>> Why is it needed?  My xen disk backend doesn't copy data.  Does that
>> mean might have done something wrong?  Does the virtio backend copy data
>> too?
> 
> It does now, because the cost of splitting up the AIO request for each
> element of the scatter/gather list was considerably higher than the cost
> of copying the data to a linear buffer.

Ok, so virtio will likely stop doing that soon I guess?  With the aio
thread pool and even more with a vectored aio api the need for that
should go away ...

> You can only avoid doing a copy if you do something like phys_ram_base +
> PA.

I actually do this ...

void *xenner_mfn_to_ptr(xen_pfn_t pfn)
{
    ram_addr_t offset;

    offset = cpu_get_physical_page_desc(pfn << PAGE_SHIFT);
    return (void*)phys_ram_base + offset;
}

... which should keep working even if there are holes in the guest
PA address space and thus guest_pa != phys_ram_base offset.

> From an architectural perspective, this is not ideal since it
> doesn't allow for things like IOMMU emulation.  What we need, is a
> zero-copy API at the PCI level.

Sure, for IDE+SCSI emulation.  For paravirtual drivers it shouldn't be
an issue.

cheers,
  Gerd


* Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio
  2008-10-07 22:15         ` Gerd Hoffmann
@ 2008-10-07 22:39           ` Anthony Liguori
  0 siblings, 0 replies; 10+ messages in thread
From: Anthony Liguori @ 2008-10-07 22:39 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann wrote:
> Anthony Liguori wrote:
>   
>> Gerd Hoffmann wrote:
>>     
>>> Anthony Liguori wrote:
>>>  
>>>       
>>>>> Are there plans to support vectored block requests with the thread pool
>>>>> implementation?
>>>>>         
>>>>>           
>>>> Yes, that's the primary reason for doing a new thread pool
>>>> implementation.
>>>>     
>>>>         
>>> Cool.
>>>
>>>  
>>>       
>>>> Of course, we need a zero-copy DMA API before such a
>>>> thing would make sense.
>>>>     
>>>>         
>>> Hmm, quick check of the IDE code indeed shows a copy happening there.
>>> Why is it needed?  My xen disk backend doesn't copy data.  Does that
>>> mean might have done something wrong?  Does the virtio backend copy data
>>> too?
>>>       
>> It does now, because the cost of splitting up the AIO request for each
>> element of the scatter/gather list was considerably higher than the cost
>> of copying the data to a linear buffer.
>>     
>
> Ok, so virtio will likely stop doing that soon I guess?  With the aio
> thread pool and even more with a vectored aio api the need for that
> should go away ...
>   

Right now, it's not a super high priority for us since we're getting 
good performance without it.  More important is to improve emulated SCSI 
performance, get a proper zero copy API in, get virtio merged upstream, 
and then start tackling this stuff.

>> You can only avoid doing a copy if you do something like phys_ram_base +
>> PA.
>>     
>
> I actually do this ...
>
> void *xenner_mfn_to_ptr(xen_pfn_t pfn)
> {
>     ram_addr_t offset;
>
>     offset = cpu_get_physical_page_desc(pfn << PAGE_SHIFT);
>     return (void*)phys_ram_base + offset;
> }
>
> ... which should keep working even in case there are holes in the guest
> PA address space and thus guest_pa != phys_ram_base offset.
>   

Yes, this is what we do in virtio too ATM.

>> From an architectural perspective, this is not ideal since it
>> doesn't allow for things like IOMMU emulation.  What we need, is a
>> zero-copy API at the PCI level.
>>     
>
> Sure, for IDE+SCSI emulation.  For paravirtual drivers it shouldn't be
> an issue.
>   

For Xen, I can maybe see the argument that it shouldn't go through a DMA 
API since it's completely divorced from how normal hardware works.  Since 
a virtio device is really just a PCI device, I don't think that argument 
applies.

There are good reasons to put in the effort to get a proper DMA API to 
improve SCSI and the e1000's performance.

Regards,

Anthony Liguori

> cheers,
>   Gerd
>
>
>
>
>   
