* [Qemu-devel] [PATCH 1/5] dma-helpers: rename is_write to to_dev
2011-09-07 15:20 [Qemu-devel] [PATCH 0/5] block: preparatory patches for scatter/gather support Paolo Bonzini
@ 2011-09-07 15:20 ` Paolo Bonzini
2011-09-09 11:31 ` Kevin Wolf
2011-09-07 15:20 ` [Qemu-devel] [PATCH 2/5] dma-helpers: allow including from target-independent code Paolo Bonzini
` (3 subsequent siblings)
4 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-07 15:20 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
dma-helpers.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/dma-helpers.c b/dma-helpers.c
index 4610ea0..ca97e14 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -42,7 +42,7 @@ typedef struct {
BlockDriverAIOCB *acb;
QEMUSGList *sg;
uint64_t sector_num;
- int is_write;
+ int to_dev;
int sg_cur_index;
target_phys_addr_t sg_cur_byte;
QEMUIOVector iov;
@@ -75,7 +75,7 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
for (i = 0; i < dbs->iov.niov; ++i) {
cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base,
- dbs->iov.iov[i].iov_len, !dbs->is_write,
+ dbs->iov.iov[i].iov_len, !dbs->to_dev,
dbs->iov.iov[i].iov_len);
}
}
@@ -101,7 +101,7 @@ static void dma_bdrv_cb(void *opaque, int ret)
while (dbs->sg_cur_index < dbs->sg->nsg) {
cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
- mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
+ mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->to_dev);
if (!mem)
break;
qemu_iovec_add(&dbs->iov, mem, cur_len);
@@ -143,7 +143,7 @@ static AIOPool dma_aio_pool = {
BlockDriverAIOCB *dma_bdrv_io(
BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
- void *opaque, int is_write)
+ void *opaque, int to_dev)
{
DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
@@ -153,7 +153,7 @@ BlockDriverAIOCB *dma_bdrv_io(
dbs->sector_num = sector_num;
dbs->sg_cur_index = 0;
dbs->sg_cur_byte = 0;
- dbs->is_write = is_write;
+ dbs->to_dev = to_dev;
dbs->io_func = io_func;
dbs->bh = NULL;
qemu_iovec_init(&dbs->iov, sg->nsg);
--
1.7.6
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 1/5] dma-helpers: rename is_write to to_dev
2011-09-07 15:20 ` [Qemu-devel] [PATCH 1/5] dma-helpers: rename is_write to to_dev Paolo Bonzini
@ 2011-09-09 11:31 ` Kevin Wolf
0 siblings, 0 replies; 15+ messages in thread
From: Kevin Wolf @ 2011-09-09 11:31 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Am 07.09.2011 17:20, schrieb Paolo Bonzini:
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> dma-helpers.c | 10 +++++-----
> 1 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/dma-helpers.c b/dma-helpers.c
> index 4610ea0..ca97e14 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -42,7 +42,7 @@ typedef struct {
> BlockDriverAIOCB *acb;
> QEMUSGList *sg;
> uint64_t sector_num;
> - int is_write;
> + int to_dev;
> int sg_cur_index;
> target_phys_addr_t sg_cur_byte;
> QEMUIOVector iov;
Should we take the opportunity to make it a bool? I think it would make
the purpose easier to understand.
Kevin
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Qemu-devel] [PATCH 2/5] dma-helpers: allow including from target-independent code
2011-09-07 15:20 [Qemu-devel] [PATCH 0/5] block: preparatory patches for scatter/gather support Paolo Bonzini
2011-09-07 15:20 ` [Qemu-devel] [PATCH 1/5] dma-helpers: rename is_write to to_dev Paolo Bonzini
@ 2011-09-07 15:20 ` Paolo Bonzini
2011-09-09 11:39 ` Kevin Wolf
2011-09-07 15:21 ` [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation Paolo Bonzini
` (2 subsequent siblings)
4 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-07 15:20 UTC (permalink / raw)
To: qemu-devel
Target-independent code cannot construct sglists, but it can take
them from the outside as a black box. Allow this.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
dma.h | 8 ++++++--
qemu-common.h | 1 +
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/dma.h b/dma.h
index a6db5ba..f7e0142 100644
--- a/dma.h
+++ b/dma.h
@@ -15,10 +15,13 @@
#include "hw/hw.h"
#include "block.h"
-typedef struct {
+typedef struct ScatterGatherEntry ScatterGatherEntry;
+
+#if defined(TARGET_PHYS_ADDR_BITS)
+struct ScatterGatherEntry {
target_phys_addr_t base;
target_phys_addr_t len;
-} ScatterGatherEntry;
+};
struct QEMUSGList {
ScatterGatherEntry *sg;
@@ -31,6 +34,7 @@ void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint);
void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base,
target_phys_addr_t len);
void qemu_sglist_destroy(QEMUSGList *qsg);
+#endif
typedef BlockDriverAIOCB *DMAIOFunc(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
diff --git a/qemu-common.h b/qemu-common.h
index 404c421..ef9a2bb 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -18,6 +18,7 @@ typedef struct DeviceState DeviceState;
struct Monitor;
typedef struct Monitor Monitor;
+typedef struct QEMUSGList QEMUSGList;
/* we put basic includes here to avoid repeating them in device drivers */
#include <stdlib.h>
--
1.7.6
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 2/5] dma-helpers: allow including from target-independent code
2011-09-07 15:20 ` [Qemu-devel] [PATCH 2/5] dma-helpers: allow including from target-independent code Paolo Bonzini
@ 2011-09-09 11:39 ` Kevin Wolf
2011-09-09 11:53 ` Paolo Bonzini
0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2011-09-09 11:39 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Am 07.09.2011 17:20, schrieb Paolo Bonzini:
> Target-independent code cannot construct sglists, but it can take
> them from the outside as a black box. Allow this.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
I was hoping to find the use case for this somewhere in the rest of the
series, but there doesn't seem to be an example.
How can you make use of the dma.h functions when you can't build a
QEMUSGList? Or would you have a target-dependent part that builds it and
passes it to a target-independent part?
Kevin
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 2/5] dma-helpers: allow including from target-independent code
2011-09-09 11:39 ` Kevin Wolf
@ 2011-09-09 11:53 ` Paolo Bonzini
0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-09 11:53 UTC (permalink / raw)
To: Kevin Wolf; +Cc: qemu-devel
On 09/09/2011 01:39 PM, Kevin Wolf wrote:
> Am 07.09.2011 17:20, schrieb Paolo Bonzini:
>> Target-independent code cannot construct sglists, but it can take
>> them from the outside as a black box. Allow this.
>>
>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
>
> I was hoping to find the use case for this somewhere in the rest of the
> series, but there doesn't seem to be an example.
Indeed, unfortunately not yet.
> How can you make use of the dma.h functions when you can't build a
> QEMUSGList? Or would you have a target-dependent part that builds it and
> passes it to a target-independent part?
Exactly, scsi-disk/scsi-bus will pick the QEMUSGList from the HBA and
treat it opaquely.
Paolo
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-07 15:20 [Qemu-devel] [PATCH 0/5] block: preparatory patches for scatter/gather support Paolo Bonzini
2011-09-07 15:20 ` [Qemu-devel] [PATCH 1/5] dma-helpers: rename is_write to to_dev Paolo Bonzini
2011-09-07 15:20 ` [Qemu-devel] [PATCH 2/5] dma-helpers: allow including from target-independent code Paolo Bonzini
@ 2011-09-07 15:21 ` Paolo Bonzini
2011-09-09 12:14 ` Kevin Wolf
2011-09-07 15:21 ` [Qemu-devel] [PATCH 4/5] scsi-disk: commonize iovec creation between reads and writes Paolo Bonzini
2011-09-07 15:21 ` [Qemu-devel] [PATCH 5/5] scsi-disk: lazily allocate bounce buffer Paolo Bonzini
4 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-07 15:21 UTC (permalink / raw)
To: qemu-devel
This fixes various problems with completion/cancellation:
* If DMA encounters a bounce buffer conflict, and the DMA operation is
canceled before the bottom half fires, bad things happen.
* memory is not unmapped after cancellation, again causing problems
when doing DMA to I/O areas
* cancellation could leak the iovec
and probably more that I've missed. The patch fixes them by sharing
the cleanup code between completion and cancellation: a new dma_complete
helper unmaps the memory, invokes the callback if one is still set,
destroys the iovec, deletes any pending bottom half, and releases the
AIOCB, and it is called on both the completion and cancellation paths.
Most of these are basically impossible in practice, but it is better
to be tidy...
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
dma-helpers.c | 37 ++++++++++++++++++++++++-------------
1 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/dma-helpers.c b/dma-helpers.c
index ca97e14..cc8c4e3 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -58,7 +58,7 @@ static void reschedule_dma(void *opaque)
qemu_bh_delete(dbs->bh);
dbs->bh = NULL;
- dma_bdrv_cb(opaque, 0);
+ dma_bdrv_cb(dbs, 0);
}
static void continue_after_map_failure(void *opaque)
@@ -78,6 +78,21 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
dbs->iov.iov[i].iov_len, !dbs->to_dev,
dbs->iov.iov[i].iov_len);
}
+ qemu_iovec_reset(&dbs->iov);
+}
+
+static void dma_complete(DMAAIOCB *dbs, int ret)
+{
+ dma_bdrv_unmap(dbs);
+ if (dbs->common.cb) {
+ dbs->common.cb(dbs->common.opaque, ret);
+ }
+ qemu_iovec_destroy(&dbs->iov);
+ if (dbs->bh) {
+ qemu_bh_delete(dbs->bh);
+ dbs->bh = NULL;
+ }
+ qemu_aio_release(dbs);
}
static void dma_bdrv_cb(void *opaque, int ret)
@@ -89,12 +104,9 @@ static void dma_bdrv_cb(void *opaque, int ret)
dbs->acb = NULL;
dbs->sector_num += dbs->iov.size / 512;
dma_bdrv_unmap(dbs);
- qemu_iovec_reset(&dbs->iov);
if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
- dbs->common.cb(dbs->common.opaque, ret);
- qemu_iovec_destroy(&dbs->iov);
- qemu_aio_release(dbs);
+ dma_complete(dbs, ret);
return;
}
@@ -120,9 +132,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
dbs->iov.size / 512, dma_bdrv_cb, dbs);
if (!dbs->acb) {
- dma_bdrv_unmap(dbs);
- qemu_iovec_destroy(&dbs->iov);
- return;
+ dbs->common.cb = NULL;
+ dma_complete(dbs, -ENOMEM);
}
}
@@ -131,8 +142,12 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb)
DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
if (dbs->acb) {
- bdrv_aio_cancel(dbs->acb);
+ BlockDriverAIOCB *acb = dbs->acb;
+ dbs->acb = NULL;
+ bdrv_aio_cancel(acb);
}
+ dbs->common.cb = NULL;
+ dma_complete(dbs, 0);
}
static AIOPool dma_aio_pool = {
@@ -158,10 +173,6 @@ BlockDriverAIOCB *dma_bdrv_io(
dbs->bh = NULL;
qemu_iovec_init(&dbs->iov, sg->nsg);
dma_bdrv_cb(dbs, 0);
- if (!dbs->acb) {
- qemu_aio_release(dbs);
- return NULL;
- }
return &dbs->common;
}
--
1.7.6
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-07 15:21 ` [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation Paolo Bonzini
@ 2011-09-09 12:14 ` Kevin Wolf
2011-09-09 12:43 ` Paolo Bonzini
0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2011-09-09 12:14 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Am 07.09.2011 17:21, schrieb Paolo Bonzini:
> This fixes various problems with completion/cancellation:
>
> * If DMA encounters a bounce buffer conflict, and the DMA operation is
> canceled before the bottom half fires, bad things happen.
>
> * memory is not unmapped after cancellation, again causing problems
> when doing DMA to I/O areas
>
> * cancellation could leak the iovec
>
> and probably more that I've missed. The patch fixes them by sharing
> the cleanup code between completion and cancellation. The dma_bdrv_cb
> now returns a boolean completed/not completed flag, and the wrapper
> dma_continue takes care of tasks to do upon completion.
>
> Most of these are basically impossible in practice, but it is better
> to be tidy...
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> dma-helpers.c | 37 ++++++++++++++++++++++++-------------
> 1 files changed, 24 insertions(+), 13 deletions(-)
>
> diff --git a/dma-helpers.c b/dma-helpers.c
> index ca97e14..cc8c4e3 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -58,7 +58,7 @@ static void reschedule_dma(void *opaque)
>
> qemu_bh_delete(dbs->bh);
> dbs->bh = NULL;
> - dma_bdrv_cb(opaque, 0);
> + dma_bdrv_cb(dbs, 0);
> }
>
> static void continue_after_map_failure(void *opaque)
> @@ -78,6 +78,21 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
> dbs->iov.iov[i].iov_len, !dbs->to_dev,
> dbs->iov.iov[i].iov_len);
> }
> + qemu_iovec_reset(&dbs->iov);
> +}
> +
> +static void dma_complete(DMAAIOCB *dbs, int ret)
> +{
> + dma_bdrv_unmap(dbs);
> + if (dbs->common.cb) {
> + dbs->common.cb(dbs->common.opaque, ret);
> + }
> + qemu_iovec_destroy(&dbs->iov);
> + if (dbs->bh) {
> + qemu_bh_delete(dbs->bh);
> + dbs->bh = NULL;
> + }
> + qemu_aio_release(dbs);
> }
>
> static void dma_bdrv_cb(void *opaque, int ret)
> @@ -89,12 +104,9 @@ static void dma_bdrv_cb(void *opaque, int ret)
> dbs->acb = NULL;
> dbs->sector_num += dbs->iov.size / 512;
> dma_bdrv_unmap(dbs);
> - qemu_iovec_reset(&dbs->iov);
>
> if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
> - dbs->common.cb(dbs->common.opaque, ret);
> - qemu_iovec_destroy(&dbs->iov);
> - qemu_aio_release(dbs);
> + dma_complete(dbs, ret);
> return;
> }
>
> @@ -120,9 +132,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
> dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
> dbs->iov.size / 512, dma_bdrv_cb, dbs);
> if (!dbs->acb) {
> - dma_bdrv_unmap(dbs);
> - qemu_iovec_destroy(&dbs->iov);
> - return;
> + dbs->common.cb = NULL;
> + dma_complete(dbs, -ENOMEM);
Why don't we call the callback here? I know that it already was this way
before your patch, but isn't that a bug?
Also, I think it should be -EIO instead of -ENOMEM (even though it
doesn't make any difference if we don't call the callback)
> }
> }
>
> @@ -131,8 +142,12 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb)
> DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
>
> if (dbs->acb) {
> - bdrv_aio_cancel(dbs->acb);
> + BlockDriverAIOCB *acb = dbs->acb;
> + dbs->acb = NULL;
> + bdrv_aio_cancel(acb);
> }
> + dbs->common.cb = NULL;
> + dma_complete(dbs, 0);
Did you consider that there are block drivers that implement
bdrv_aio_cancel() as waiting for completion of outstanding requests? I
think in that case dma_complete() may be called twice. For most of it,
this shouldn't be a problem, but I think it doesn't work with the
qemu_aio_release(dbs).
Kevin
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-09 12:14 ` Kevin Wolf
@ 2011-09-09 12:43 ` Paolo Bonzini
2011-09-09 12:59 ` Kevin Wolf
0 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-09 12:43 UTC (permalink / raw)
To: Kevin Wolf; +Cc: qemu-devel
> > @@ -120,9 +132,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
> > dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
> > dbs->iov.size / 512, dma_bdrv_cb, dbs);
> > if (!dbs->acb) {
> > - dma_bdrv_unmap(dbs);
> > - qemu_iovec_destroy(&dbs->iov);
> > - return;
> > + dbs->common.cb = NULL;
> > + dma_complete(dbs, -ENOMEM);
>
> Why don't we call the callback here? I know that it already was this
> way before your patch, but isn't that a bug?
>
> Also, I think it should be -EIO instead of -ENOMEM (even though it
> doesn't make any difference if we don't call the callback)
If I understood the code correctly, dbs->io_func can only fail if it
fails to get an AIOCB, which is basically out-of-memory. By the way, I
remember reading (from you?) that this is bogus and it would be cleaner
if callers of functions returning an AIOCB just assumed the return value
to be non-NULL.
I checked now, and the following block drivers can return a NULL AIOCB
even if qemu_aio_get succeeds:
* blkdebug (easily fixed ;))
* curl (seems like it also boils down to out-of-memory)
* rbd (can fail in librbd; might defer completion with an error to a
bottom half instead)
* linux-aio (when io_submit fails; might fall back to posix-aio-compat
instead).
> > @@ -131,8 +142,12 @@ static void dma_aio_cancel(BlockDriverAIOCB
> > *acb)
> > DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
> >
> > if (dbs->acb) {
> > - bdrv_aio_cancel(dbs->acb);
> > + BlockDriverAIOCB *acb = dbs->acb;
> > + dbs->acb = NULL;
> > + bdrv_aio_cancel(acb);
> > }
> > + dbs->common.cb = NULL;
> > + dma_complete(dbs, 0);
>
> Did you consider that there are block drivers that implement
> bdrv_aio_cancel() as waiting for completion of outstanding requests? I
> think in that case dma_complete() may be called twice. For most of it,
> this shouldn't be a problem, but I think it doesn't work with the
> qemu_aio_release(dbs).
Right. But then what to do (short of inventing reference counting
of some sort for AIOCBs) with those that don't? Leaking should not
be acceptable, should it?
In short, this patch can be dropped, but it shows more problems. :)
Paolo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-09 12:43 ` Paolo Bonzini
@ 2011-09-09 12:59 ` Kevin Wolf
2011-09-09 13:12 ` Paolo Bonzini
0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2011-09-09 12:59 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Am 09.09.2011 14:43, schrieb Paolo Bonzini:
>>> @@ -120,9 +132,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
>>> dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
>>> dbs->iov.size / 512, dma_bdrv_cb, dbs);
>>> if (!dbs->acb) {
>>> - dma_bdrv_unmap(dbs);
>>> - qemu_iovec_destroy(&dbs->iov);
>>> - return;
>>> + dbs->common.cb = NULL;
>>> + dma_complete(dbs, -ENOMEM);
>>
>> Why don't we call the callback here? I know that it already was this
>> way before your patch, but isn't that a bug?
>>
>> Also, I think it should be -EIO instead of -ENOMEM (even though it
>> doesn't make any difference if we don't call the callback)
>
> If I understood the code correctly, dbs->io_func can only fail if it
> fails to get an AIOCB, which is basically out-of-memory.
Yeah, maybe you're right with the error code. Anyway, should we call the
callback?
> By the way, I
> remember reading (from you?) that this is bogus and it would be cleaner
> if callers of functions returning an AIOCB just assumed the return value
> to be non-NULL.
>
> I checked now, and the following block drivers can return a NULL AIOCB
> even if qemu_aio_get succeeds:
>
> * blkdebug (easily fixed ;))
>
> * curl (seems like it also boils down to out-of-memory)
>
> * rbd (can fail in librbd; might defer completion with an error to a
> bottom half instead)
>
> * linux-aio (when io_submit fails; might fall back to posix-aio-compat
> instead).
I think it would make sense to require block drivers to return a valid
ACB (qemu_aio_get never returns NULL). If they have an error to report
they should schedule a BH that calls the callback.
>>> @@ -131,8 +142,12 @@ static void dma_aio_cancel(BlockDriverAIOCB
>>> *acb)
>>> DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
>>>
>>> if (dbs->acb) {
>>> - bdrv_aio_cancel(dbs->acb);
>>> + BlockDriverAIOCB *acb = dbs->acb;
>>> + dbs->acb = NULL;
>>> + bdrv_aio_cancel(acb);
>>> }
>>> + dbs->common.cb = NULL;
>>> + dma_complete(dbs, 0);
>>
>> Did you consider that there are block drivers that implement
>> bdrv_aio_cancel() as waiting for completion of outstanding requests? I
>> think in that case dma_complete() may be called twice. For most of it,
>> this shouldn't be a problem, but I think it doesn't work with the
>> qemu_aio_release(dbs).
>
> Right. But then what to do (short of inventing reference counting
> of some sort for AIOCBs) with those that don't? Leaking should not
> be acceptable, should it?
Hm, not sure. This whole cancellation stuff is so broken...
Maybe we should really refcount dbs (actually it would be more like a
bool in_cancel that means that dma_complete doesn't release the AIOCB)
> In short, this patch can be dropped, but it shows more problems. :)
I'd rather have it fixed than dropped :-)
Kevin
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-09 12:59 ` Kevin Wolf
@ 2011-09-09 13:12 ` Paolo Bonzini
2011-09-09 13:34 ` Kevin Wolf
0 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-09 13:12 UTC (permalink / raw)
To: Kevin Wolf; +Cc: qemu-devel
On 09/09/2011 02:59 PM, Kevin Wolf wrote:
>>> Also, I think it should be -EIO instead of -ENOMEM (even though it
>>> doesn't make any difference if we don't call the callback)
>>
>> If I understood the code correctly, dbs->io_func can only fail if it
>> fails to get an AIOCB, which is basically out-of-memory.
>
> Yeah, maybe you're right with the error code. Anyway, should we call the
> callback?
Considering that out-of-memory cannot happen and a couple of drivers do
return NULL, you're right about going for EIO and calling the callback.
> I think it would make sense to require block drivers to return a valid
> ACB (qemu_aio_get never returns NULL). If they have an error to report
> they should schedule a BH that calls the callback.
Perhaps you can write it down on the Wiki? There is already a block
driver braindump page, right?
>>> Did you consider that there are block drivers that implement
>>> bdrv_aio_cancel() as waiting for completion of outstanding requests? I
>>> think in that case dma_complete() may be called twice. For most of it,
>>> this shouldn't be a problem, but I think it doesn't work with the
>>> qemu_aio_release(dbs).
>>
>> Right. But then what to do (short of inventing reference counting
>> of some sort for AIOCBs) with those that don't? Leaking should not
>> be acceptable, should it?
>
> Hm, not sure. This whole cancellation stuff is so broken...
>
> Maybe we should really refcount dbs (actually it would be more like a
> bool in_cancel that means that dma_complete doesn't release the AIOCB)
But then it would leak for the drivers that do not wait for completion?
The problem is that the caller specifies what you should do but you do
not know it.
In fact it may be worse than just the qemu_aio_release: if the driver is
waiting for the request to complete, it will write over the bounce
buffer after dma_bdrv_unmap has been called.
In other words, I don't think this is fixable at all without reference
counting _all_ AIOCBs.
>> In short, this patch can be dropped, but it shows more problems. :)
>
> I'd rather have it fixed than dropped :-)
Me too. However, I'm looking at this because SCSI reset cancels all
pending I/O, and I would like to do a SCSI reset (and report unit
attention) just before migration, as an easy way to avoid saving the
pending requests in the migration stream. But if it's just leaking the
iovec on the migration source, I really do not care much, not yet at
least, because it's going to exit anyway. The other problems only
happen with DMA over I/O areas, which virtio-scsi won't do. I don't
think it's really fixable, so I quit. :)
Paolo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-09 13:12 ` Paolo Bonzini
@ 2011-09-09 13:34 ` Kevin Wolf
2011-09-09 13:43 ` Paolo Bonzini
0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2011-09-09 13:34 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Am 09.09.2011 15:12, schrieb Paolo Bonzini:
> On 09/09/2011 02:59 PM, Kevin Wolf wrote:
>>>> Also, I think it should be -EIO instead of -ENOMEM (even though it
>>>> doesn't make any difference if we don't call the callback)
>>>
>>> If I understood the code correctly, dbs->io_func can only fail if it
>>> fails to get an AIOCB, which is basically out-of-memory.
>>
>> Yeah, maybe you're right with the error code. Anyway, should we call the
>> callback?
>
> Considering that out-of-memory cannot happen and a couple of drivers do
> return NULL, you're right about going for EIO and calling the callback.
>
>> I think it would make sense to require block drivers to return a valid
>> ACB (qemu_aio_get never returns NULL). If they have an error to report
>> they should schedule a BH that calls the callback.
>
> Perhaps you can write it down on the Wiki? There is already a block
> driver braindump page, right?
http://wiki.qemu.org/BlockRoadmap
This one? Adding it there now.
>>>> Did you consider that there are block drivers that implement
>>>> bdrv_aio_cancel() as waiting for completion of outstanding requests? I
>>>> think in that case dma_complete() may be called twice. For most of it,
>>>> this shouldn't be a problem, but I think it doesn't work with the
>>>> qemu_aio_release(dbs).
>>>
>>> Right. But then what to do (short of inventing reference counting
>>> of some sort for AIOCBs) with those that don't? Leaking should not
>>> be acceptable, should it?
>>
>> Hm, not sure. This whole cancellation stuff is so broken...
>>
>> Maybe we should really refcount dbs (actually it would be more like a
>> bool in_cancel that means that dma_complete doesn't release the AIOCB)
>
> But then it would leak for the drivers that do not wait for completion?
> The problem is that the caller specifies what you should do but you do
> not know it.
Why would it leak? To clarify, what I'm thinking of is:
static void dma_aio_cancel(BlockDriverAIOCB *acb)
{
DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
if (dbs->acb) {
BlockDriverAIOCB *acb = dbs->acb;
dbs->acb = NULL;
dbs->in_cancel = true;
bdrv_aio_cancel(acb);
dbs->in_cancel = false;
}
dbs->common.cb = NULL;
dma_complete(dbs, 0);
}
And then in dma_complete:
...
if (!dbs->in_cancel) {
qemu_aio_release(dbs);
}
}
So the release that we avoid is the release in the callback that may or
may not be called indirectly by bdrv_aio_cancel. We always call
dma_complete at the end of dma_aio_cancel so that it will be properly freed.
> In fact it may be worse than just the qemu_aio_release: if the driver is
> waiting for the request to complete, it will write over the bounce
> buffer after dma_bdrv_unmap has been called.
How that? dma_bdrv_unmap is called only afterwards, isn't it?
Kevin
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation
2011-09-09 13:34 ` Kevin Wolf
@ 2011-09-09 13:43 ` Paolo Bonzini
0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-09 13:43 UTC (permalink / raw)
To: Kevin Wolf; +Cc: qemu-devel
On 09/09/2011 03:34 PM, Kevin Wolf wrote:
> So the release that we avoid is the release in the callback that may or
> may not be called indirectly by bdrv_aio_cancel. We always call
> dma_complete at the end of dma_aio_cancel so that it will be properly freed.
>
>> In fact it may be worse than just the qemu_aio_release: if the driver is
>> waiting for the request to complete, it will write over the bounce
>> buffer after dma_bdrv_unmap has been called.
>
> How that? dma_bdrv_unmap is called only afterwards, isn't it?
I had missed your point completely. :) Yes, the above should work.
Paolo
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Qemu-devel] [PATCH 4/5] scsi-disk: commonize iovec creation between reads and writes
2011-09-07 15:20 [Qemu-devel] [PATCH 0/5] block: preparatory patches for scatter/gather support Paolo Bonzini
` (2 preceding siblings ...)
2011-09-07 15:21 ` [Qemu-devel] [PATCH 3/5] dma-helpers: rewrite completion/cancellation Paolo Bonzini
@ 2011-09-07 15:21 ` Paolo Bonzini
2011-09-07 15:21 ` [Qemu-devel] [PATCH 5/5] scsi-disk: lazily allocate bounce buffer Paolo Bonzini
4 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-07 15:21 UTC (permalink / raw)
To: qemu-devel
Also, consistently use qiov.size instead of iov.iov_len.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/scsi-disk.c | 42 ++++++++++++++++++------------------------
1 files changed, 18 insertions(+), 24 deletions(-)
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 9724d0f..d0ac31e 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -105,6 +105,13 @@ static void scsi_cancel_io(SCSIRequest *req)
r->req.aiocb = NULL;
}
+static uint32_t scsi_init_iovec(SCSIDiskReq *r)
+{
+ r->iov.iov_len = MIN(r->sector_count * 512, SCSI_DMA_BUF_SIZE);
+ qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+ return r->qiov.size / 512;
+}
+
static void scsi_read_complete(void * opaque, int ret)
{
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
@@ -122,12 +129,12 @@ static void scsi_read_complete(void * opaque, int ret)
}
}
- DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
+ DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size);
- n = r->iov.iov_len / 512;
+ n = r->qiov.size / 512;
r->sector += n;
r->sector_count -= n;
- scsi_req_data(&r->req, r->iov.iov_len);
+ scsi_req_data(&r->req, r->qiov.size);
}
static void scsi_flush_complete(void * opaque, int ret)
@@ -178,13 +185,7 @@ static void scsi_read_data(SCSIRequest *req)
return;
}
- n = r->sector_count;
- if (n > SCSI_DMA_BUF_SIZE / 512)
- n = SCSI_DMA_BUF_SIZE / 512;
-
- r->iov.iov_len = n * 512;
- qemu_iovec_init_external(&r->qiov, &r->iov, 1);
-
+ n = scsi_init_iovec(r);
bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
scsi_read_complete, r);
@@ -233,7 +234,6 @@ static void scsi_write_complete(void * opaque, int ret)
{
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- uint32_t len;
uint32_t n;
if (r->req.aiocb != NULL) {
@@ -247,19 +247,15 @@ static void scsi_write_complete(void * opaque, int ret)
}
}
- n = r->iov.iov_len / 512;
+ n = r->qiov.size / 512;
r->sector += n;
r->sector_count -= n;
if (r->sector_count == 0) {
scsi_req_complete(&r->req, GOOD);
} else {
- len = r->sector_count * 512;
- if (len > SCSI_DMA_BUF_SIZE) {
- len = SCSI_DMA_BUF_SIZE;
- }
- r->iov.iov_len = len;
- DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, len);
- scsi_req_data(&r->req, len);
+ scsi_init_iovec(r);
+ DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, r->qiov.size);
+ scsi_req_data(&r->req, r->qiov.size);
}
}
@@ -278,18 +274,16 @@ static void scsi_write_data(SCSIRequest *req)
return;
}
- n = r->iov.iov_len / 512;
+ n = r->qiov.size / 512;
if (n) {
- qemu_iovec_init_external(&r->qiov, &r->iov, 1);
-
bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
- scsi_write_complete, r);
+ scsi_write_complete, r);
if (r->req.aiocb == NULL) {
scsi_write_complete(r, -ENOMEM);
}
} else {
- /* Invoke completion routine to fetch data from host. */
+ /* Called for the first time. Ask the driver to send us more data. */
scsi_write_complete(r, 0);
}
}
--
1.7.6
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [Qemu-devel] [PATCH 5/5] scsi-disk: lazily allocate bounce buffer
2011-09-07 15:20 [Qemu-devel] [PATCH 0/5] block: preparatory patches for scatter/gather support Paolo Bonzini
` (3 preceding siblings ...)
2011-09-07 15:21 ` [Qemu-devel] [PATCH 4/5] scsi-disk: commonize iovec creation between reads and writes Paolo Bonzini
@ 2011-09-07 15:21 ` Paolo Bonzini
4 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2011-09-07 15:21 UTC (permalink / raw)
To: qemu-devel
It will not be needed for reads and writes if the HBA provides a sglist.
In addition, this lets scsi-disk refuse commands with an excessive
allocation length, as well as limit memory on usual well-behaved guests.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/scsi-disk.c | 44 +++++++++++++++++++++++++++++++++-----------
1 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index d0ac31e..4a01e0f 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -54,6 +54,7 @@ typedef struct SCSIDiskReq {
/* Both sector and sector_count are in terms of qemu 512 byte blocks. */
uint64_t sector;
uint32_t sector_count;
+ uint32_t buflen;
struct iovec iov;
QEMUIOVector qiov;
uint32_t status;
@@ -75,13 +76,15 @@ struct SCSIDiskState
};
static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type);
-static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf);
+static int scsi_disk_emulate_command(SCSIDiskReq *r);
static void scsi_free_request(SCSIRequest *req)
{
SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
- qemu_vfree(r->iov.iov_base);
+ if (r->iov.iov_base) {
+ qemu_vfree(r->iov.iov_base);
+ }
}
/* Helper function for command completion with sense. */
@@ -107,7 +110,13 @@ static void scsi_cancel_io(SCSIRequest *req)
static uint32_t scsi_init_iovec(SCSIDiskReq *r)
{
- r->iov.iov_len = MIN(r->sector_count * 512, SCSI_DMA_BUF_SIZE);
+ SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+ if (!r->iov.iov_base) {
+ r->buflen = SCSI_DMA_BUF_SIZE;
+ r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+ }
+ r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
qemu_iovec_init_external(&r->qiov, &r->iov, 1);
return r->qiov.size / 512;
}
@@ -314,7 +323,7 @@ static void scsi_dma_restart_bh(void *opaque)
scsi_write_data(&r->req);
break;
case SCSI_REQ_STATUS_RETRY_FLUSH:
- ret = scsi_disk_emulate_command(r, r->iov.iov_base);
+ ret = scsi_disk_emulate_command(r);
if (ret == 0) {
scsi_req_complete(&r->req, GOOD);
}
@@ -808,13 +817,31 @@ static int scsi_disk_emulate_read_toc(SCSIRequest *req, uint8_t *outbuf)
return toclen;
}
-static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
+static int scsi_disk_emulate_command(SCSIDiskReq *r)
{
SCSIRequest *req = &r->req;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
uint64_t nb_sectors;
+ uint8_t *outbuf;
int buflen = 0;
+ if (!r->iov.iov_base) {
+ /*
+ * FIXME: we shouldn't return anything bigger than 4k, but the code
+ * requires the buffer to be as big as req->cmd.xfer in several
+ * places. So, do not allow CDBs with a very large ALLOCATION
+ * LENGTH. The real fix would be to modify scsi_read_data and
+ * dma_buf_read, so that they return data beyond the buflen
+ * as all zeros.
+ */
+ if (req->cmd.xfer > 65536) {
+ goto illegal_request;
+ }
+ r->buflen = MAX(4096, req->cmd.xfer);
+ r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+ }
+
+ outbuf = r->iov.iov_base;
switch (req->cmd.buf[0]) {
case TEST_UNIT_READY:
if (!bdrv_is_inserted(s->bs))
@@ -965,11 +992,9 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
int32_t len;
uint8_t command;
- uint8_t *outbuf;
int rc;
command = buf[0];
- outbuf = (uint8_t *)r->iov.iov_base;
DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", req->lun, req->tag, buf[0]);
#ifdef DEBUG_SCSI
@@ -998,7 +1023,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
case GET_CONFIGURATION:
case SERVICE_ACTION_IN_16:
case VERIFY_10:
- rc = scsi_disk_emulate_command(r, outbuf);
+ rc = scsi_disk_emulate_command(r);
if (rc < 0) {
return 0;
}
@@ -1228,11 +1253,8 @@ static SCSIRequest *scsi_new_request(SCSIDevice *d, uint32_t tag,
{
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, d);
SCSIRequest *req;
- SCSIDiskReq *r;
req = scsi_req_alloc(&scsi_disk_reqops, &s->qdev, tag, lun, hba_private);
- r = DO_UPCAST(SCSIDiskReq, req, req);
- r->iov.iov_base = qemu_blockalign(s->bs, SCSI_DMA_BUF_SIZE);
return req;
}
--
1.7.6
^ permalink raw reply related [flat|nested] 15+ messages in thread