* [PATCH] Avoid fragment virtio-blk transfers by copying
@ 2008-06-24 18:16 Anthony Liguori
2008-06-24 19:47 ` Marcelo Tosatti
2008-06-25 10:44 ` Avi Kivity
0 siblings, 2 replies; 7+ messages in thread
From: Anthony Liguori @ 2008-06-24 18:16 UTC (permalink / raw)
To: kvm; +Cc: Avi Kivity, Marcelo Tosatti, Anthony Liguori
A major source of performance loss for virtio-blk has been the fact that we
split transfers into multiple requests. This is particularly harmful if you
have striped storage beneath your virtual machine.
This patch copies the request data into a single contiguous buffer to ensure
that we don't split requests. This improves performance from about 80 MB/sec
to about 155 MB/sec with my fibre channel link. 185 MB/sec is what we get on
native so this gets us pretty darn close.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
index 233e6e7..2ea5669 100644
--- a/qemu/hw/virtio-blk.c
+++ b/qemu/hw/virtio-blk.c
@@ -72,6 +72,7 @@ typedef struct VirtIOBlock
{
VirtIODevice vdev;
BlockDriverState *bs;
+ VirtQueue *vq;
} VirtIOBlock;
static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -81,106 +82,138 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
typedef struct VirtIOBlockReq
{
- VirtIODevice *vdev;
- VirtQueue *vq;
- struct iovec in_sg_status;
- unsigned int pending;
- unsigned int len;
- unsigned int elem_idx;
- int status;
+ VirtIOBlock *dev;
+ VirtQueueElement elem;
+ struct virtio_blk_inhdr *in;
+ struct virtio_blk_outhdr *out;
+ size_t size;
+ uint8_t *buffer;
} VirtIOBlockReq;
static void virtio_blk_rw_complete(void *opaque, int ret)
{
VirtIOBlockReq *req = opaque;
- struct virtio_blk_inhdr *in;
- VirtQueueElement elem;
+ VirtIOBlock *s = req->dev;
+
+ /* Copy read data to the guest */
+ if (!ret && !(req->out->type & VIRTIO_BLK_T_OUT)) {
+ size_t offset = 0;
+ int i;
- req->status |= ret;
- if (--req->pending > 0)
- return;
+ for (i = 0; i < req->elem.in_num - 1; i++) {
+ size_t len;
- elem.index = req->elem_idx;
- in = (void *)req->in_sg_status.iov_base;
+ /* Be pretty defensive wrt malicious guests */
+ len = MIN(req->elem.in_sg[i].iov_len,
+ req->size - offset);
- in->status = req->status ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
- virtqueue_push(req->vq, &elem, req->len);
- virtio_notify(req->vdev, req->vq);
+ memcpy(req->elem.in_sg[i].iov_base,
+ req->buffer + offset,
+ len);
+ offset += len;
+ }
+ }
+
+ req->in->status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+ virtqueue_push(s->vq, &req->elem, req->size + sizeof(*req->in));
+ virtio_notify(&s->vdev, s->vq);
+
+ qemu_free(req->buffer);
qemu_free(req);
}
+static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
+{
+ VirtIOBlockReq *req;
+
+ req = qemu_mallocz(sizeof(*req));
+ if (req == NULL)
+ return NULL;
+
+ req->dev = s;
+ if (!virtqueue_pop(s->vq, &req->elem)) {
+ qemu_free(req);
+ return NULL;
+ }
+
+ return req;
+}
+
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBlock *s = to_virtio_blk(vdev);
- VirtQueueElement elem;
VirtIOBlockReq *req;
- unsigned int count;
- while ((count = virtqueue_pop(vq, &elem)) != 0) {
- struct virtio_blk_inhdr *in;
- struct virtio_blk_outhdr *out;
- off_t off;
+ while ((req = virtio_blk_get_request(s))) {
int i;
- if (elem.out_num < 1 || elem.in_num < 1) {
+ if (req->elem.out_num < 1 || req->elem.in_num < 1) {
fprintf(stderr, "virtio-blk missing headers\n");
exit(1);
}
- if (elem.out_sg[0].iov_len != sizeof(*out) ||
- elem.in_sg[elem.in_num - 1].iov_len != sizeof(*in)) {
+ if (req->elem.out_sg[0].iov_len < sizeof(*req->out) ||
+ req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) {
fprintf(stderr, "virtio-blk header not in correct element\n");
exit(1);
}
- /*
- * FIXME: limit the number of in-flight requests
- */
- req = qemu_malloc(sizeof(VirtIOBlockReq));
- if (!req)
- return;
- memset(req, 0, sizeof(*req));
- memcpy(&req->in_sg_status, &elem.in_sg[elem.in_num - 1],
- sizeof(req->in_sg_status));
- req->vdev = vdev;
- req->vq = vq;
- req->elem_idx = elem.index;
-
- out = (void *)elem.out_sg[0].iov_base;
- in = (void *)elem.in_sg[elem.in_num - 1].iov_base;
- off = out->sector;
-
- if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
- unsigned int len = sizeof(*in);
-
- in->status = VIRTIO_BLK_S_UNSUPP;
- virtqueue_push(vq, &elem, len);
+ req->out = (void *)req->elem.out_sg[0].iov_base;
+ req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;
+
+ if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
+ unsigned int len = sizeof(*req->in);
+
+ req->in->status = VIRTIO_BLK_S_UNSUPP;
+ virtqueue_push(vq, &req->elem, len);
virtio_notify(vdev, vq);
qemu_free(req);
- } else if (out->type & VIRTIO_BLK_T_OUT) {
- req->pending = elem.out_num - 1;
-
- for (i = 1; i < elem.out_num; i++) {
- req->len += elem.out_sg[i].iov_len;
- bdrv_aio_write(s->bs, off,
- elem.out_sg[i].iov_base,
- elem.out_sg[i].iov_len / 512,
+ } else if (req->out->type & VIRTIO_BLK_T_OUT) {
+ size_t offset;
+
+ for (i = 1; i < req->elem.out_num; i++)
+ req->size += req->elem.out_sg[i].iov_len;
+
+ req->buffer = qemu_malloc(req->size);
+ if (req->buffer == NULL) {
+ qemu_free(req);
+ break;
+ }
+
+ /* We copy the data from the SG list to avoid splitting up the request. This helps
+ performance a lot until we can pass full sg lists as AIO operations */
+ offset = 0;
+ for (i = 1; i < req->elem.out_num; i++) {
+ size_t len;
+
+ len = MIN(req->elem.in_sg[i].iov_len,
+ req->size - offset);
+ memcpy(req->buffer + offset,
+ req->elem.out_sg[i].iov_base,
+ len);
+ offset += len;
+ }
+
+ bdrv_aio_write(s->bs, req->out->sector,
+ req->buffer,
+ req->size / 512,
virtio_blk_rw_complete,
req);
- off += elem.out_sg[i].iov_len / 512;
- }
} else {
- req->pending = elem.in_num - 1;
+ for (i = 0; i < req->elem.in_num - 1; i++)
+ req->size += req->elem.in_sg[i].iov_len;
+
+ req->buffer = qemu_malloc(req->size);
+ if (req->buffer == NULL) {
+ qemu_free(req);
+ break;
+ }
- for (i = 0; i < elem.in_num - 1; i++) {
- req->len += elem.in_sg[i].iov_len;
- bdrv_aio_read(s->bs, off,
- elem.in_sg[i].iov_base,
- elem.in_sg[i].iov_len / 512,
+ bdrv_aio_read(s->bs, req->out->sector,
+ req->buffer,
+ req->size / 512,
virtio_blk_rw_complete,
req);
- off += elem.in_sg[i].iov_len / 512;
- }
}
}
/*
@@ -192,8 +225,6 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
static void virtio_blk_reset(VirtIODevice *vdev)
{
- VirtIOBlock *s = to_virtio_blk(vdev);
-
/*
* This should cancel pending requests, but can't do nicely until there
* are per-device request lists.
@@ -205,7 +236,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
VirtIOBlock *s = to_virtio_blk(vdev);
struct virtio_blk_config blkcfg;
- int64_t capacity;
+ uint64_t capacity;
int cylinders, heads, secs;
bdrv_get_geometry(s->bs, &capacity);
@@ -263,7 +294,7 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
bdrv_set_geometry_hint(s->bs, cylinders, heads, secs);
- virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
+ s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
register_savevm("virtio-blk", virtio_blk_id++, 1,
virtio_blk_save, virtio_blk_load, s);
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH] Avoid fragment virtio-blk transfers by copying
2008-06-24 18:16 [PATCH] Avoid fragment virtio-blk transfers by copying Anthony Liguori
@ 2008-06-24 19:47 ` Marcelo Tosatti
2008-06-25 10:44 ` Avi Kivity
1 sibling, 0 replies; 7+ messages in thread
From: Marcelo Tosatti @ 2008-06-24 19:47 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm, Avi Kivity
On Tue, Jun 24, 2008 at 01:16:24PM -0500, Anthony Liguori wrote:
> A major source of performance loss for virtio-blk has been the fact that we
> split transfers into multiple requests. This is particularly harmful if you
> have striped storage beneath your virtual machine.
>
> This patch copies the request data into a single contiguous buffer to ensure
> that we don't split requests. This improves performance from about 80 MB/sec
> to about 155 MB/sec with my fibre channel link. 185 MB/sec is what we get on
> native so this gets us pretty darn close.
>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Looks good.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Avoid fragment virtio-blk transfers by copying
2008-06-24 18:16 [PATCH] Avoid fragment virtio-blk transfers by copying Anthony Liguori
2008-06-24 19:47 ` Marcelo Tosatti
@ 2008-06-25 10:44 ` Avi Kivity
2008-06-25 15:55 ` Marcelo Tosatti
2008-06-25 16:28 ` Anthony Liguori
1 sibling, 2 replies; 7+ messages in thread
From: Avi Kivity @ 2008-06-25 10:44 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm, Marcelo Tosatti
Anthony Liguori wrote:
> A major source of performance loss for virtio-blk has been the fact that we
> split transfers into multiple requests. This is particularly harmful if you
> have striped storage beneath your virtual machine.
>
> This patch copies the request data into a single contiguous buffer to ensure
> that we don't split requests. This improves performance from about 80 MB/sec
> to about 155 MB/sec with my fibre channel link. 185 MB/sec is what we get on
> native so this gets us pretty darn close.
>
>
If the guest issues a request for a terabyte of memory, the host will
try to allocate it and drop to swap/oom. So we need to either fragment
beyond some size, or to avoid copying and thus the need for allocation.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Avoid fragment virtio-blk transfers by copying
2008-06-25 10:44 ` Avi Kivity
@ 2008-06-25 15:55 ` Marcelo Tosatti
2008-06-29 9:56 ` Avi Kivity
2008-06-25 16:28 ` Anthony Liguori
1 sibling, 1 reply; 7+ messages in thread
From: Marcelo Tosatti @ 2008-06-25 15:55 UTC (permalink / raw)
To: Avi Kivity; +Cc: Anthony Liguori, kvm
On Wed, Jun 25, 2008 at 01:44:35PM +0300, Avi Kivity wrote:
> Anthony Liguori wrote:
>> A major source of performance loss for virtio-blk has been the fact that we
>> split transfers into multiple requests. This is particularly harmful if you
>> have striped storage beneath your virtual machine.
>>
>> This patch copies the request data into a single contiguous buffer to ensure
>> that we don't split requests. This improves performance from about 80 MB/sec
>> to about 155 MB/sec with my fibre channel link. 185 MB/sec is what we get on
>> native so this gets us pretty darn close.
>>
>>
>
> If the guest issues a request for a terabyte of memory, the host will
> try to allocate it and drop to swap/oom. So we need to either fragment
> beyond some size, or to avoid copying and thus the need for allocation.
The maximum request size for Linux guests is 512K (after tuning
virtio-blk guest driver, current max is 124K). I'm not sure what the max
number of requests is, but I guess is between 128 and 1024, Anthony?
So with the current configuration your concern is not an issue. BTW,
what is maximum request size for the Windows driver?
Point is that the guest is responsible for limiting the amount of data
in-flight. A malicious guest can only hurt itself by attempting to DoS
the host, with proper memory limits in place. IMO this issue should not
be handled in the virtio-blk backend.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Avoid fragment virtio-blk transfers by copying
2008-06-25 15:55 ` Marcelo Tosatti
@ 2008-06-29 9:56 ` Avi Kivity
0 siblings, 0 replies; 7+ messages in thread
From: Avi Kivity @ 2008-06-29 9:56 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Anthony Liguori, kvm
Marcelo Tosatti wrote:
> On Wed, Jun 25, 2008 at 01:44:35PM +0300, Avi Kivity wrote:
>
>> Anthony Liguori wrote:
>>
>>> A major source of performance loss for virtio-blk has been the fact that we
>>> split transfers into multiple requests. This is particularly harmful if you
>>> have striped storage beneath your virtual machine.
>>>
>>> This patch copies the request data into a single contiguous buffer to ensure
>>> that we don't split requests. This improves performance from about 80 MB/sec
>>> to about 155 MB/sec with my fibre channel link. 185 MB/sec is what we get on
>>> native so this gets us pretty darn close.
>>>
>>>
>>>
>> If the guest issues a request for a terabyte of memory, the host will
>> try to allocate it and drop to swap/oom. So we need to either fragment
>> beyond some size, or to avoid copying and thus the need for allocation.
>>
>
> The maximum request size for Linux guests is 512K (after tuning
> virtio-blk guest driver, current max is 124K). I'm not sure what the max
> number of requests is, but I guess is between 128 and 1024, Anthony?
>
>
This is assuming 1 page per descriptor. But the len field is 32 bits,
so each descriptor can address up to 4GB.
> Point is that the guest is responsible for limiting the amount of data
> in-flight. A malicious guest can only hurt itself by attempting to DoS
> the host, with proper memory limits in place. IMO this issue should not
> be handled in the virtio-blk backend.
I'd like kvm to be safe for use even without the rss controller.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Avoid fragment virtio-blk transfers by copying
2008-06-25 10:44 ` Avi Kivity
2008-06-25 15:55 ` Marcelo Tosatti
@ 2008-06-25 16:28 ` Anthony Liguori
2008-06-29 10:01 ` Avi Kivity
1 sibling, 1 reply; 7+ messages in thread
From: Anthony Liguori @ 2008-06-25 16:28 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Marcelo Tosatti
Avi Kivity wrote:
> Anthony Liguori wrote:
>> A major source of performance loss for virtio-blk has been the fact
>> that we
>> split transfers into multiple requests. This is particularly harmful
>> if you
>> have striped storage beneath your virtual machine.
>>
>> This patch copies the request data into a single contiguous buffer to
>> ensure
>> that we don't split requests. This improves performance from about
>> 80 MB/sec
>> to about 155 MB/sec with my fibre channel link. 185 MB/sec is what
>> we get on
>> native so this gets us pretty darn close.
>>
>>
>
> If the guest issues a request for a terabyte of memory, the host will
> try to allocate it and drop to swap/oom.
An unprivileged user should not be able to OOM the kernel by simply
doing memory allocations. Likewise, while it will start swapping, that
should primarily affect the application allocating memory (although yes,
it's consuming IO bandwidth to do the swapping).
As long as we properly handle memory allocation failures, it's my
contention that allowing a guest to allocate unbound amounts of virtual
memory is safe.
> So we need to either fragment beyond some size, or to avoid copying
> and thus the need for allocation.
As Marcelo mentioned, there are very practical limitations on how much
memory can be in-flight on any given queue. A malicious guest could
construct a nasty queue that basically pointed to all of the guest
physical memory for each entry in the queue but that's still a bound size.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Avoid fragment virtio-blk transfers by copying
2008-06-25 16:28 ` Anthony Liguori
@ 2008-06-29 10:01 ` Avi Kivity
0 siblings, 0 replies; 7+ messages in thread
From: Avi Kivity @ 2008-06-29 10:01 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm, Marcelo Tosatti
Anthony Liguori wrote:
> Avi Kivity wrote:
>> Anthony Liguori wrote:
>>> A major source of performance loss for virtio-blk has been the fact
>>> that we
>>> split transfers into multiple requests. This is particularly
>>> harmful if you
>>> have striped storage beneath your virtual machine.
>>>
>>> This patch copies the request data into a single contiguous buffer
>>> to ensure
>>> that we don't split requests. This improves performance from about
>>> 80 MB/sec
>>> to about 155 MB/sec with my fibre channel link. 185 MB/sec is what
>>> we get on
>>> native so this gets us pretty darn close.
>>>
>>>
>>
>> If the guest issues a request for a terabyte of memory, the host will
>> try to allocate it and drop to swap/oom.
>
> An unprivileged user should not be able to OOM the kernel by simply
> doing memory allocations. Likewise, while it will start swapping,
> that should primarily affect the application allocating memory
> (although yes, it's consuming IO bandwidth to do the swapping).
>
Without the rss controller, there's no reason that this should happen.
Linux will swap not-recently-used pages, whether belonging to the
application or not.
> As long as we properly handle memory allocation failures, it's my
> contention that allowing a guest to allocate unbound amounts of
> virtual memory is safe.
>
It isn't: suppose you oom the machine, even if you have proper
overcommit enabled (which is unlikely) there's no guarantee that the
allocating guest will get the ENOMEM. Some other application might, and
it's unreasonable to expect it to recover.
Consider also a machine with no swap. You've carefully allocated 1G to
a VM, and here it goes and allocates the rest, ooming your fine-tuned host.
>> So we need to either fragment beyond some size, or to avoid copying
>> and thus the need for allocation.
>
> As Marcelo mentioned, there are very practical limitations on how much
> memory can be in-flight on any given queue. A malicious guest could
> construct a nasty queue that basically pointed to all of the guest
> physical memory for each entry in the queue but that's still a bound
> size.
It's bounded, but by a really huge number (4GB x ring length x number of
disks).
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2008-06-29 10:01 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-24 18:16 [PATCH] Avoid fragment virtio-blk transfers by copying Anthony Liguori
2008-06-24 19:47 ` Marcelo Tosatti
2008-06-25 10:44 ` Avi Kivity
2008-06-25 15:55 ` Marcelo Tosatti
2008-06-29 9:56 ` Avi Kivity
2008-06-25 16:28 ` Anthony Liguori
2008-06-29 10:01 ` Avi Kivity
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox