From: Paolo Bonzini <pbonzini@redhat.com>
To: Stefan Hajnoczi <stefanha@gmail.com>
Cc: Kevin Wolf <kwolf@redhat.com>,
Anthony Liguori <aliguori@us.ibm.com>,
Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>,
qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH for-1.2] qed: refuse unaligned zero writes with a backing file
Date: Tue, 28 Aug 2012 16:23:31 +0200 [thread overview]
Message-ID: <503CD463.80009@redhat.com> (raw)
In-Reply-To: <CAJSP0QX1vj9Kg=7hmoQpWZyL5nTPsdnviw0bqehLKXOJMq+hqA@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 901 bytes --]
Il 28/08/2012 15:37, Stefan Hajnoczi ha scritto:
>> > The "right fix" would not be much more complex though, something like this, right?
>> > (untested).
> Yes but it's more complicated. To do a really good job we should
> slice off the first/last clusters if they are unaligned, handle them
> like regular allocating writes, and handle the middle of the request
> as a zero write.
>
> I decided to do the simplest implementation since this scenario only
> occurs in test cases, not real guests.
Yes, I was curious because it reminded me of the patch I did to write
zeroes when I was playing with discard to avoid the large bounce buffer
in qed_aio_write_inplace. That patch takes care of processing clusters
one by one (though that means one L2 write for each and every cluster,
not just the first and last).
It probably causes a performance hit, but anyway I attach it for
completeness.
Paolo
[-- Attachment #2: wz.patch --]
[-- Type: text/x-patch, Size: 7644 bytes --]
>From 98f2978ae5d0f34dca0369fcc727d1e533c0e6b0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 8 Mar 2012 14:48:29 +0100
Subject: [PATCH 1/2] qed: make write-zeroes bounce buffer smaller than a
single cluster
Currently, a write-zeroes operation could allocates memory for the whole
I/O operation if it is not aligned. This is not necessary, because only
two unaligned clusters could be written.
This makes the write-zeroes operation proceed one cluster at a time,
even if all clusters are currently available and zero. This does cause
worse performance due to multiple L2 table writes. However, if zero-write
detection is enabled it means we're not interested in maximum performance.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
block/qed.c | 75 +++++++++++++++++++++++++++++++++++++------------------------
1 file modificato, 46 inserzioni(+), 29 rimozioni(-)
diff --git a/block/qed.c b/block/qed.c
index a02dbfd..bcea346 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -869,7 +869,9 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
/* Free the buffer we may have allocated for zero writes */
if (acb->flags & QED_AIOCB_ZERO) {
qemu_vfree(acb->qiov->iov[0].iov_base);
- acb->qiov->iov[0].iov_base = NULL;
+ qemu_iovec_destroy(acb->qiov);
+ g_free(acb->qiov);
+ acb->qiov = NULL;
}
/* Arrange for a bh to invoke the completion function */
@@ -1096,6 +1098,34 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
}
/**
+ * Calculate the I/O vector
+ *
+ * @acb: Write request
+ * @len: Length in bytes
+ */
+static void qed_prepare_qiov(QEDAIOCB *acb, size_t len)
+{
+ /* Calculate the I/O vector */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ /* Allocate buffer for zero writes */
+ if (!acb->qiov) {
+ BDRVQEDState *s = acb_to_s(acb);
+ char *base;
+
+ acb->qiov = g_malloc(sizeof(QEMUIOVector));
+ base = qemu_blockalign(s->bs, s->header.cluster_size);
+ qemu_iovec_init(acb->qiov, 1);
+ qemu_iovec_add(acb->qiov, base, s->header.cluster_size);
+ memset(base, 0, s->header.cluster_size);
+ }
+ assert(len <= acb->qiov->size);
+ qemu_iovec_add(&acb->cur_qiov, acb->qiov->iov[0].iov_base, len);
+ } else {
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+ }
+}
+
+/**
* Write new data cluster
*
* @acb: Write request
@@ -1124,21 +1154,20 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
acb->cur_nclusters = qed_bytes_to_clusters(s,
qed_offset_into_cluster(s, acb->cur_pos) + len);
- qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
if (acb->flags & QED_AIOCB_ZERO) {
/* Skip ahead if the clusters are already zero */
if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
- qed_aio_next_io(acb, 0);
- return;
+ cb = qed_aio_next_io;
+ } else {
+ cb = qed_aio_write_zero_cluster;
}
-
- cb = qed_aio_write_zero_cluster;
} else {
cb = qed_aio_write_prefill;
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
}
+ qed_prepare_qiov(acb, len);
if (qed_should_set_need_check(s)) {
s->header.features |= QED_F_NEED_CHECK;
qed_write_header(s, cb, acb);
@@ -1158,19 +1187,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
*/
static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
{
- /* Allocate buffer for zero writes */
- if (acb->flags & QED_AIOCB_ZERO) {
- struct iovec *iov = acb->qiov->iov;
-
- if (!iov->iov_base) {
- iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
- memset(iov->iov_base, 0, iov->iov_len);
- }
- }
-
- /* Calculate the I/O vector */
acb->cur_cluster = offset;
- qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+ qed_prepare_qiov(acb, len);
/* Do the actual write */
qed_aio_write_main(acb, 0);
@@ -1270,6 +1288,7 @@ static void qed_aio_next_io(void *opaque, int ret)
{
QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
+ uint64_t len;
QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
qed_aio_write_data : qed_aio_read_data;
@@ -1291,10 +1310,14 @@ static void qed_aio_next_io(void *opaque, int ret)
return;
}
+ /* Limit buffer size to one cluster when writing zeroes. */
+ len = acb->end_pos - acb->cur_pos;
+ if (acb->flags & QED_AIOCB_ZERO) {
+ len = MIN(len, s->header.cluster_size);
+ }
+
/* Find next cluster and start I/O */
- qed_find_cluster(s, &acb->request,
- acb->cur_pos, acb->end_pos - acb->cur_pos,
- io_fn, acb);
+ qed_find_cluster(s, &acb->request, acb->cur_pos, len, io_fn, acb);
}
static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
@@ -1315,7 +1338,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
acb->request.l2_table = NULL;
- qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+ qemu_iovec_init(&acb->cur_qiov, qiov ? qiov->niov : 1);
/* Start request */
qed_aio_next_io(acb, 0);
@@ -1364,17 +1387,11 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
{
BlockDriverAIOCB *blockacb;
QEDWriteZeroesCB cb = { .done = false };
- QEMUIOVector qiov;
- struct iovec iov;
/* Zero writes start without an I/O buffer. If a buffer becomes necessary
* then it will be allocated during request processing.
*/
- iov.iov_base = NULL,
- iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE,
-
- qemu_iovec_init_external(&qiov, &iov, 1);
- blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+ blockacb = qed_aio_setup(bs, sector_num, NULL, nb_sectors,
qed_co_write_zeroes_cb, &cb,
QED_AIOCB_WRITE | QED_AIOCB_ZERO);
if (!blockacb) {
--
1.7.11.2
>From c74de5ae7650233574fb1572bc3b463864af8a3e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 28 Aug 2012 16:09:23 +0200
Subject: [PATCH 2/2] qed: do copy-on-write for the first and last cluster in
a misaligned write-zero request
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
block/qed.c | 11 +++++++++++
1 file modificato, 11 inserzioni(+)
diff --git a/block/qed.c b/block/qed.c
index bcea346..0f8d06c 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1160,9 +1160,20 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
cb = qed_aio_next_io;
} else {
+ /* For misaligned requests, do normal copy-on-write for the first
+ * and last cluster.
+ */
+ unsigned offset = qed_offset_into_cluster(s, acb->cur_pos);
+ if (offset != 0 || qed_offset_into_cluster(s, offset + len) != 0) {
+ acb->cur_nclusters = 1;
+ len = MIN(len, s->header.cluster_size - offset);
+ goto copy;
+ }
+
cb = qed_aio_write_zero_cluster;
}
} else {
+copy:
cb = qed_aio_write_prefill;
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
}
--
1.7.11.2
next prev parent reply other threads:[~2012-08-28 14:23 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-08-28 13:04 [Qemu-devel] [PATCH for-1.2] qed: refuse unaligned zero writes with a backing file Stefan Hajnoczi
2012-08-28 13:25 ` Paolo Bonzini
2012-08-28 13:37 ` Stefan Hajnoczi
2012-08-28 14:23 ` Paolo Bonzini [this message]
2012-08-29 7:53 ` Stefan Hajnoczi
2012-08-29 9:09 ` Paolo Bonzini
2012-08-28 13:38 ` Kevin Wolf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=503CD463.80009@redhat.com \
--to=pbonzini@redhat.com \
--cc=aliguori@us.ibm.com \
--cc=kwolf@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@gmail.com \
--cc=stefanha@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.