qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com, pl@kamp.de, stefanha@redhat.com
Subject: [Qemu-devel] [PATCH v2 16/20] raw-posix: add support for write_zeroes on XFS and block devices
Date: Tue, 19 Nov 2013 18:07:39 +0100	[thread overview]
Message-ID: <1384880863-10434-17-git-send-email-pbonzini@redhat.com> (raw)
In-Reply-To: <1384880863-10434-1-git-send-email-pbonzini@redhat.com>

The code is similar to the implementation of discard and write_zeroes
with UNMAP.  However, failure must be propagated up to block.c.

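BLKZEROOUT, like BLKDISCARD, leaves stale data in the page cache on
Linux, so it is only used when the image is opened with O_DIRECT
(cache=none).  The stale page cache problem can be reproduced as follows: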

    # modprobe scsi-debug lbpws=1 lbprz=1
    # ./qemu-io /dev/sdXX
    qemu-io> write -P 0xcc 0 2M
    qemu-io> write -z 0 1M
    qemu-io> read -P 0x00 0 512
    Pattern verification failed at offset 0, 512 bytes
    qemu-io> read -v 0 512
    00000000:  cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc  ................
    ...

    # ./qemu-io --cache=none /dev/sdXX
    qemu-io> write -P 0xcc 0 2M
    qemu-io> write -z 0 1M
    qemu-io> read -P 0x00 0 512
    qemu-io> read -v 0 512
    00000000:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    ...

The same problem occurs with discard instead of "write -z".

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/raw-aio.h   |  3 +-
 block/raw-posix.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/block/raw-aio.h b/block/raw-aio.h
index c61f159..7ad0a8a 100644
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -21,9 +21,10 @@
 #define QEMU_AIO_IOCTL        0x0004
 #define QEMU_AIO_FLUSH        0x0008
 #define QEMU_AIO_DISCARD      0x0010
+#define QEMU_AIO_WRITE_ZEROES 0x0020
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
-         QEMU_AIO_DISCARD)
+         QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 928ac70..1b43a4e 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -142,6 +142,7 @@ typedef struct BDRVRawState {
     bool is_xfs:1;
 #endif
     bool has_discard:1;
+    bool has_write_zeroes:1;
     bool discard_zeroes:1;
 } BDRVRawState;
 
@@ -327,6 +328,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
 #endif
 
     s->has_discard = true;
+    s->has_write_zeroes = true;
 
     if (fstat(s->fd, &st) < 0) {
         error_setg_errno(errp, errno, "Could not stat file");
@@ -345,9 +347,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
 #ifdef __linux__
         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
          * not rely on the contents of discarded blocks unless using O_DIRECT.
+         * Same for BLKZEROOUT.
          */
         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
             s->discard_zeroes = false;
+            s->has_write_zeroes = false;
         }
 #endif
     }
@@ -703,6 +707,23 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
 }
 
 #ifdef CONFIG_XFS
+static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
+{
+    struct xfs_flock64 fl;
+
+    memset(&fl, 0, sizeof(fl));
+    fl.l_whence = SEEK_SET;
+    fl.l_start = offset;
+    fl.l_len = bytes;
+
+    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
+        DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno));
+        return -errno;
+    }
+
+    return 0;
+}
+
 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 {
     struct xfs_flock64 fl;
@@ -721,6 +742,42 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 }
 #endif
 
+static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+{
+    int ret = -EOPNOTSUPP;
+    BDRVRawState *s = aiocb->bs->opaque;
+
+    if (s->has_write_zeroes == 0) {
+        return -ENOTSUP;
+    }
+
+    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
+#ifdef BLKZEROOUT
+        do {
+            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+                return 0;
+            }
+        } while (errno == EINTR);
+
+        ret = -errno;
+#endif
+    } else {
+#ifdef CONFIG_XFS
+        if (s->is_xfs) {
+            return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
+        }
+#endif
+    }
+
+    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
+        ret == -ENOTTY) {
+        s->has_write_zeroes = false;
+        ret = -ENOTSUP;
+    }
+    return ret;
+}
+
 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 {
     int ret = -EOPNOTSUPP;
@@ -805,6 +862,9 @@ static int aio_worker(void *arg)
     case QEMU_AIO_DISCARD:
         ret = handle_aiocb_discard(aiocb);
         break;
+    case QEMU_AIO_WRITE_ZEROES:
+        ret = handle_aiocb_write_zeroes(aiocb);
+        break;
     default:
         fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
         ret = -EINVAL;
@@ -1257,13 +1317,13 @@ static int coroutine_fn raw_co_write_zeroes(
     BDRVRawState *s = bs->opaque;
 
     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-        return -ENOTSUP;
-    }
-    if (!s->discard_zeroes) {
-        return -ENOTSUP;
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_WRITE_ZEROES);
+    } else if (s->discard_zeroes) {
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_DISCARD);
     }
-    return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
-                          QEMU_AIO_DISCARD);
+    return -ENOTSUP;
 }
 
 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -1614,13 +1674,13 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
         return rc;
     }
     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-        return -ENOTSUP;
-    }
-    if (!s->discard_zeroes) {
-        return -ENOTSUP;
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
+    } else if (s->discard_zeroes) {
+        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
     }
-    return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
-                          QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
+    return -ENOTSUP;
 }
 
 static int hdev_create(const char *filename, QEMUOptionParameter *options,
-- 
1.8.4.2

  parent reply	other threads:[~2013-11-19 17:08 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-19 17:07 [Qemu-devel] [PATCH v2 00/20] block & scsi: write_zeroes support through the whole stack Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 01/20] block: generalize BlockLimits handling to cover bdrv_aio_discard too Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 02/20] block: add flags to BlockRequest Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 03/20] block: add flags argument to bdrv_co_write_zeroes tracepoint Paolo Bonzini
2013-11-20  9:59   ` Stefan Hajnoczi
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 04/20] block: add bdrv_aio_write_zeroes Paolo Bonzini
2013-11-20 10:02   ` Stefan Hajnoczi
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 05/20] block: handle ENOTSUP from discard in generic code Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 06/20] block: make bdrv_co_do_write_zeroes stricter in producing aligned requests Paolo Bonzini
2013-11-20 10:22   ` Stefan Hajnoczi
2013-11-20 11:01     ` Paolo Bonzini
2013-11-20 14:29       ` Stefan Hajnoczi
2013-11-21 11:30   ` Peter Lieven
2013-11-21 11:37     ` Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 07/20] vpc, vhdx: add get_info Paolo Bonzini
2013-11-20 12:39   ` Stefan Hajnoczi
2013-11-20 12:50     ` Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 08/20] block drivers: add discard/write_zeroes properties to bdrv_get_info implementation Paolo Bonzini
2013-11-21 11:33   ` Peter Lieven
2013-11-21 11:39     ` Paolo Bonzini
2013-11-21 11:48       ` Peter Lieven
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 09/20] block drivers: expose requirement for write same alignment from formats Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 10/20] block/iscsi: remove .bdrv_has_zero_init Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 11/20] block/iscsi: updated copyright Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 12/20] block/iscsi: check WRITE SAME support differently depending on MAY_UNMAP Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 13/20] block/iscsi: use UNMAP to write zeroes if LBPRZ=1 Paolo Bonzini
2013-11-21 11:43   ` Peter Lieven
2013-11-21 11:49     ` Paolo Bonzini
2013-11-21 11:54       ` Peter Lieven
2013-11-21 12:05         ` Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 14/20] raw-posix: implement write_zeroes with MAY_UNMAP for files Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 15/20] raw-posix: implement write_zeroes with MAY_UNMAP for block devices Paolo Bonzini
2013-11-19 17:07 ` Paolo Bonzini [this message]
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 17/20] qemu-iotests: 033 is fast Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 18/20] scsi-disk: catch write protection errors in UNMAP Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 19/20] scsi-disk: reject ANCHOR=1 for UNMAP and WRITE SAME commands Paolo Bonzini
2013-11-19 17:07 ` [Qemu-devel] [PATCH v2 20/20] scsi-disk: correctly implement WRITE SAME Paolo Bonzini
2013-11-19 17:23   ` ronnie sahlberg
2013-11-19 17:27     ` ronnie sahlberg
2013-11-19 17:31     ` Paolo Bonzini
2013-11-20 14:18   ` Stefan Hajnoczi
2013-11-20 14:19     ` Paolo Bonzini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1384880863-10434-17-git-send-email-pbonzini@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=kwolf@redhat.com \
    --cc=pl@kamp.de \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).