* [Qemu-devel] [PATCH 1/5] block: add enable_write_cache flag
2009-09-04 17:00 [Qemu-devel] [PATCH 0/5] data integrity fixes V2 Christoph Hellwig
@ 2009-09-04 17:01 ` Christoph Hellwig
2009-09-04 17:01 ` [Qemu-devel] [PATCH 2/5] block: use fdatasync instead of fsync if possible Christoph Hellwig
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2009-09-04 17:01 UTC (permalink / raw)
To: qemu-devel
Add an enable_write_cache flag in the block driver state, and use it to
decide if we claim to have a volatile write cache that needs controlled
flushing from the guest. The flag is off if cache=writethrough is
defined because O_DSYNC guarantees that every write goes to stable
storage, and it is on for cache=none and cache=writeback.
Both scsi-disk and ide now use the new flag, changing from their
defaults of always off (ide) or always on (scsi-disk).
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c 2009-09-04 13:26:01.258522374 -0300
+++ qemu/hw/scsi-disk.c 2009-09-04 13:26:14.318522389 -0300
@@ -710,7 +710,9 @@ static int32_t scsi_send_command(SCSIDev
memset(p,0,20);
p[0] = 8;
p[1] = 0x12;
- p[2] = 4; /* WCE */
+ if (bdrv_enable_write_cache(s->bdrv)) {
+ p[2] = 4; /* WCE */
+ }
p += 20;
}
if ((page == 0x3f || page == 0x2a)
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c 2009-09-04 13:26:01.270522293 -0300
+++ qemu/block.c 2009-09-04 13:43:43.898522158 -0300
@@ -408,6 +408,16 @@ int bdrv_open2(BlockDriverState *bs, con
}
bs->drv = drv;
bs->opaque = qemu_mallocz(drv->instance_size);
+
+ /*
+ * Yes, BDRV_O_NOCACHE aka O_DIRECT means we have to present a
+ * write cache to the guest. We do need the fdatasync to flush
+ * out transactions for block allocations, and we may have a
+ * volatile write cache in our backing device to deal with.
+ */
+ if (flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))
+ bs->enable_write_cache = 1;
+
/* Note: for compatibility, we open disk image files as RDWR, and
RDONLY as fallback */
if (!(flags & BDRV_O_FILE))
@@ -918,6 +928,11 @@ int bdrv_is_sg(BlockDriverState *bs)
return bs->sg;
}
+int bdrv_enable_write_cache(BlockDriverState *bs)
+{
+ return bs->enable_write_cache;
+}
+
/* XXX: no longer used */
void bdrv_set_change_cb(BlockDriverState *bs,
void (*change_cb)(void *opaque), void *opaque)
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h 2009-09-04 13:26:01.310521931 -0300
+++ qemu/block_int.h 2009-09-04 13:26:14.318522389 -0300
@@ -152,6 +152,9 @@ struct BlockDriverState {
/* the memory alignment required for the buffers handled by this driver */
int buffer_alignment;
+ /* do we need to tell the guest if we have a volatile write cache? */
+ int enable_write_cache;
+
/* NOTE: the following infos are only hints for real hardware
drivers. They are not used by the block driver */
int cyls, heads, secs, translation;
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h 2009-09-04 13:26:01.314522160 -0300
+++ qemu/block.h 2009-09-04 13:26:14.318522389 -0300
@@ -120,6 +120,7 @@ int bdrv_get_translation_hint(BlockDrive
int bdrv_is_removable(BlockDriverState *bs);
int bdrv_is_read_only(BlockDriverState *bs);
int bdrv_is_sg(BlockDriverState *bs);
+int bdrv_enable_write_cache(BlockDriverState *bs);
int bdrv_is_inserted(BlockDriverState *bs);
int bdrv_media_changed(BlockDriverState *bs);
int bdrv_is_locked(BlockDriverState *bs);
Index: qemu/hw/ide/core.c
===================================================================
--- qemu.orig/hw/ide/core.c 2009-09-04 13:26:01.266522204 -0300
+++ qemu/hw/ide/core.c 2009-09-04 13:32:31.046522461 -0300
@@ -148,8 +148,11 @@ static void ide_identify(IDEState *s)
put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
/* 14=set to 1, 1=SMART self test, 0=SMART error logging */
put_le16(p + 84, (1 << 14) | 0);
- /* 14 = NOP supported, 0=SMART feature set enabled */
- put_le16(p + 85, (1 << 14) | 1);
+ /* 14 = NOP supported, 5=WCACHE enabled, 0=SMART feature set enabled */
+ if (bdrv_enable_write_cache(s->bs))
+ put_le16(p + 85, (1 << 14) | (1 << 5) | 1);
+ else
+ put_le16(p + 85, (1 << 14) | 1);
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 86, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
/* 14=set to 1, 1=smart self test, 0=smart error logging */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Qemu-devel] [PATCH 3/5] block: add aio_flush operation
2009-09-04 17:00 [Qemu-devel] [PATCH 0/5] data integrity fixes V2 Christoph Hellwig
2009-09-04 17:01 ` [Qemu-devel] [PATCH 1/5] block: add enable_write_cache flag Christoph Hellwig
2009-09-04 17:01 ` [Qemu-devel] [PATCH 2/5] block: use fdatasync instead of fsync if possible Christoph Hellwig
@ 2009-09-04 17:01 ` Christoph Hellwig
2009-09-04 17:02 ` [Qemu-devel] [PATCH 4/5] ide: use bdrv_aio_flush Christoph Hellwig
2009-09-04 17:02 ` [Qemu-devel] [PATCH 5/5] virtio-blk: add volatile writecache feature Christoph Hellwig
4 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2009-09-04 17:01 UTC (permalink / raw)
To: qemu-devel
Instead of stalling the VCPU while serving a cache flush, try to do it
asynchronously. Use our good old helper thread pool to issue an
asynchronous fdatasync for raw-posix. Note that while Linux AIO
implements a fdatasync operation it is not useful for us because
it isn't actually implemented in an asynchronous fashion.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c 2009-09-04 13:43:56.290522252 -0300
+++ qemu/block.c 2009-09-04 13:49:44.430522596 -0300
@@ -54,6 +54,8 @@ static BlockDriverAIOCB *bdrv_aio_readv_
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
@@ -138,6 +140,10 @@ void bdrv_register(BlockDriver *bdrv)
bdrv->bdrv_read = bdrv_read_em;
bdrv->bdrv_write = bdrv_write_em;
}
+
+ if (!bdrv->bdrv_aio_flush)
+ bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
+
bdrv->next = first_drv;
first_drv = bdrv;
}
@@ -1369,6 +1375,21 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
return ret;
}
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv)
+ return NULL;
+
+ /*
+ * Note that unlike bdrv_flush the driver is responsible for flushing a
+ * backing image if it exists.
+ */
+ return drv->bdrv_aio_flush(bs, cb, opaque);
+}
+
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
acb->pool->cancel(acb);
@@ -1459,6 +1480,25 @@ static BlockDriverAIOCB *bdrv_aio_writev
return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriverAIOCBSync *acb;
+
+ acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
+ acb->is_write = 1; /* don't bounce in the completion handler */
+ acb->qiov = NULL;
+ acb->bounce = NULL;
+ acb->ret = 0;
+
+ if (!acb->bh)
+ acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+
+ bdrv_flush(bs);
+ qemu_bh_schedule(acb->bh);
+ return &acb->common;
+}
+
/**************************************************************/
/* sync block device emulation */
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h 2009-09-04 13:43:56.298521662 -0300
+++ qemu/block.h 2009-09-04 13:48:35.870522299 -0300
@@ -85,6 +85,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
/* sg packet commands */
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h 2009-09-04 13:43:56.298521662 -0300
+++ qemu/block_int.h 2009-09-04 13:48:35.874522877 -0300
@@ -69,6 +69,8 @@ struct BlockDriver {
BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
const char *protocol_name;
int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset);
Index: qemu/block/raw-posix-aio.h
===================================================================
--- qemu.orig/block/raw-posix-aio.h 2009-09-01 17:01:34.608937388 -0300
+++ qemu/block/raw-posix-aio.h 2009-09-04 13:48:35.874522877 -0300
@@ -17,8 +17,9 @@
#define QEMU_AIO_READ 0x0001
#define QEMU_AIO_WRITE 0x0002
#define QEMU_AIO_IOCTL 0x0004
+#define QEMU_AIO_FLUSH 0x0008
#define QEMU_AIO_TYPE_MASK \
- (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL)
+ (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH)
/* AIO flags */
#define QEMU_AIO_MISALIGNED 0x1000
Index: qemu/block/raw-posix.c
===================================================================
--- qemu.orig/block/raw-posix.c 2009-09-04 13:45:23.278522123 -0300
+++ qemu/block/raw-posix.c 2009-09-04 13:48:35.882521800 -0300
@@ -574,6 +574,18 @@ static BlockDriverAIOCB *raw_aio_writev(
cb, opaque, QEMU_AIO_WRITE);
}
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0)
+ return NULL;
+
+ return paio_submit(bs, s->aio_ctx, s->fd, 0, NULL, 0,
+ cb, opaque, QEMU_AIO_FLUSH);
+}
+
static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -749,6 +761,7 @@ static BlockDriver bdrv_raw = {
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -1002,6 +1015,7 @@ static BlockDriver bdrv_host_device = {
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_read = raw_read,
.bdrv_write = raw_write,
@@ -1096,6 +1110,7 @@ static BlockDriver bdrv_host_floppy = {
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_read = raw_read,
.bdrv_write = raw_write,
@@ -1176,6 +1191,7 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_read = raw_read,
.bdrv_write = raw_write,
@@ -1295,6 +1311,7 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_read = raw_read,
.bdrv_write = raw_write,
Index: qemu/posix-aio-compat.c
===================================================================
--- qemu.orig/posix-aio-compat.c 2009-09-01 17:01:34.624937328 -0300
+++ qemu/posix-aio-compat.c 2009-09-04 13:50:57.134522254 -0300
@@ -134,6 +134,16 @@ static size_t handle_aiocb_ioctl(struct
return aiocb->aio_nbytes;
}
+static size_t handle_aiocb_flush(struct qemu_paiocb *aiocb)
+{
+ int ret;
+
+ ret = fdatasync(aiocb->aio_fildes);
+ if (ret == -1)
+ return -errno;
+ return 0;
+}
+
#ifdef CONFIG_PREADV
static ssize_t
@@ -330,6 +340,9 @@ static void *aio_thread(void *unused)
case QEMU_AIO_WRITE:
ret = handle_aiocb_rw(aiocb);
break;
+ case QEMU_AIO_FLUSH:
+ ret = handle_aiocb_flush(aiocb);
+ break;
case QEMU_AIO_IOCTL:
ret = handle_aiocb_ioctl(aiocb);
break;
@@ -530,8 +543,10 @@ BlockDriverAIOCB *paio_submit(BlockDrive
acb->aio_type = type;
acb->aio_fildes = fd;
acb->ev_signo = SIGUSR2;
- acb->aio_iov = qiov->iov;
- acb->aio_niov = qiov->niov;
+ if (qiov) {
+ acb->aio_iov = qiov->iov;
+ acb->aio_niov = qiov->niov;
+ }
acb->aio_nbytes = nb_sectors * 512;
acb->aio_offset = sector_num * 512;
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Qemu-devel] [PATCH 5/5] virtio-blk: add volatile writecache feature
2009-09-04 17:00 [Qemu-devel] [PATCH 0/5] data integrity fixes V2 Christoph Hellwig
` (3 preceding siblings ...)
2009-09-04 17:02 ` [Qemu-devel] [PATCH 4/5] ide: use bdrv_aio_flush Christoph Hellwig
@ 2009-09-04 17:02 ` Christoph Hellwig
4 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2009-09-04 17:02 UTC (permalink / raw)
To: qemu-devel
Add a new VIRTIO_BLK_F_WCACHE feature to virtio-blk to indicate that we have
a volatile write cache that needs controlled flushing. Implement a
VIRTIO_BLK_T_FLUSH operation to flush it.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: qemu-kvm/hw/virtio-blk.c
===================================================================
--- qemu-kvm.orig/hw/virtio-blk.c
+++ qemu-kvm/hw/virtio-blk.c
@@ -129,6 +129,13 @@ static void virtio_blk_rw_complete(void
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
}
+static void virtio_blk_flush_complete(void *opaque, int ret)
+{
+ VirtIOBlockReq *req = opaque;
+
+ virtio_blk_req_complete(req, ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK);
+}
+
static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
{
VirtIOBlockReq *req = qemu_mallocz(sizeof(*req));
@@ -252,6 +259,16 @@ static void virtio_blk_handle_scsi(VirtI
}
#endif /* __linux__ */
+static void virtio_blk_handle_flush(VirtIOBlockReq *req)
+{
+ BlockDriverAIOCB *acb;
+
+ acb = bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
+ if (!acb) {
+ virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+ }
+}
+
static void virtio_blk_handle_write(VirtIOBlockReq *req)
{
BlockDriverAIOCB *acb;
@@ -294,7 +311,9 @@ static void virtio_blk_handle_output(Vir
req->out = (void *)req->elem.out_sg[0].iov_base;
req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;
- if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
+ if (req->out->type & VIRTIO_BLK_T_FLUSH) {
+ virtio_blk_handle_flush(req);
+ } else if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
virtio_blk_handle_scsi(req);
} else if (req->out->type & VIRTIO_BLK_T_OUT) {
qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
@@ -382,6 +401,9 @@ static uint32_t virtio_blk_get_features(
features |= (1 << VIRTIO_BLK_F_SEG_MAX);
features |= (1 << VIRTIO_BLK_F_GEOMETRY);
+
+ if (bdrv_enable_write_cache(s->bs))
+ features |= (1 << VIRTIO_BLK_F_WCACHE);
#ifdef __linux__
features |= (1 << VIRTIO_BLK_F_SCSI);
#endif
Index: qemu-kvm/hw/virtio-blk.h
===================================================================
--- qemu-kvm.orig/hw/virtio-blk.h
+++ qemu-kvm/hw/virtio-blk.h
@@ -31,6 +31,7 @@
#define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/
#define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */
#define VIRTIO_BLK_F_IDENTIFY 8 /* ATA IDENTIFY supported */
+#define VIRTIO_BLK_F_WCACHE 9 /* write cache enabled */
#define VIRTIO_BLK_ID_LEN 256 /* length of identify u16 array */
#define VIRTIO_BLK_ID_SN 10 /* start of char * serial# */
@@ -55,6 +56,9 @@ struct virtio_blk_config
/* This bit says it's a scsi command, not an actual read or write. */
#define VIRTIO_BLK_T_SCSI_CMD 2
+/* Flush the volatile write cache */
+#define VIRTIO_BLK_T_FLUSH 4
+
/* Barrier before this op. */
#define VIRTIO_BLK_T_BARRIER 0x80000000
^ permalink raw reply [flat|nested] 6+ messages in thread