* [Qemu-devel] barriers: block layer preparations
From: Christoph Hellwig @ 2009-05-05 12:08 UTC
To: qemu-devel
Add a flag to BlockDriverState to advertise barrier support, and add a flags
argument to bdrv_aio_readv/writev to allow passing down the barrier flag.
Note that the flags argument to bdrv_aio_readv is for now actually superfluous
because write barriers obviously only apply to writes. I kept the read-side
argument so the API is symmetric and we can easily add more flags to both
of them.
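To illustrate the new API (this is not part of the patch; the device-model
conversion comes later in the series, and the helper name below is made up),
a caller that wants ordering guarantees would check the advertised capability
and pass the new flag on the write path roughly like this:

/* hypothetical call site, for illustration only */
static void submit_ordered_write(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    unsigned flags = bs->barrier_support ? BDRV_IO_BARRIER : 0;
    BlockDriverAIOCB *acb;

    /* reads take the same trailing flags argument, but pass 0 for now */
    acb = bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, flags);
    if (acb == NULL)
        cb(opaque, -EIO);    /* submission failed, complete with an error */
}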
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: qemu/block-qcow.c
===================================================================
--- qemu.orig/block-qcow.c 2009-04-26 16:38:30.509074899 +0200
+++ qemu/block-qcow.c 2009-05-05 11:29:11.545784130 +0200
@@ -587,7 +587,7 @@ static void qcow_aio_read_cb(void *opaqu
acb->hd_iov.iov_len = acb->n * 512;
qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
- &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+ &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb, 0);
if (acb->hd_aiocb == NULL)
goto done;
} else {
@@ -612,7 +612,7 @@ static void qcow_aio_read_cb(void *opaqu
qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
acb->hd_aiocb = bdrv_aio_readv(s->hd,
(acb->cluster_offset >> 9) + index_in_cluster,
- &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+ &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb, 0);
if (acb->hd_aiocb == NULL)
goto done;
}
@@ -630,7 +630,7 @@ done:
static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
QCowAIOCB *acb;
@@ -708,7 +708,7 @@ static void qcow_aio_write_cb(void *opaq
acb->hd_aiocb = bdrv_aio_writev(s->hd,
(cluster_offset >> 9) + index_in_cluster,
&acb->hd_qiov, acb->n,
- qcow_aio_write_cb, acb);
+ qcow_aio_write_cb, acb, 0);
if (acb->hd_aiocb == NULL)
goto done;
return;
@@ -722,7 +722,7 @@ done:
static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
BDRVQcowState *s = bs->opaque;
QCowAIOCB *acb;
Index: qemu/block-qcow2.c
===================================================================
--- qemu.orig/block-qcow2.c 2009-04-26 16:47:40.682949219 +0200
+++ qemu/block-qcow2.c 2009-05-05 11:29:11.546784118 +0200
@@ -1349,7 +1349,7 @@ static void qcow_aio_read_cb(void *opaqu
qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
&acb->hd_qiov, acb->n,
- qcow_aio_read_cb, acb);
+ qcow_aio_read_cb, acb, 0);
if (acb->hd_aiocb == NULL)
goto done;
} else {
@@ -1384,7 +1384,7 @@ static void qcow_aio_read_cb(void *opaqu
qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
acb->hd_aiocb = bdrv_aio_readv(s->hd,
(acb->cluster_offset >> 9) + index_in_cluster,
- &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+ &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb, 0);
if (acb->hd_aiocb == NULL)
goto done;
}
@@ -1427,7 +1427,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDr
static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
QCowAIOCB *acb;
@@ -1498,7 +1498,7 @@ static void qcow_aio_write_cb(void *opaq
acb->hd_aiocb = bdrv_aio_writev(s->hd,
(acb->cluster_offset >> 9) + index_in_cluster,
&acb->hd_qiov, acb->n,
- qcow_aio_write_cb, acb);
+ qcow_aio_write_cb, acb, 0);
if (acb->hd_aiocb == NULL)
goto done;
@@ -1513,7 +1513,7 @@ done:
static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
BDRVQcowState *s = bs->opaque;
QCowAIOCB *acb;
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c 2009-04-26 16:38:30.513074989 +0200
+++ qemu/block.c 2009-05-05 11:29:11.547817490 +0200
@@ -59,10 +59,10 @@ typedef struct BlockDriverAIOCBSync {
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb);
static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
@@ -1309,7 +1309,8 @@ char *bdrv_snapshot_dump(char *buf, int
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque,
+ unsigned flags)
{
BlockDriver *drv = bs->drv;
BlockDriverAIOCB *ret;
@@ -1320,7 +1321,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
return NULL;
ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
- cb, opaque);
+ cb, opaque, flags);
if (ret) {
/* Update stats even though technically transfer has not happened. */
@@ -1333,7 +1334,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque,
+ unsigned flags)
{
BlockDriver *drv = bs->drv;
BlockDriverAIOCB *ret;
@@ -1346,7 +1348,7 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
return NULL;
ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
- cb, opaque);
+ cb, opaque, flags);
if (ret) {
/* Update stats even though technically transfer has not happened. */
@@ -1411,14 +1413,14 @@ static BlockDriverAIOCB *bdrv_aio_rw_vec
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
@@ -1454,7 +1456,7 @@ static int bdrv_read_em(BlockDriverState
iov.iov_len = nb_sectors * 512;
qemu_iovec_init_external(&qiov, &iov, 1);
acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
- bdrv_rw_em_cb, &async_ret);
+ bdrv_rw_em_cb, &async_ret, 0);
if (acb == NULL)
return -1;
@@ -1478,7 +1480,7 @@ static int bdrv_write_em(BlockDriverStat
iov.iov_len = nb_sectors * 512;
qemu_iovec_init_external(&qiov, &iov, 1);
acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
- bdrv_rw_em_cb, &async_ret);
+ bdrv_rw_em_cb, &async_ret, 0);
if (acb == NULL)
return -1;
while (async_ret == NOT_DONE) {
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h 2009-04-26 16:38:30.513074989 +0200
+++ qemu/block.h 2009-05-05 11:29:11.549784011 +0200
@@ -54,6 +54,12 @@ typedef struct QEMUSnapshotInfo {
#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_CACHE_DEF)
+
+/*
+ * Flags for bdrv_aio_readv/bdrv_aio_writev.
+ */
+#define BDRV_IO_BARRIER 0x0001
+
void bdrv_info(Monitor *mon);
void bdrv_info_stats(Monitor *mon);
@@ -93,10 +99,12 @@ typedef void BlockDriverCompletionFunc(v
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverCompletionFunc *cb, void *opaque,
+ unsigned flags);
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverCompletionFunc *cb, void *opaque,
+ unsigned flags);
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
/* sg packet commands */
Index: qemu/dma-helpers.c
===================================================================
--- qemu.orig/dma-helpers.c 2009-04-26 16:38:30.531074417 +0200
+++ qemu/dma-helpers.c 2009-05-05 11:29:11.550836309 +0200
@@ -120,10 +120,10 @@ static void dma_bdrv_cb(void *opaque, in
if (dbs->is_write) {
dbs->acb = bdrv_aio_writev(dbs->bs, dbs->sector_num, &dbs->iov,
- dbs->iov.size / 512, dma_bdrv_cb, dbs);
+ dbs->iov.size / 512, dma_bdrv_cb, dbs, 0);
} else {
dbs->acb = bdrv_aio_readv(dbs->bs, dbs->sector_num, &dbs->iov,
- dbs->iov.size / 512, dma_bdrv_cb, dbs);
+ dbs->iov.size / 512, dma_bdrv_cb, dbs, 0);
}
if (!dbs->acb) {
dma_bdrv_unmap(dbs);
Index: qemu/hw/ide.c
===================================================================
--- qemu.orig/hw/ide.c 2009-05-04 17:02:14.367783478 +0200
+++ qemu/hw/ide.c 2009-05-05 11:29:11.552783415 +0200
@@ -1473,7 +1473,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
bm->iov.iov_len = n * 4 * 512;
qemu_iovec_init_external(&bm->qiov, &bm->iov, 1);
bm->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2, &bm->qiov,
- n * 4, ide_atapi_cmd_read_dma_cb, bm);
+ n * 4, ide_atapi_cmd_read_dma_cb, bm, 0);
if (!bm->aiocb) {
/* Note: media not present is the most likely case */
ide_atapi_cmd_error(s, SENSE_NOT_READY,
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c 2009-04-26 16:38:30.592081973 +0200
+++ qemu/hw/scsi-disk.c 2009-05-05 11:29:11.554836260 +0200
@@ -210,7 +210,7 @@ static void scsi_read_data(SCSIDevice *d
r->iov.iov_len = n * 512;
qemu_iovec_init_external(&r->qiov, &r->iov, 1);
r->aiocb = bdrv_aio_readv(s->bdrv, r->sector, &r->qiov, n,
- scsi_read_complete, r);
+ scsi_read_complete, r, 0);
if (r->aiocb == NULL)
scsi_command_complete(r, STATUS_CHECK_CONDITION, SENSE_HARDWARE_ERROR);
r->sector += n;
@@ -275,7 +275,7 @@ static void scsi_write_request(SCSIReque
if (n) {
qemu_iovec_init_external(&r->qiov, &r->iov, 1);
r->aiocb = bdrv_aio_writev(s->bdrv, r->sector, &r->qiov, n,
- scsi_write_complete, r);
+ scsi_write_complete, r, 0);
if (r->aiocb == NULL)
scsi_command_complete(r, STATUS_CHECK_CONDITION,
SENSE_HARDWARE_ERROR);
Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c 2009-05-04 17:02:14.374784160 +0200
+++ qemu/hw/virtio-blk.c 2009-05-05 13:21:24.196784347 +0200
@@ -212,13 +212,13 @@ static void virtio_blk_handle_scsi(VirtI
static void virtio_blk_handle_write(VirtIOBlockReq *req)
{
bdrv_aio_writev(req->dev->bs, req->out->sector, &req->qiov,
- req->qiov.size / 512, virtio_blk_rw_complete, req);
+ req->qiov.size / 512, virtio_blk_rw_complete, req, 0);
}
static void virtio_blk_handle_read(VirtIOBlockReq *req)
{
bdrv_aio_readv(req->dev->bs, req->out->sector, &req->qiov,
- req->qiov.size / 512, virtio_blk_rw_complete, req);
+ req->qiov.size / 512, virtio_blk_rw_complete, req, 0);
}
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
Index: qemu/hw/xen_disk.c
===================================================================
--- qemu.orig/hw/xen_disk.c 2009-04-26 16:38:30.612075090 +0200
+++ qemu/hw/xen_disk.c 2009-05-05 11:29:11.556785880 +0200
@@ -397,14 +397,14 @@ static int ioreq_runio_qemu_aio(struct i
ioreq->aio_inflight++;
bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
&ioreq->v, ioreq->v.size / BLOCK_SIZE,
- qemu_aio_complete, ioreq);
+ qemu_aio_complete, ioreq, 0);
break;
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
ioreq->aio_inflight++;
bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
&ioreq->v, ioreq->v.size / BLOCK_SIZE,
- qemu_aio_complete, ioreq);
+ qemu_aio_complete, ioreq, 0);
break;
default:
/* unknown operation (shouldn't happen -- parse catches this) */
Index: qemu/qemu-io.c
===================================================================
--- qemu.orig/qemu-io.c 2009-05-04 17:02:14.381783935 +0200
+++ qemu/qemu-io.c 2009-05-05 11:31:57.159659916 +0200
@@ -148,7 +148,7 @@ static int do_aio_readv(QEMUIOVector *qi
int async_ret = NOT_DONE;
acb = bdrv_aio_readv(bs, offset >> 9, qiov, qiov->size >> 9,
- aio_rw_done, &async_ret);
+ aio_rw_done, &async_ret, 0);
if (!acb)
return -EIO;
@@ -165,7 +165,7 @@ static int do_aio_writev(QEMUIOVector *q
int async_ret = NOT_DONE;
acb = bdrv_aio_writev(bs, offset >> 9, qiov, qiov->size >> 9,
- aio_rw_done, &async_ret);
+ aio_rw_done, &async_ret, 0);
if (!acb)
return -EIO;
Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c 2009-04-28 11:42:09.257949541 +0200
+++ qemu/block-raw-posix.c 2009-05-05 13:21:23.136784385 +0200
@@ -654,7 +654,7 @@ static void raw_aio_remove(RawAIOCB *acb
static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
RawAIOCB *acb;
@@ -670,7 +670,7 @@ static BlockDriverAIOCB *raw_aio_readv(B
static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
{
RawAIOCB *acb;
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h 2009-04-26 16:38:30.513074989 +0200
+++ qemu/block_int.h 2009-05-05 13:23:37.599695377 +0200
@@ -56,10 +56,10 @@ struct BlockDriver {
/* aio */
BlockDriverAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
int aiocb_size;
@@ -117,6 +117,7 @@ struct BlockDriverState {
int encrypted; /* if true, the media is encrypted */
int valid_key; /* if true, a valid encryption key has been set */
int sg; /* if true, the device is a /dev/sg* */
+ int barrier_support; /* if true, the device supports barrier requests */
/* event callback when inserting/removing */
void (*change_cb)(void *opaque);
void *change_opaque;
* [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support
From: Christoph Hellwig @ 2009-05-05 12:08 UTC
To: qemu-devel
Add support for write barriers to the posix raw file / block device code.
The guts of this are in the aio emulation, as that's where we handle our queue
of outstanding requests.
The high-level design is the following (a condensed sketch follows the list):
- As soon as a barrier request is submitted via qemu_paio_submit we increment
the barrier_inprogress count to signal that we now have to deal with barriers.
- From that point on every new request that is queued up by qemu_paio_submit
does not go onto the normal request list but onto a secondary post-barrier
queue.
- Once the barrier request is dequeued by an aio_thread, that thread waits for
all other outstanding requests to finish, then issues an fdatasync, the actual
barrier request, and another fdatasync to prevent reordering in the pagecache.
After the request is finished the barrier_inprogress counter is decremented,
the post-barrier list is spliced back onto the main request list up to and
including the next barrier request if there is one, and normal operation
is resumed.
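In condensed form, the two paths described above look like this (simplified
from the diff below; the locking, idle_threads bookkeeping and thread wakeups
are omitted):

/* submission side (qemu_paio_submit): divert requests while a barrier is pending */
if (barrier_inprogress) {
    aiocb->aio_flags |= QEMU_AIO_POST_BARRIER;
    TAILQ_INSERT_TAIL(&post_barrier_list, aiocb, node);
} else {
    TAILQ_INSERT_TAIL(&request_list, aiocb, node);
}
if (aiocb->aio_flags & QEMU_AIO_BARRIER)
    barrier_inprogress++;

/* worker side (aio_thread): drain in-flight I/O, sync, write, sync, requeue */
if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
    while (idle_threads != cur_threads)   /* wait for all other requests */
        cond_wait(&idle_wait, &lock);
    fdatasync(aiocb->aio_fildes);         /* flush writes queued before the barrier */
    ret = handle_aiocb_rw(aiocb);         /* the barrier request itself */
    fdatasync(aiocb->aio_fildes);         /* keep later writes behind it */
    barrier_inprogress--;
    requeue_request_list();               /* splice post-barrier requests back */
} else {
    ret = handle_aiocb_rw(aiocb);
}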
That means barriers imply a quite massive serialization of the I/O submission
path, which unfortunately is not avoidable given their semantics. I will
mitigate this for setups with multiple virtual storage devices with a patch
that makes the aio state per-device in the near future.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: qemu/posix-aio-compat.c
===================================================================
--- qemu.orig/posix-aio-compat.c 2009-05-05 13:35:09.115784239 +0200
+++ qemu/posix-aio-compat.c 2009-05-05 13:47:38.625659276 +0200
@@ -17,6 +17,7 @@
#include <errno.h>
#include <time.h>
#include <string.h>
+#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include "osdep.h"
@@ -31,8 +32,19 @@ static pthread_attr_t attr;
static int max_threads = 64;
static int cur_threads = 0;
static int idle_threads = 0;
+
+/* number of barriers currently handled */
+static int barrier_inprogress = 0;
+
+/* normal list of all requests waiting for execution */
static TAILQ_HEAD(, qemu_paiocb) request_list;
+/* list of all requests issued after a barrier request */
+static TAILQ_HEAD(, qemu_paiocb) post_barrier_list;
+
+/* wait for all I/O threads to be idle before issuing a barrier request */
+static pthread_cond_t idle_wait = PTHREAD_COND_INITIALIZER;
+
#ifdef HAVE_PREADV
static int preadv_present = 1;
#else
@@ -62,6 +74,13 @@ static void mutex_unlock(pthread_mutex_t
if (ret) die2(ret, "pthread_mutex_unlock");
}
+static int cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+ int ret = pthread_cond_wait(cond, mutex);
+ if (ret) die2(ret, "pthread_cond_wait");
+ return ret;
+}
+
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
struct timespec *ts)
{
@@ -264,6 +283,22 @@ static size_t handle_aiocb_rw(struct qem
return nbytes;
}
+static void requeue_request_list(void)
+{
+ struct qemu_paiocb *cb, *next;
+
+ TAILQ_FOREACH_SAFE(cb, &post_barrier_list, node, next) {
+ TAILQ_REMOVE(&post_barrier_list, cb, node);
+ TAILQ_INSERT_TAIL(&request_list, cb, node);
+
+ /*
+ * Stop after the first barrier request.
+ */
+ if (cb->aio_flags & QEMU_AIO_BARRIER)
+ break;
+ }
+}
+
static void *aio_thread(void *unused)
{
pid_t pid;
@@ -280,6 +315,8 @@ static void *aio_thread(void *unused)
size_t ret = 0;
qemu_timeval tv;
struct timespec ts;
+ bool wakeup_threads = false;
+ bool wakeup_idle = false;
qemu_gettimeofday(&tv);
ts.tv_sec = tv.tv_sec + 10;
@@ -297,6 +334,16 @@ static void *aio_thread(void *unused)
aiocb = TAILQ_FIRST(&request_list);
TAILQ_REMOVE(&request_list, aiocb, node);
+
+ /*
+ * We've got a barrier request. Make sure all previous requests
+ * are completed before we issue it.
+ */
+ if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
+ while (idle_threads != cur_threads)
+ cond_wait(&idle_wait, &lock);
+ }
+
aiocb->active = 1;
idle_threads--;
mutex_unlock(&lock);
@@ -304,7 +351,13 @@ static void *aio_thread(void *unused)
switch (aiocb->aio_type) {
case QEMU_PAIO_READ:
case QEMU_PAIO_WRITE:
- ret = handle_aiocb_rw(aiocb);
+ if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
+ fdatasync(aiocb->aio_fildes);
+ ret = handle_aiocb_rw(aiocb);
+ fdatasync(aiocb->aio_fildes);
+ } else {
+ ret = handle_aiocb_rw(aiocb);
+ }
break;
case QEMU_PAIO_IOCTL:
ret = handle_aiocb_ioctl(aiocb);
@@ -317,9 +370,32 @@ static void *aio_thread(void *unused)
mutex_lock(&lock);
aiocb->ret = ret;
- idle_threads++;
+
+ if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
+ barrier_inprogress--;
+ if (!TAILQ_EMPTY(&request_list))
+ die2(ret, "request list not empty");
+
+ if (!TAILQ_EMPTY(&post_barrier_list)) {
+ requeue_request_list();
+ wakeup_threads = true;
+ }
+ }
+
+ /* wake up barrier thread when all threads are idle */
+ if (++idle_threads == cur_threads && barrier_inprogress)
+ wakeup_idle = true;
mutex_unlock(&lock);
+ /*
+ * If any new requests were queued up on the post_barrier_list wake up
+ * I/O threads now.
+ */
+ if (wakeup_threads)
+ cond_signal(&cond);
+ if (wakeup_idle)
+ cond_signal(&idle_wait);
+
if (kill(pid, aiocb->ev_signo)) die("kill failed");
}
@@ -348,6 +424,7 @@ int qemu_paio_init(struct qemu_paioinit
if (ret) die2(ret, "pthread_attr_setdetachstate");
TAILQ_INIT(&request_list);
+ TAILQ_INIT(&post_barrier_list);
return 0;
}
@@ -357,10 +434,21 @@ static int qemu_paio_submit(struct qemu_
aiocb->aio_type = type;
aiocb->ret = -EINPROGRESS;
aiocb->active = 0;
+
mutex_lock(&lock);
if (idle_threads == 0 && cur_threads < max_threads)
spawn_thread();
- TAILQ_INSERT_TAIL(&request_list, aiocb, node);
+
+ if (barrier_inprogress) {
+ aiocb->aio_flags |= QEMU_AIO_POST_BARRIER;
+ TAILQ_INSERT_TAIL(&post_barrier_list, aiocb, node);
+ } else {
+ TAILQ_INSERT_TAIL(&request_list, aiocb, node);
+ }
+
+ if (aiocb->aio_flags & QEMU_AIO_BARRIER)
+ barrier_inprogress++;
+
mutex_unlock(&lock);
cond_signal(&cond);
@@ -411,13 +499,17 @@ int qemu_paio_cancel(int fd, struct qemu
mutex_lock(&lock);
if (!aiocb->active) {
- TAILQ_REMOVE(&request_list, aiocb, node);
+ if (aiocb->aio_flags & QEMU_AIO_POST_BARRIER)
+ TAILQ_REMOVE(&post_barrier_list, aiocb, node);
+ else
+ TAILQ_REMOVE(&request_list, aiocb, node);
aiocb->ret = -ECANCELED;
ret = QEMU_PAIO_CANCELED;
- } else if (aiocb->ret == -EINPROGRESS)
+ } else if (aiocb->ret == -EINPROGRESS) {
ret = QEMU_PAIO_NOTCANCELED;
- else
+ } else {
ret = QEMU_PAIO_ALLDONE;
+ }
mutex_unlock(&lock);
return ret;
Index: qemu/posix-aio-compat.h
===================================================================
--- qemu.orig/posix-aio-compat.h 2009-05-05 13:35:09.160784863 +0200
+++ qemu/posix-aio-compat.h 2009-05-05 13:45:54.312668406 +0200
@@ -39,6 +39,11 @@ struct qemu_paiocb
unsigned aio_flags;
/* 512 byte alignment required for buffer, offset and length */
#define QEMU_AIO_SECTOR_ALIGNED 0x01
+/* Barrier request, must not re-order */
+#define QEMU_AIO_BARRIER 0x02
+
+/* Internal flag, is in the post-barrier queue */
+#define QEMU_AIO_POST_BARRIER 0x80
/* private */
TAILQ_ENTRY(qemu_paiocb) node;
Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c 2009-05-05 13:43:21.431811845 +0200
+++ qemu/block-raw-posix.c 2009-05-05 13:43:21.897783237 +0200
@@ -172,6 +172,14 @@ static int raw_open(BlockDriverState *bs
return ret;
}
}
+
+ /*
+ * If the open mode allows caching writes in the file cache, advertise
+ * barrier support so that the guest can control the cache behaviour.
+ */
+ if (!(open_flags & (O_DIRECT|O_DSYNC)))
+ bs->barrier_support = 1;
+
return 0;
}
@@ -600,8 +608,8 @@ static int posix_aio_init(void)
}
static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+ QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb,
+ void *opaque, unsigned flags)
{
BDRVRawState *s = bs->opaque;
RawAIOCB *acb;
@@ -627,6 +635,8 @@ static RawAIOCB *raw_aio_setup(BlockDriv
*/
if (s->aligned_buf)
acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED;
+ if (flags & BDRV_IO_BARRIER)
+ acb->aiocb.aio_flags |= QEMU_AIO_BARRIER;
acb->next = posix_aio_state->first_aio;
posix_aio_state->first_aio = acb;
@@ -658,7 +668,7 @@ static BlockDriverAIOCB *raw_aio_readv(B
{
RawAIOCB *acb;
- acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
+ acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, flags);
if (!acb)
return NULL;
if (qemu_paio_read(&acb->aiocb) < 0) {
@@ -674,7 +684,7 @@ static BlockDriverAIOCB *raw_aio_writev(
{
RawAIOCB *acb;
- acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
+ acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, flags);
if (!acb)
return NULL;
if (qemu_paio_write(&acb->aiocb) < 0) {
@@ -1022,6 +1032,14 @@ static int hdev_open(BlockDriverState *b
s->fd_media_changed = 1;
}
#endif
+
+ /*
+ * If the open mode allows caching writes in the file cache, advertise
+ * barrier support so that the guest can control the cache behaviour.
+ */
+ if (!(open_flags & (O_DIRECT|O_DSYNC)))
+ bs->barrier_support = 1;
+
return 0;
}