public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH, RFC] virtio_blk: add cache flush command
@ 2009-05-11  8:39 Christoph Hellwig
  2009-05-11 14:51 ` Anthony Liguori
  2009-05-12 13:54 ` Rusty Russell
  0 siblings, 2 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-11  8:39 UTC (permalink / raw)
  To: Rusty Russell; +Cc: kvm

Currently virtio-blk does support barriers for ordering requests which
is enough to guarantee filesystem metadata integrity with write back
caches, but it does not support any way to flush that writeback cache,
to guarantee that data is stable on disk on a fsync.

This patch implements a new VIRTIO_BLK_T_FLUSH command to flush the
cache and exposes the functionality to the block layer by implementing
a prepare_flush method.

Do we need a new feature flag for this command or can we expect that
all previous barrier support was buggy enough anyway?


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfs/drivers/block/virtio_blk.c
===================================================================
--- xfs.orig/drivers/block/virtio_blk.c	2009-05-11 10:11:28.884784539 +0200
+++ xfs/drivers/block/virtio_blk.c	2009-05-11 10:31:16.642783620 +0200
@@ -65,13 +65,25 @@ static void blk_done(struct virtqueue *v
 			break;
 		}
 
-		if (blk_pc_request(vbr->req)) {
+		switch (vbr->req->cmd_type) {
+		case REQ_TYPE_FS:
+			nr_bytes = blk_rq_bytes(vbr->req);
+			break;
+		case REQ_TYPE_BLOCK_PC:
 			vbr->req->data_len = vbr->in_hdr.residual;
 			nr_bytes = vbr->in_hdr.data_len;
 			vbr->req->sense_len = vbr->in_hdr.sense_len;
 			vbr->req->errors = vbr->in_hdr.errors;
-		} else
-			nr_bytes = blk_rq_bytes(vbr->req);
+			break;
+		case REQ_TYPE_LINUX_BLOCK:
+			if (vbr->req->cmd[0] == REQ_LB_OP_FLUSH) {
+				nr_bytes = blk_rq_bytes(vbr->req);
+				break;
+			}
+			/*FALLTHRU*/
+		default:
+			BUG();
+		}
 
 		__blk_end_request(vbr->req, error, nr_bytes);
 		list_del(&vbr->list);
@@ -82,7 +94,7 @@ static void blk_done(struct virtqueue *v
 	spin_unlock_irqrestore(&vblk->lock, flags);
 }
 
-static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
+static noinline bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		   struct request *req)
 {
 	unsigned long num, out = 0, in = 0;
@@ -94,15 +106,27 @@ static bool do_req(struct request_queue 
 		return false;
 
 	vbr->req = req;
-	if (blk_fs_request(vbr->req)) {
+
+	switch (req->cmd_type) {
+	case REQ_TYPE_FS:
 		vbr->out_hdr.type = 0;
 		vbr->out_hdr.sector = vbr->req->sector;
 		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
-	} else if (blk_pc_request(vbr->req)) {
+		break;
+	case REQ_TYPE_BLOCK_PC:
 		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
 		vbr->out_hdr.sector = 0;
 		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
-	} else {
+		break;
+	case REQ_TYPE_LINUX_BLOCK:
+		if (req->cmd[0] == REQ_LB_OP_FLUSH) {
+			vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
+			vbr->out_hdr.sector = 0;
+			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			break;
+		}
+		/*FALLTHRU*/
+	default:
 		/* We don't put anything else in the queue. */
 		BUG();
 	}
@@ -174,6 +198,12 @@ static void do_virtblk_request(struct re
 		vblk->vq->vq_ops->kick(vblk->vq);
 }
 
+static void virtblk_prepare_flush(struct request_queue *q, struct request *req)
+{
+	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+	req->cmd[0] = REQ_LB_OP_FLUSH;
+}
+
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
@@ -310,7 +340,8 @@ static int virtblk_probe(struct virtio_d
 
 	/* If barriers are supported, tell block layer that queue is ordered */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
-		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
+		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG_FLUSH,
+				  virtblk_prepare_flush);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
Index: xfs/include/linux/virtio_blk.h
===================================================================
--- xfs.orig/include/linux/virtio_blk.h	2009-05-11 10:18:39.933666519 +0200
+++ xfs/include/linux/virtio_blk.h	2009-05-11 10:22:14.396660919 +0200
@@ -35,6 +35,17 @@ struct virtio_blk_config
 	__u32 blk_size;
 } __attribute__((packed));
 
+/*
+ * Command types
+ *
+ * Usage is a bit tricky as some bits are used as flags and some not.
+ *
+ * Rules:
+ *   VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or
+ *   VIRTIO_BLK_T_BARRIER.  VIRTIO_BLK_T_FLUSH is a command of its own
+ *   and may not be combined with any of the other flags.
+ */
+
 /* These two define direction. */
 #define VIRTIO_BLK_T_IN		0
 #define VIRTIO_BLK_T_OUT	1
@@ -42,6 +53,9 @@ struct virtio_blk_config
 /* This bit says it's a scsi command, not an actual read or write. */
 #define VIRTIO_BLK_T_SCSI_CMD	2
 
+/* Cache flush command */
+#define VIRTIO_BLK_T_FLUSH	4
+
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11  8:39 [PATCH, RFC] virtio_blk: add cache flush command Christoph Hellwig
@ 2009-05-11 14:51 ` Anthony Liguori
  2009-05-11 15:40   ` Christoph Hellwig
  2009-05-12 13:54 ` Rusty Russell
  1 sibling, 1 reply; 21+ messages in thread
From: Anthony Liguori @ 2009-05-11 14:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Rusty Russell, kvm

Christoph Hellwig wrote:
> Currently virtio-blk does support barriers for ordering requests which
> is enough to guarantee filesystem metadata integrity with write back
> caches, but it does not support any way to flush that writeback cache,
> to guarantee that data is stable on disk on a fsync.
>
> This patch implements a new VIRTIO_BLK_T_FLUSH command to flush the
> cache and exposes the functionality to the block layer by implementing
> a prepare_flush method.
>   

What typically triggers a flush operation?

I would assume an fsync would, but would a flush happen after every 
O_DIRECT write?

If the backend implementation of T_FLUSH is fsync, I would think that 
this would result in rather poor performance for O_DIRECT operations in 
the guest.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 14:51 ` Anthony Liguori
@ 2009-05-11 15:40   ` Christoph Hellwig
  2009-05-11 15:45     ` Avi Kivity
  2009-05-11 16:38     ` Anthony Liguori
  0 siblings, 2 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-11 15:40 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Christoph Hellwig, Rusty Russell, kvm

On Mon, May 11, 2009 at 09:51:40AM -0500, Anthony Liguori wrote:
> What typically triggers a flush operation?

fsync.

> I would assume an fsync would, but would a flush happen after every 
> O_DIRECT write?

Right now it doesn't, but it probably should.

> If the backend implementation of T_FLUSH is fsync, I would think that 
> this would result in rather poor performance for O_DIRECT operations in 
> the guest.

Right now it's fsync.  By the time I'll submit the backend change it
will still be fsync, but at least called from the posix-aio-compat
thread pool.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 15:40   ` Christoph Hellwig
@ 2009-05-11 15:45     ` Avi Kivity
  2009-05-11 16:28       ` Christoph Hellwig
  2009-05-11 16:38     ` Anthony Liguori
  1 sibling, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2009-05-11 15:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Anthony Liguori, Rusty Russell, kvm

Christoph Hellwig wrote:
>> If the backend implementation of T_FLUSH is fsync, I would think that 
>> this would result in rather poor performance for O_DIRECT operations in 
>> the guest.
>>     
>
> Right now it's fsync.  By the time I'll submit the backend change it
> will still be fsync, but at least called from the posix-aio-compat
> thread pool.
>   

I think if we have cache=writeback we should ignore this.  The user is 
saying, I don't care about data loss, make the thing go fast.

For cache=none and cache=writethrough we don't really need fsync, but we 
do need to flush the inflight commands.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 15:45     ` Avi Kivity
@ 2009-05-11 16:28       ` Christoph Hellwig
  2009-05-11 16:49         ` Avi Kivity
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-11 16:28 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, Anthony Liguori, Rusty Russell, kvm

On Mon, May 11, 2009 at 06:45:50PM +0300, Avi Kivity wrote:
> >Right now it's fsync.  By the time I'll submit the backend change it
> >will still be fsync, but at least called from the posix-aio-compat
> >thread pool.
> >  
> 
> I think if we have cache=writeback we should ignore this.

It's only needed for cache=writeback, because without that there is no
reason to flush a write cache.

> For cache=none and cache=writethrough we don't really need fsync, but we 
> do need to flush the inflight commands.

What we do need for those modes is the basic barrier support because
we can currently re-order requests.  The next version of my patch will
implement a barriers without cache flush mode, although I don't think
a fdatasync without any outstanding dirty data should cause problems.
(Or maybe ext3 actually is stupid enough to flush the whole fs even for
that case)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 15:40   ` Christoph Hellwig
  2009-05-11 15:45     ` Avi Kivity
@ 2009-05-11 16:38     ` Anthony Liguori
  2009-05-12  7:26       ` Christoph Hellwig
  1 sibling, 1 reply; 21+ messages in thread
From: Anthony Liguori @ 2009-05-11 16:38 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Rusty Russell, kvm

Christoph Hellwig wrote:
> On Mon, May 11, 2009 at 09:51:40AM -0500, Anthony Liguori wrote:
>   
>> What typically triggers a flush operation?
>>     
>
> fsync.
>
>   
>> I would assume an fsync would, but would a flush happen after every 
>> O_DIRECT write?
>>     
>
> Right now it doesn't, but it probably should.
>   

So then with cache=writeback, fsync behaves itself but O_DIRECT writes 
do not.

This seems like a really undesirable combination of behavior from a 
guest integrity point of view.  It makes me wonder if it's really 
useful.  I think that any serious user would have to continue using 
cache=writethrough.  Is there a path that would ever allow someone who 
cares about their data to use cache=writeback instead of cache=writethrough?

>> If the backend implementation of T_FLUSH is fsync, I would think that 
>> this would result in rather poor performance for O_DIRECT operations in 
>> the guest.
>>     
>
> Right now it's fsync.  By the time I'll submit the backend change it
> will still be fsync, but at least called from the posix-aio-compat
> thread pool.
>   

fsync is pretty crappy on ext3 default configs.  I'm concerned that this 
could be considered a DoS by a malicious guest.  If it sat in a T_FLUSH 
loop, it would potentially bring your system to a crawl, no?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 16:28       ` Christoph Hellwig
@ 2009-05-11 16:49         ` Avi Kivity
  2009-05-11 17:47           ` Anthony Liguori
  2009-05-12  7:19           ` Christoph Hellwig
  0 siblings, 2 replies; 21+ messages in thread
From: Avi Kivity @ 2009-05-11 16:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Anthony Liguori, Rusty Russell, kvm

Christoph Hellwig wrote:
> On Mon, May 11, 2009 at 06:45:50PM +0300, Avi Kivity wrote:
>   
>>> Right now it's fsync.  By the time I'll submit the backend change it
>>> will still be fsync, but at least called from the posix-aio-compat
>>> thread pool.
>>>  
>>>       
>> I think if we have cache=writeback we should ignore this.
>>     
>
> It's only needed for cache=writeback, because without that there is no
> reason to flush a write cache.
>   

Maybe we should add a fourth cache= mode then.  But 
cache=writeback+fsync doesn't correspond to any real world drive; in the 
real world you're limited to power failures and a few megabytes of cache 
(typically less), cache=writeback+fsync can lose hundreds of megabytes 
due to power loss or software failure.

Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add fsync 
after metadata updates.

>> For cache=none and cache=writethrough we don't really need fsync, but we 
>> do need to flush the inflight commands.
>>     
>
> What we do need for those modes is the basic barrier support because
> we can currently re-order requests.  The next version of my patch will
> implement a barriers without cache flush mode, although I don't think
> a fdatasync without any outstanding dirty data should cause problems.
>   

Yeah.  And maybe one day push the barrier into the kernel.

> (Or maybe ext3 actually is stupid enough to flush the whole fs even for
> that case

Sigh.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 16:49         ` Avi Kivity
@ 2009-05-11 17:47           ` Anthony Liguori
  2009-05-11 18:00             ` Avi Kivity
  2009-05-12  7:23             ` Christoph Hellwig
  2009-05-12  7:19           ` Christoph Hellwig
  1 sibling, 2 replies; 21+ messages in thread
From: Anthony Liguori @ 2009-05-11 17:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, Rusty Russell, kvm

Avi Kivity wrote:
> Christoph Hellwig wrote:
>> On Mon, May 11, 2009 at 06:45:50PM +0300, Avi Kivity wrote:
>>  
>>>> Right now it's fsync.  By the time I'll submit the backend change it
>>>> will still be fsync, but at least called from the posix-aio-compat
>>>> thread pool.
>>>>  
>>>>       
>>> I think if we have cache=writeback we should ignore this.
>>>     
>>
>> It's only needed for cache=writeback, because without that there is no
>> reason to flush a write cache.
>>   
>
> Maybe we should add a fourth cache= mode then.  But 
> cache=writeback+fsync doesn't correspond to any real world drive; in 
> the real world you're limited to power failures and a few megabytes of 
> cache (typically less), cache=writeback+fsync can lose hundreds of 
> megabytes due to power loss or software failure.
>
> Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add 
> fsync after metadata updates.

But how do we define the data integrity guarantees to the user of 
cache=writeback+fsync?  It seems to require a rather detailed knowledge 
of Linux's use of T_FLUSH operations.

Right now, it's fairly easy to understand.  cache=none and 
cache=writethrough guarantee that all write operations that the guest 
thinks have completed are completed.  cache=writeback provides no such 
guarantee.

cache=writeback+fsync would guarantee that only operations that include 
a T_FLUSH are present on disk which currently includes fsyncs but does 
not include O_DIRECT writes.  I guess whether O_SYNC does a T_FLUSH also 
has to be determined.

It seems too complicated to me.  If we could provide a mode where 
cache=writeback provided as strong a guarantee as cache=writethrough, 
then that would be quite interesting.

>> (Or maybe ext3 actually is stupid enough to flush the whole fs even for
>> that case
>
> Sigh.

I'm also worried about ext3 here.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 17:47           ` Anthony Liguori
@ 2009-05-11 18:00             ` Avi Kivity
  2009-05-11 18:29               ` Anthony Liguori
  2009-05-12  7:23             ` Christoph Hellwig
  1 sibling, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2009-05-11 18:00 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Christoph Hellwig, Rusty Russell, kvm

Anthony Liguori wrote:
> Avi Kivity wrote:
>> Christoph Hellwig wrote:
>>> On Mon, May 11, 2009 at 06:45:50PM +0300, Avi Kivity wrote:
>>>  
>>>>> Right now it's fsync.  By the time I'll submit the backend change it
>>>>> will still be fsync, but at least called from the posix-aio-compat
>>>>> thread pool.
>>>>>  
>>>>>       
>>>> I think if we have cache=writeback we should ignore this.
>>>>     
>>>
>>> It's only needed for cache=writeback, because without that there is no
>>> reason to flush a write cache.
>>>   
>>
>> Maybe we should add a fourth cache= mode then.  But 
>> cache=writeback+fsync doesn't correspond to any real world drive; in 
>> the real world you're limited to power failures and a few megabytes 
>> of cache (typically less), cache=writeback+fsync can lose hundreds of 
>> megabytes due to power loss or software failure.
>>
>> Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add 
>> fsync after metadata updates.
>
> But how do we define the data integrity guarantees to the user of 
> cache=writeback+fsync?  It seems to require a rather detailed 
> knowledge of Linux's use of T_FLUSH operations.

True.  I don't think cache=writeback+fsync is useful.  Like I mentioned, 
it doesn't act like a real drive, and it doesn't work well with qcow2.

>
> Right now, it's fairly easy to understand.  cache=none and 
> cache=writethrough guarantee that all write operations that the guest 
> thinks have completed are completed.  cache=writeback provides no such 
> guarantee.

cache=none is partially broken as well, since O_DIRECT writes might hit 
an un-battery-backed write cache.  I think cache=writeback will send the 
necessary flushes, if the disk and the underlying filesystem support them.

> cache=writeback+fsync would guarantee that only operations that 
> include a T_FLUSH are present on disk which currently includes fsyncs 
> but does not include O_DIRECT writes.  I guess whether O_SYNC does a 
> T_FLUSH also has to be determined.
>
> It seems too complicated to me.  If we could provide a mode where 
> cache=writeback provided as strong a guarantee as cache=writethrough, 
> then that would be quite interesting.

I don't think we realistically can.

>>> (Or maybe ext3 actually is stupid enough to flush the whole fs even for
>>> that case
>>
>> Sigh.
>
> I'm also worried about ext3 here.

I'm just waiting for btrfs.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 18:00             ` Avi Kivity
@ 2009-05-11 18:29               ` Anthony Liguori
  2009-05-11 18:40                 ` Avi Kivity
  2009-05-18 12:03                 ` Christoph Hellwig
  0 siblings, 2 replies; 21+ messages in thread
From: Anthony Liguori @ 2009-05-11 18:29 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, Rusty Russell, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>
>>
>> Right now, it's fairly easy to understand.  cache=none and 
>> cache=writethrough guarantee that all write operations that the guest 
>> thinks have completed are completed.  cache=writeback provides no 
>> such guarantee.
>
> cache=none is partially broken as well, since O_DIRECT writes might 
> hit an un-battery-backed write cache.  I think cache=writeback will 
> send the necessary flushes, if the disk and the underlying filesystem 
> support them.

Sure, but this likely doesn't upset people that much since O_DIRECT has 
always had this behavior.  Using non-battery backed disks with writeback 
enabled introduces a larger set of possible data integrity issues.  I 
think this case is acceptable to ignore because it's a straight forward 
policy.

>> cache=writeback+fsync would guarantee that only operations that 
>> include a T_FLUSH are present on disk which currently includes fsyncs 
>> but does not include O_DIRECT writes.  I guess whether O_SYNC does a 
>> T_FLUSH also has to be determined.
>>
>> It seems too complicated to me.  If we could provide a mode where 
>> cache=writeback provided as strong a guarantee as cache=writethrough, 
>> then that would be quite interesting.
>
> It don't think we realistically can.

Maybe two fds?  One open in O_SYNC and one not.  Is such a thing sane?

>>>> (Or maybe ext3 actually is stupid enough to flush the whole fs even 
>>>> for
>>>> that case
>>>
>>> Sigh.
>>
>> I'm also worried about ext3 here.
>
> I'm just waiting for btrfs.

Even ext4 is saner but we'll get lots of bug reports while ext3 remains 
common.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 18:29               ` Anthony Liguori
@ 2009-05-11 18:40                 ` Avi Kivity
  2009-05-18 12:03                 ` Christoph Hellwig
  1 sibling, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2009-05-11 18:40 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Christoph Hellwig, Rusty Russell, kvm

Anthony Liguori wrote:
> Avi Kivity wrote:
>> Anthony Liguori wrote:
>>
>>>
>>> Right now, it's fairly easy to understand.  cache=none and 
>>> cache=writethrough guarantee that all write operations that the 
>>> guest thinks have completed are completed.  cache=writeback provides 
>>> no such guarantee.
>>
>> cache=none is partially broken as well, since O_DIRECT writes might 
>> hit an un-battery-backed write cache.  I think cache=writeback will 
>> send the necessary flushes, if the disk and the underlying filesystem 
>> support them.
>
> Sure, but this likely doesn't upset people that much since O_DIRECT 
> has always had this behavior.  

But people are not using O_DIRECT.  They're using their guests, which 
may or may not issue the appropriate barriers.  They don't know that 
we're using O_DIRECT underneath with different guarantees.

> Using non-battery backed disks with writeback enabled introduces a 
> larger set of possible data integrity issues.  I think this case is 
> acceptable to ignore because it's a straight forward policy.

It isn't straightforward to me.  A guest should be able to get the same 
guarantees running on a hypervisor backed by such a disk as it would get 
if it was running on bare metal with the same disk.  Right now, that's 
not the case, we're reducing the guarantees the guest gets.

>>> cache=writeback+fsync would guarantee that only operations that 
>>> include a T_FLUSH are present on disk which currently includes 
>>> fsyncs but does not include O_DIRECT writes.  I guess whether O_SYNC 
>>> does a T_FLUSH also has to be determined.
>>>
>>> It seems too complicated to me.  If we could provide a mode where 
>>> cache=writeback provided as strong a guarantee as 
>>> cache=writethrough, then that would be quite interesting.
>>
>> I don't think we realistically can.
>
> Maybe two fds?  One open in O_SYNC and one not.  Is such a thing sane?

For all I care, yes.  Filesystem developers would probably have you 
locked up.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 16:49         ` Avi Kivity
  2009-05-11 17:47           ` Anthony Liguori
@ 2009-05-12  7:19           ` Christoph Hellwig
  2009-05-12  8:35             ` Avi Kivity
  1 sibling, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-12  7:19 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, Anthony Liguori, Rusty Russell, kvm

On Mon, May 11, 2009 at 07:49:37PM +0300, Avi Kivity wrote:
> Maybe we should add a fourth cache= mode then.  But 
> cache=writeback+fsync doesn't correspond to any real world drive; in the 
> real world you're limited to power failures and a few megabytes of cache 
> (typically less), cache=writeback+fsync can lose hundreds of megabytes 
> due to power loss or software failure.

cache=writeback+fsync is exactly the same model as a normal writeback
cache disk drive.  (Well, almost as we currently don't use tag ordering
but drain flushes as a Linux implementation detail, but the disks also
support TCQ-based ordering).

The cache size on disks is constantly growing, and if you lose cache
it doesn't really matter how much you lose but what you lose.

> Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add fsync 
> after metadata updates.

If you care about data integrity in case of crashes qcow2 doesn't work
at all.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 17:47           ` Anthony Liguori
  2009-05-11 18:00             ` Avi Kivity
@ 2009-05-12  7:23             ` Christoph Hellwig
  1 sibling, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-12  7:23 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Avi Kivity, Christoph Hellwig, Rusty Russell, kvm

On Mon, May 11, 2009 at 12:47:58PM -0500, Anthony Liguori wrote:
> But how do we define the data integrity guarantees to the user of 
> cache=writeback+fsync?  It seems to require a rather detailed knowledge 
> of Linux's use of T_FLUSH operations.

It does work the same as for disks with writeback caches.  If a barrier
request completes the filesystem can assume all previous I/O on this
disk has finished before the barrier request, and the barrier request
has finished after these without any later request being finished before
it.  If a cache flush is issued the cache as of that point in time is
flushed (used for fsync).

> Right now, it's fairly easy to understand.  cache=none and 
> cache=writethrough guarantee that all write operations that the guest 
> thinks have completed are completed.  cache=writeback provides no such 
> guarantee.

As said above with barriers it indeed does.

> cache=writeback+fsync would guarantee that only operations that include 
> a T_FLUSH are present on disk which currently includes fsyncs but does 
> not include O_DIRECT writes.  I guess whether O_SYNC does a T_FLUSH also 
> has to be determined.

O_SYNC and O_DIRECT do it at least on XFS due to updating metadata after
the write.  I can't vouch for implementation details on other
filesystems.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 16:38     ` Anthony Liguori
@ 2009-05-12  7:26       ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-12  7:26 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Christoph Hellwig, Rusty Russell, kvm

On Mon, May 11, 2009 at 11:38:09AM -0500, Anthony Liguori wrote:
> >Right now it doesn't, but it probably should.
> >  
> 
> So then with cache=writeback, fsync behaves itself but O_DIRECT writes 
> do not.

Right now O_DIRECT does not do an explicit cache flush, but due to the
way barriers are implemented in Linux we do get the cache flush as part
of the metadata updates after I/O completion.

> fsync is pretty crappy on ext3 default configs.  I'm concerned that this 
> could be considered a DoS by a malicious guest.  If it sat in a T_FLUSH 
> loop, it would potentially bring your system to a crawl, no?

It's exactly the same effect as a regular user doing fsync in a loop,
whatever that causes on ext3.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-12  7:19           ` Christoph Hellwig
@ 2009-05-12  8:35             ` Avi Kivity
  2009-05-18 12:06               ` Christoph Hellwig
  0 siblings, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2009-05-12  8:35 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Anthony Liguori, Rusty Russell, kvm

Christoph Hellwig wrote:
> On Mon, May 11, 2009 at 07:49:37PM +0300, Avi Kivity wrote:
>   
>> Maybe we should add a fourth cache= mode then.  But 
>> cache=writeback+fsync doesn't correspond to any real world drive; in the 
>> real world you're limited to power failures and a few megabytes of cache 
>> (typically less), cache=writeback+fsync can lose hundreds of megabytes 
>> due to power loss or software failure.
>>     
>
> cache=writeback+fsync is exactly the same model as a normal writeback
> cache disk drive.  (Well, almost as we currently don't use tag ordering
> but drain flushes as a Linux implementation detail, but the disks also
> support TCQ-based ordering).
>
> The cache size on disks is constantly growing, and if you lose cache
> it doesn't really matter how much you lose but what you lose.
>   

Software errors won't cause data loss on a real disk (firmware bugs 
will, but the firmware is less likely to crash than the host OS).

>> Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add fsync 
>> after metadata updates.
>>     
>
> If you care about data integrity in case of crashes qcow2 doesn't work
> at all.
>   

Do you know of any known corruptors in qcow2 with cache=writethrough?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11  8:39 [PATCH, RFC] virtio_blk: add cache flush command Christoph Hellwig
  2009-05-11 14:51 ` Anthony Liguori
@ 2009-05-12 13:54 ` Rusty Russell
  2009-05-12 14:18   ` Christian Borntraeger
  1 sibling, 1 reply; 21+ messages in thread
From: Rusty Russell @ 2009-05-12 13:54 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kvm, Christian Borntraeger

On Mon, 11 May 2009 06:09:08 pm Christoph Hellwig wrote:
> Do we need a new feature flag for this command or can we expect that
> all previous barrier support was buggy enough anyway?

You mean reuse the VIRTIO_BLK_F_BARRIER for this as well?  Seems fine.

AFAIK only lguest offered that, and lguest has no ABI.  Best would be to 
implement VIRTIO_BLK_T_FLUSH as well; it's supposed to be demo code, and it 
should be easy.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-12 13:54 ` Rusty Russell
@ 2009-05-12 14:18   ` Christian Borntraeger
  2009-05-13  1:52     ` Rusty Russell
  2009-05-18 12:07     ` Christoph Hellwig
  0 siblings, 2 replies; 21+ messages in thread
From: Christian Borntraeger @ 2009-05-12 14:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Christoph Hellwig, kvm

Am Tuesday 12 May 2009 15:54:14 schrieb Rusty Russell:
> On Mon, 11 May 2009 06:09:08 pm Christoph Hellwig wrote:
> > Do we need a new feature flag for this command or can we expect that
> > all previous barrier support was buggy enough anyway?
> 
> You mean reuse the VIRTIO_BLK_F_BARRIER for this as well?  Seems fine.
> 
> AFAIK only lguest offered that, and lguest has no ABI.  Best would be to 
> implement VIRTIO_BLK_T_FLUSH as well; it's supposed to be demo code, and it 
> should be easy).

It is also used by kuli (http://www.ibm.com/developerworks/linux/linux390/kuli.html)
and kuli used fdatasync. Since kuli is on official webpages it takes a while
to update that code. When you reuse the VIRTIO_BLK_F_BARRIER flag, that would 
trigger some VIRTIO_BLK_T_FLUSH commands sent to the kuli host, right?
I think the if/else logic of kuli would interpret that as a read request....I 
am voting for a new feature flag :-)

Christian




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-12 14:18   ` Christian Borntraeger
@ 2009-05-13  1:52     ` Rusty Russell
  2009-05-18 12:07     ` Christoph Hellwig
  1 sibling, 0 replies; 21+ messages in thread
From: Rusty Russell @ 2009-05-13  1:52 UTC (permalink / raw)
  To: Christian Borntraeger; +Cc: Christoph Hellwig, kvm

On Tue, 12 May 2009 11:48:36 pm Christian Borntraeger wrote:
> Am Tuesday 12 May 2009 15:54:14 schrieb Rusty Russell:
> > On Mon, 11 May 2009 06:09:08 pm Christoph Hellwig wrote:
> > > Do we need a new feature flag for this command or can we expect that
> > > all previous barrier support was buggy enough anyway?
> >
> > You mean reuse the VIRTIO_BLK_F_BARRIER for this as well?  Seems fine.
>
> It is also used by kuli
> (http://www.ibm.com/developerworks/linux/linux390/kuli.html) and kuli used
> fdatasync.

OK, that's sufficient for me.  It's not like block is going to catch up with net 
for feature flags any time soon.

Christoph, please make a new feature flag.

Oh and FYI Christian, you might not have seen this patch:

lguest: barrier me harder

Impact: barrier correctness in example launcher

I doubt either lguest user will complain about performance.

Reported-by: Christoph Hellwig <hch@infradead.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c |    7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1630,6 +1630,13 @@ static bool service_io(struct device *de
 		}
 	}
 
+	/* OK, so we noted that it was pretty poor to use an fdatasync as a
+	 * barrier.  But Christoph Hellwig points out that we need a sync
+	 * *afterwards* as well: "Barriers specify no reordering to the front
+	 * or the back."  And Jens Axboe confirmed it, so here we are: */
+	if (out->type & VIRTIO_BLK_T_BARRIER)
+		fdatasync(vblk->fd);
+
 	/* We can't trigger an IRQ, because we're not the Launcher.  It does
 	 * that when we tell it we're done. */
 	add_used(dev->vq, head, wlen);


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-11 18:29               ` Anthony Liguori
  2009-05-11 18:40                 ` Avi Kivity
@ 2009-05-18 12:03                 ` Christoph Hellwig
  1 sibling, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-18 12:03 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Avi Kivity, Christoph Hellwig, Rusty Russell, kvm

On Mon, May 11, 2009 at 01:29:06PM -0500, Anthony Liguori wrote:
> >It don't think we realistically can.
> 
> Maybe two fds?  One open in O_SYNC and one not.  Is such a thing sane?

O_SYNC and O_DIRECT currently behave exactly in the same way.  In none
of the filesystem I've looked at there is an explicit cache flush,
but all send down a barrier which gives you an implicit cache flush
with the way barriers are implemented.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-12  8:35             ` Avi Kivity
@ 2009-05-18 12:06               ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-18 12:06 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, Anthony Liguori, Rusty Russell, kvm

On Tue, May 12, 2009 at 11:35:23AM +0300, Avi Kivity wrote:
> >The cache size on disks is constantly growing, and if you lose cache
> >it doesn't really matter how much you lose but what you lose.
> >  
> 
> Software errors won't cause data loss on a real disk (firmware bugs 
> will, but the firmware is less likely to crash than the host OS).

OS crash or hardware crash really makes no difference for writeback
caches.   The case we are trying to protect against here is any crash,
and a hardware induced one or power failure is probably more likely
in either case than a software failure in either the OS or firmware.

> >If you care about data integrity in case of crashes qcow2 doesn't work
> >at all.
> >  
> 
> Do you known of any known corruptors in qcow2 with cache=writethrough?

It's not related to cache=writethrough.  The issue is that there are no
transactional guarantees in qcow2.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH, RFC] virtio_blk: add cache flush command
  2009-05-12 14:18   ` Christian Borntraeger
  2009-05-13  1:52     ` Rusty Russell
@ 2009-05-18 12:07     ` Christoph Hellwig
  1 sibling, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2009-05-18 12:07 UTC (permalink / raw)
  To: Christian Borntraeger; +Cc: Rusty Russell, Christoph Hellwig, kvm

On Tue, May 12, 2009 at 04:18:36PM +0200, Christian Borntraeger wrote:
> Am Tuesday 12 May 2009 15:54:14 schrieb Rusty Russell:
> > On Mon, 11 May 2009 06:09:08 pm Christoph Hellwig wrote:
> > > Do we need a new feature flag for this command or can we expect that
> > > all previous barrier support was buggy enough anyway?
> > 
> > You mean reuse the VIRTIO_BLK_F_BARRIER for this as well?  Seems fine.
> > 
> > AFAIK only lguest offered that, and lguest has no ABI.  Best would be to 
> > implement VIRTIO_BLK_T_FLUSH as well; it's supposed to be demo code, and it 
> > should be easy).
> 
> It is also used by kuli (http://www.ibm.com/developerworks/linux/linux390/kuli.html)
> and kuli used fdatasync. Since kuli is on official webpages it takes a while
> to update that code. When you reuse the VIRTIO_BLK_F_BARRIER flag, that would 
> trigger some VIRTIO_BLK_T_FLUSH commands sent to the kuli host, right?
> I think the if/else logic of kuli would interpret that as a read request....I 
> am voting for a new feature flag :-)

Ok, next version will have a new feature flag.  I actually have some
other bits I want to redesign so it might take a bit longer to get it
out.


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2009-05-18 12:07 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-11  8:39 [PATCH, RFC] virtio_blk: add cache flush command Christoph Hellwig
2009-05-11 14:51 ` Anthony Liguori
2009-05-11 15:40   ` Christoph Hellwig
2009-05-11 15:45     ` Avi Kivity
2009-05-11 16:28       ` Christoph Hellwig
2009-05-11 16:49         ` Avi Kivity
2009-05-11 17:47           ` Anthony Liguori
2009-05-11 18:00             ` Avi Kivity
2009-05-11 18:29               ` Anthony Liguori
2009-05-11 18:40                 ` Avi Kivity
2009-05-18 12:03                 ` Christoph Hellwig
2009-05-12  7:23             ` Christoph Hellwig
2009-05-12  7:19           ` Christoph Hellwig
2009-05-12  8:35             ` Avi Kivity
2009-05-18 12:06               ` Christoph Hellwig
2009-05-11 16:38     ` Anthony Liguori
2009-05-12  7:26       ` Christoph Hellwig
2009-05-12 13:54 ` Rusty Russell
2009-05-12 14:18   ` Christian Borntraeger
2009-05-13  1:52     ` Rusty Russell
2009-05-18 12:07     ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox