* [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
@ 2008-02-25 18:13 Ian Jackson
2008-02-25 20:50 ` Jamie Lokier
0 siblings, 1 reply; 12+ messages in thread
From: Ian Jackson @ 2008-02-25 18:13 UTC (permalink / raw)
To: qemu-devel
[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 598 bytes --]
The attached patch implements the ATA write cache feature. This
enables a guest to control, in the standard way, whether disk writes
are immediately committed to disk before the IDE command completes, or
may be buffered in the host.
In this patch, by default buffering is off, which provides better
reliability but may have a performance impact. It would be
straightforward to change the default, or perhaps offer a command-line
option, if that would be preferred.
This patch is derived from one which was originally submitted to the
Xen tree by Rik van Riel <riel@redhat.com>.
Regards,
Ian.
[-- Attachment #2: IDE write caching configurability --]
[-- Type: text/plain, Size: 2563 bytes --]
diff --git a/hw/ide.c b/hw/ide.c
index 56a1cda..6668cc9 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -377,6 +377,7 @@ typedef struct IDEState {
PCIDevice *pci_dev;
struct BMDMAState *bmdma;
int drive_serial;
+ int write_cache;
/* ide regs */
uint8_t feature;
uint8_t error;
@@ -553,7 +554,8 @@ static void ide_identify(IDEState *s)
put_le16(p + 68, 120);
put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
put_le16(p + 81, 0x16); /* conforms to ata5 */
- put_le16(p + 82, (1 << 14));
+ /* 14=nop 5=write_cache */
+ put_le16(p + 82, (1 << 14) | (1 << 5));
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
put_le16(p + 84, (1 << 14));
@@ -959,6 +961,9 @@ static void ide_sector_write(IDEState *s)
if (n > s->req_nb_sectors)
n = s->req_nb_sectors;
ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+ if (ret == 0 && !s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ }
if (ret != 0) {
ide_rw_error(s);
return;
@@ -1015,6 +1020,13 @@ static void ide_write_dma_cb(void *opaque, int ret)
/* end of transfer ? */
if (s->nsector == 0) {
+ if (!s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) {
+ ide_dma_error(s);
+ return;
+ }
+ }
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
eot:
@@ -2097,7 +2109,17 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
case 0xcc: /* reverting to power-on defaults enable */
case 0x66: /* reverting to power-on defaults disable */
case 0x02: /* write cache enable */
+ s->write_cache = 1;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0x82: /* write cache disable */
+ s->write_cache = 0;
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) goto abort_cmd;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0xaa: /* read look-ahead enable */
case 0x55: /* read look-ahead disable */
case 0x05: /* set advanced power management mode */
@@ -2623,6 +2645,7 @@ static void ide_init2(IDEState *ide_state,
s->irq = irq;
s->sector_write_timer = qemu_new_timer(vm_clock,
ide_sector_write_timer_cb, s);
+ s->write_cache = 0;
ide_reset(s);
}
}
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-25 18:13 [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest Ian Jackson
@ 2008-02-25 20:50 ` Jamie Lokier
2008-02-26 1:16 ` Chris Wedgwood
2008-02-26 12:15 ` Ian Jackson
0 siblings, 2 replies; 12+ messages in thread
From: Jamie Lokier @ 2008-02-25 20:50 UTC (permalink / raw)
To: qemu-devel
Ian Jackson wrote:
Content-Description: message body text
> The attached patch implements the ATA write cache feature. This
> enables a guest to control, in the standard way, whether disk writes
> are immediately committed to disk before the IDE command completes, or
> may be buffered in the host.
>
> In this patch, by default buffering is off, which provides better
> reliability but may have a performance impact. It would be
> straightforward to change the default, or perhaps offer a command-line
> option, if that would be preferred.
>
> This patch is derived from one which was originally submitted to the
> Xen tree by Rik van Riel <riel@redhat.com>.
This is a very sensible improvement, imho.
However, I notice that it tells the guest that data is committed to
hard storage when the host has merely called fsync().
On Linux (and other host OSes), fdatsync() and fsync() don't always
commit data to hard storage; it sometimes only commits it to the hard
drive cache. (Seriously, just look at fs/ext3/fsync.c; only journal
writes cause the flush, and they aren't done if the inode itself
hasn't changed).
It may be worth mentioning in documentation that guests which need
strong durability guarantees, i.e. for critical database work or for
filesystem journalling safety following host power failure, it is not
enough to disable the IDE write cache in the guest even with this
patch. It is necessary to disable the host's disk write cache too,
for that.
Ideally, the host would provide variation of fdatasync() which flushes
data to hard storage in the same way that kernel filesystem journal
writes can do, and Qemu would use that.
But, presently, I'm not aware of any way to do that short of the
administrator disabling the host's disk write cache.
(Darwin provides F_FULLSYNC. On Linux, an extra flag to
sync_file_range() suggests itself. It would need changes to the block
device and elevator APIs, though, as it's a flush command not an
ordering tag, and not always associated with a prior or subsequent
write although there are some coalescing optimisations when it can be.)
-- Jamie
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-25 20:50 ` Jamie Lokier
@ 2008-02-26 1:16 ` Chris Wedgwood
2008-02-26 7:32 ` Jamie Lokier
2008-02-26 12:15 ` Ian Jackson
1 sibling, 1 reply; 12+ messages in thread
From: Chris Wedgwood @ 2008-02-26 1:16 UTC (permalink / raw)
To: qemu-devel
On Mon, Feb 25, 2008 at 08:50:40PM +0000, Jamie Lokier wrote:
> On Linux (and other host OSes), fdatsync() and fsync() don't always
> commit data to hard storage; it sometimes only commits it to the hard
> drive cache.
That's a filesystem bug IMO. People should be able to use f[data]sync
with some level onf confidence or else it's basically pointless.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-26 1:16 ` Chris Wedgwood
@ 2008-02-26 7:32 ` Jamie Lokier
0 siblings, 0 replies; 12+ messages in thread
From: Jamie Lokier @ 2008-02-26 7:32 UTC (permalink / raw)
To: qemu-devel
[To qemu-devel and Chris, I have started a thread on linux-kernel on
this topic. I've copied the first few paragraphs here, so you can see
what it's about since it's a response to a post here. But it's
largely off topic for Qemu, and on topic for linux-kernel, so I didn't
cross post lest linux-kernel replies come here.]
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Subject: Proposal for "proper" durable fsync() and fdatasync()
Message-ID: <20080226072649.GB30238@shareable.org>
Date: Tue, 26 Feb 2008 07:26:49 +0000
Dear kernel,
This is a proposal to add "proper" durable fsync() and fdatasync() to Linux.
First the problem, then a proposed solution "with benefits", so to speak.
[...]
By durable, I mean that fsync() should actually commit writes to
physical stable storage, not just the disk write cache when that is
enabled. Databases and guest VMs needs this, or an equivalent
feature, if they aren't to face occasional corruption after power
failure and perhaps some crashes.
The alternative is to disable the disk write cache. But that isn't
modern practice or recommendation, since I/O write barriers were
implemented and they are much faster.
I was surprised that fsync() doesn't do this already. There was a lot
of effort put into block I/O write barriers during 2.5, so that
journalling filesystems can force correct write ordering, using disk
flush cache commands.
After all that effort, I was very surprised to notice that Linux 2.6.x
doesn't use that capability to ensure fsync() flushes the disk cache
onto stable storage.
I noticed this following up discussions on the Qemu mailing list,
about guest VMs and how their IDE flush cache command should translate
to fsync() to avoid data loss. (For guest VMs, fsync() isn't
necessary if the host machine is fine, and it isn't enough (on Linux
host) if the host machine loses power or the hard disk crashes another
way.)
Then I noticed it again, when I was designing a database engine with
filesystem characteristics. I thought "how do I ensure ordered
journal writes; can I use fdatasync()?" and was surprised to find the
answer is no, I have to use hacks like calling hdparm, and the authors
of major SQL databases seem to brush the problem under a carpet.
(Interestingly, in the Linux 2.4 patches for write barriers, fsync()
seems to be fine, if a bit slow.)
It isn't the first time this topic has come up:
http://groups.google.com.br/group/linux.kernel/browse_thread/thread/d343e51655b4ac7c/7ee9bca80977c2d1?#7ee9bca80977c2d1
("True fsync() in Linux (on IDE)")
In that thread, it was implied that would be fixed in 2.6. So I bet
some people are under the illusion that it's fixed in 2.6...
For a while, I've been meaning to bring it up on linux-kernel...
[More on linux-kernel].
Thanks,
-- Jamie
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-25 20:50 ` Jamie Lokier
2008-02-26 1:16 ` Chris Wedgwood
@ 2008-02-26 12:15 ` Ian Jackson
2008-02-26 12:49 ` Jamie Lokier
1 sibling, 1 reply; 12+ messages in thread
From: Ian Jackson @ 2008-02-26 12:15 UTC (permalink / raw)
To: qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest"):
> However, I notice that it tells the guest that data is committed to
> hard storage when the host has merely called fsync().
Yes, that's what fsync is supposed to do.
> On Linux (and other host OSes), fdatsync() and fsync() don't always
> commit data to hard storage; it sometimes only commits it to the hard
> drive cache. (Seriously, just look at fs/ext3/fsync.c; only journal
> writes cause the flush, and they aren't done if the inode itself
> hasn't changed).
As Chris Wedgwood says, I think this is a bug. I was going to quote
SuSv3 but sadly it's very vague.
> Ideally, the host would provide variation of fdatasync() which flushes
> data to hard storage in the same way that kernel filesystem journal
> writes can do, and Qemu would use that.
Another question arises: do we want bdrv_flush to call (eventually)
fsync or fdatasync ? If the latter we need to make sure that we call
fsync instead when necessary, for example when a cow file is extended.
Ian.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-26 12:15 ` Ian Jackson
@ 2008-02-26 12:49 ` Jamie Lokier
2008-02-26 16:57 ` Ian Jackson
0 siblings, 1 reply; 12+ messages in thread
From: Jamie Lokier @ 2008-02-26 12:49 UTC (permalink / raw)
To: qemu-devel
Ian Jackson wrote:
> > Ideally, the host would provide variation of fdatasync() which flushes
> > data to hard storage in the same way that kernel filesystem journal
> > writes can do, and Qemu would use that.
>
> Another question arises: do we want bdrv_flush to call (eventually)
> fsync or fdatasync ? If the latter we need to make sure that we call
> fsync instead when necessary, for example when a cow file is extended.
I'm imagining that fdatasync() will flush the necessary metadata,
including file size, when a file is extended. As would O_DSYNC.
I could be wrong, but I think it's expected to do that, as I recall
VxFS doing something like that with O_DSYNC.
Then again, even if it's meant to, that doesn't mean it does...
-- Jamie
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-26 12:49 ` Jamie Lokier
@ 2008-02-26 16:57 ` Ian Jackson
2008-02-26 17:25 ` Jamie Lokier
0 siblings, 1 reply; 12+ messages in thread
From: Ian Jackson @ 2008-02-26 16:57 UTC (permalink / raw)
To: qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest"):
> I'm imagining that fdatasync() will flush the necessary metadata,
> including file size, when a file is extended. As would O_DSYNC.
Do you have a reference to support this supposition ? SuSv3's
specification for fdatasync is (astonishingly!) even vaguer and less
meaningful than that for fsync.
The closest I've come to finding answers are these:
HP-UX 11i's fdatasync manpage:
fdatasync() causes all modified data and file attributes of fildes
required to retrieve the data to be written to disk.
The glibc info manual:
Sometimes it is not even necessary to write all data associated with
a file descriptor. E.g., in database files which do not change in size
it is enough to write all the file content data to the device.
The Solaris manpage says that fdatasync does the same as O_DSYNC, and
it calls the service "synchronized I/O data integrity completion"
which is defined by the `Programming Interfaces Guide' to include
this:
* For writes, the operation has been completed, or diagnosed if
unsuccessful. The write operation succeeds when the data specified
in the write request is successfully transferred. Furthermore, all
file system information required to retrieve the data must be
successfully transferred.
But then the next bullet point is this:
* File attributes that are not necessary for data retrieval are not
transferred prior to returning to the calling process.
which says `are not transferred' when it ought to say `are not
necessarily transferred' so it may be unwise to rely on the precise
wording.
I looked at various other manpages but they all say useless things
like `metadata such as modification time' which leaves open the
question of whether the file size is included.
If the size is supposed to be included then the OS is required to keep
a flag to say whether the file has been extended so that it knows that
the next fdatasync ought really to be an fsync and write the inode
too. (In a traditional filesystem structure.) Or perhaps fsck needs
to extend the file as necessary to include the data blocks past the
nominal end of file.
This seems like rather a minefield.
Ian.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-26 16:57 ` Ian Jackson
@ 2008-02-26 17:25 ` Jamie Lokier
2008-02-26 18:11 ` Ian Jackson
0 siblings, 1 reply; 12+ messages in thread
From: Jamie Lokier @ 2008-02-26 17:25 UTC (permalink / raw)
To: qemu-devel
Ian Jackson wrote:
> Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest"):
> > I'm imagining that fdatasync() will flush the necessary metadata,
> > including file size, when a file is extended. As would O_DSYNC.
>
> Do you have a reference to support this supposition ?
Not a _standard_, of course, as you found with SuSv3. More a folk
understanding, which admittedly might be lacking in some
implementations (like Linux perhaps...).
Take a look at your references.
> HP-UX 11i's fdatasync manpage:
>
> fdatasync() causes all modified data and file attributes of fildes
^^^^^^^^^^^^^^^^^^^^^^^^^
> required to retrieve the data to be written to disk.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
That means size, bitmap updates, block pointers, extents etc. needed
to retrieve the data.
> The glibc info manual:
>
> Sometimes it is not even necessary to write all data associated with
> a file descriptor. E.g., in database files which do not change in size
> it is enough to write all the file content data to the device.
A bit more from Glibc:
Meta-information, like the modification time etc., are not that
important and leaving such information uncommitted does not prevent a
successful recovering of the file in case of a problem.
When a call to the `fdatasync' function returns, it is ensured
that all of the file data is written to the device. For all
pending I/O operations, the parts guaranteeing data integrity
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
finished.
^^^^^^^^
Draw your own conclusion.
> The Solaris manpage says that fdatasync does the same as O_DSYNC,
That's right, it's the common meaning of O_DSYNC.
> and it calls the service "synchronized I/O data integrity
> completion" which is defined by the `Programming Interfaces Guide'
> to include this:
>
> * For writes, the operation has been completed, or diagnosed if
> unsuccessful. The write operation succeeds when the data specified
> in the write request is successfully transferred. Furthermore, all
> file system information required to retrieve the data must be
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> successfully transferred.
^^^^^^^^^^^^^^^^^^^^^^^^
That's quite clear.
> But then the next bullet point is this:
>
> * File attributes that are not necessary for data retrieval are not
> transferred prior to returning to the calling process.
>
> which says `are not transferred' when it ought to say `are not
> necessarily transferred' so it may be unwise to rely on the precise
> wording.
That's fine and consistent with the previous text. It means size
increase, bitmaps, pointers, extents etc. are written (those are the
attributes necessary for data retrieval).
Attributes like modification time, access time, change time,
permissions etc. are not (necessarily) transferred. You're right it
should say "not necessarily", but that's implicit: they can be
transferred at any time anyway, by normal background writeback.
> I looked at various other manpages but they all say useless things
> like `metadata such as modification time' which leaves open the
> question of whether the file size is included.
I agree it's a bit ambiguous. My understanding is that _increases_ in
size are included, by convention as much as anything, since the larger
size is necessary to retrieve the data later.
This is supported by the fact that O_DSYNC has a tendancy to become
very slow on some systems when extending a file, compared with writing
in place.
> If the size is supposed to be included then the OS is required to keep
> a flag to say whether the file has been extended so that it knows that
> the next fdatasync ought really to be an fsync and write the inode
> too. (In a traditional filesystem structure.)
That's right.
> Or perhaps fsck needs
> to extend the file as necessary to include the data blocks past the
> nominal end of file.
Well, in general, if your system is such that fsck following a crash
is part of normal filesystem operations, then fsck could be allowed to
do a lot more than extend the size attribute.
That doesn't matter to the application, though. What matters is that
it writes data (including extending the file), calls fdatasync() (or
uses O_DSYNC), and when the fdatasync returns it knows after a crash
and recovery that it will be able to retrieve that data with the
appropriate confidence level.
> This seems like rather a minefield.
The implementation details seem like a minefield, but the intent and
documentation and tradition of fdatasync() seems quite clear to me.
However, I suppose you might want to be careful and check, when
deploying your new database which depends on fdatasync(), if the
target systems really do sync size changes :-)
It's easy enough to check, as it greatly slows down extending writes.
But I suppose, for an app writer, as you know it's going to involve a
slower than normal write anyway, it's also easy enough to extend by a
big chunk then call fsync() once, if you prefer to not have to trust
fdatasync() on this.
-- Jamie
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-02-26 17:25 ` Jamie Lokier
@ 2008-02-26 18:11 ` Ian Jackson
0 siblings, 0 replies; 12+ messages in thread
From: Ian Jackson @ 2008-02-26 18:11 UTC (permalink / raw)
To: qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest"):
> I agree it's a bit ambiguous. My understanding is that _increases_ in
> size are included, by convention as much as anything, since the larger
> size is necessary to retrieve the data later.
Yes. Well, if you're sufficiently convinced then I think the right
thing for you to do is to submit a patch to change the calls to fsync
to calls to fdatasync.
Sadly my patches don't seem to have been applied in CVS yet. I'll
arrange to publish my git tree and then you can make a diff that will
sit properly on top of my tree (which is what you want because I have
added error checking, so otherwise we get a conflict).
Ian.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
@ 2008-03-27 18:02 Ian Jackson
2008-03-27 18:16 ` Paul Brook
0 siblings, 1 reply; 12+ messages in thread
From: Ian Jackson @ 2008-03-27 18:02 UTC (permalink / raw)
To: qemu-devel
[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 269 bytes --]
I submitted this patch earlier but it seems to have been dropped,
perhaps due to the surrounding discussion of fsync vs. fdatasync.
I think this should be applied now and then I'll follow up with
another change to make all of the flushes use fdatasync.
Thanks,
Ian.
[-- Attachment #2: make write cacheing controllable by guest --]
[-- Type: text/plain, Size: 5496 bytes --]
>From 88ec110495c5c837562ceed4171d286bd4f7f470 Mon Sep 17 00:00:00 2001
From: Ian Jackson <iwj@mariner.uk.xensource.com>
Date: Thu, 27 Mar 2008 17:58:45 +0000
Subject: [PATCH] make write cacheing controllable by guest
This patch implements the ATA write cache feature. This enables a
guest to control, in the standard way, whether disk writes are
immediately committed to disk before the IDE command completes, or may
be buffered in the host.
In this patch, by default buffering is off, which provides better
reliability but may have a performance impact. It would be
straightforward to change the default, or perhaps offer a command-line
option, if that would be preferred.
This patch is derived from one which was originally submitted to the
Xen tree by Rik van Riel <riel@redhat.com> and includes code to save
the write_cache setting from Samuel Thibault.
From: Rik van Riel <riel@redhat.com>
Signed-off-by: Christian Limpach <Christian.Limpach@xensource.com>
Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
---
hw/ide.c | 34 ++++++++++++++++++++++++++++++----
1 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/hw/ide.c b/hw/ide.c
index b73dba2..acb2139 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -383,6 +383,7 @@ typedef struct IDEState {
PCIDevice *pci_dev;
struct BMDMAState *bmdma;
int drive_serial;
+ int write_cache;
/* ide regs */
uint8_t feature;
uint8_t error;
@@ -559,7 +560,8 @@ static void ide_identify(IDEState *s)
put_le16(p + 68, 120);
put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
put_le16(p + 81, 0x16); /* conforms to ata5 */
- put_le16(p + 82, (1 << 14));
+ /* 14=nop 5=write_cache */
+ put_le16(p + 82, (1 << 14) | (1 << 5));
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
put_le16(p + 84, (1 << 14));
@@ -965,6 +967,9 @@ static void ide_sector_write(IDEState *s)
if (n > s->req_nb_sectors)
n = s->req_nb_sectors;
ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+ if (ret == 0 && !s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ }
if (ret != 0) {
ide_rw_error(s);
return;
@@ -1021,6 +1026,13 @@ static void ide_write_dma_cb(void *opaque, int ret)
/* end of transfer ? */
if (s->nsector == 0) {
+ if (!s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) {
+ ide_dma_error(s);
+ return;
+ }
+ }
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
eot:
@@ -2103,7 +2115,17 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
case 0xcc: /* reverting to power-on defaults enable */
case 0x66: /* reverting to power-on defaults disable */
case 0x02: /* write cache enable */
+ s->write_cache = 1;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0x82: /* write cache disable */
+ s->write_cache = 0;
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) goto abort_cmd;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0xaa: /* read look-ahead enable */
case 0x55: /* read look-ahead disable */
case 0x05: /* set advanced power management mode */
@@ -2629,6 +2651,7 @@ static void ide_init2(IDEState *ide_state,
s->irq = irq;
s->sector_write_timer = qemu_new_timer(vm_clock,
ide_sector_write_timer_cb, s);
+ s->write_cache = 0;
ide_reset(s);
}
}
@@ -2657,6 +2680,7 @@ static void ide_save(QEMUFile* f, IDEState *s)
if (s->identify_set) {
qemu_put_buffer(f, (const uint8_t *)s->identify_data, 512);
}
+ qemu_put_8s(f, &s->write_cache);
qemu_put_8s(f, &s->feature);
qemu_put_8s(f, &s->error);
qemu_put_be32s(f, &s->nsector);
@@ -2685,6 +2709,8 @@ static void ide_load(QEMUFile* f, IDEState *s)
if (s->identify_set) {
qemu_get_buffer(f, (uint8_t *)s->identify_data, 512);
}
+ if (version_id >= 2)
+ qemu_get_8s(f, &s->write_cache);
qemu_get_8s(f, &s->feature);
qemu_get_8s(f, &s->error);
qemu_get_be32s(f, &s->nsector);
@@ -3029,7 +3055,7 @@ static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
PCIIDEState *d = opaque;
int ret, i;
- if (version_id != 1)
+ if (version_id != 1 && version_id != 2)
return -EINVAL;
ret = pci_device_load(&d->dev, f);
if (ret < 0)
@@ -3105,7 +3131,7 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/* hd_table must contain 4 block drivers */
@@ -3143,7 +3169,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/***********************************************************/
--
1.4.4.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-03-27 18:02 Ian Jackson
@ 2008-03-27 18:16 ` Paul Brook
2008-03-28 9:38 ` Ian Jackson
0 siblings, 1 reply; 12+ messages in thread
From: Paul Brook @ 2008-03-27 18:16 UTC (permalink / raw)
To: qemu-devel; +Cc: Ian Jackson
On Thursday 27 March 2008, Ian Jackson wrote:
> case 0xcc: /* reverting to power-on defaults enable */
> case 0x66: /* reverting to power-on defaults disable */
> case 0x02: /* write cache enable */
> + s->write_cache = 1;
This look wrong.
Paul
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest
2008-03-27 18:16 ` Paul Brook
@ 2008-03-28 9:38 ` Ian Jackson
0 siblings, 0 replies; 12+ messages in thread
From: Ian Jackson @ 2008-03-28 9:38 UTC (permalink / raw)
To: Paul Brook; +Cc: qemu-devel
[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 533 bytes --]
Paul Brook writes ("Re: [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest"):
> On Thursday 27 March 2008, Ian Jackson wrote:
> > case 0xcc: /* reverting to power-on defaults enable */
> > case 0x66: /* reverting to power-on defaults disable */
> > case 0x02: /* write cache enable */
> > + s->write_cache = 1;
>
> This look wrong.
Oops, that must have been my fault when I ported this feature from
xen-unstable's hg. Here is a fixed version.
Thanks,
Ian.
[-- Attachment #2: make write cacheing controllable by guest (take 2) --]
[-- Type: text/plain, Size: 4713 bytes --]
diff --git a/hw/ide.c b/hw/ide.c
index b73dba2..0fb58e7 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -383,6 +383,7 @@ typedef struct IDEState {
PCIDevice *pci_dev;
struct BMDMAState *bmdma;
int drive_serial;
+ int write_cache;
/* ide regs */
uint8_t feature;
uint8_t error;
@@ -559,7 +560,8 @@ static void ide_identify(IDEState *s)
put_le16(p + 68, 120);
put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
put_le16(p + 81, 0x16); /* conforms to ata5 */
- put_le16(p + 82, (1 << 14));
+ /* 14=nop 5=write_cache */
+ put_le16(p + 82, (1 << 14) | (1 << 5));
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
put_le16(p + 84, (1 << 14));
@@ -965,6 +967,9 @@ static void ide_sector_write(IDEState *s)
if (n > s->req_nb_sectors)
n = s->req_nb_sectors;
ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+ if (ret == 0 && !s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ }
if (ret != 0) {
ide_rw_error(s);
return;
@@ -1021,6 +1026,13 @@ static void ide_write_dma_cb(void *opaque, int ret)
/* end of transfer ? */
if (s->nsector == 0) {
+ if (!s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) {
+ ide_dma_error(s);
+ return;
+ }
+ }
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
eot:
@@ -2102,8 +2114,6 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
switch(s->feature) {
case 0xcc: /* reverting to power-on defaults enable */
case 0x66: /* reverting to power-on defaults disable */
- case 0x02: /* write cache enable */
- case 0x82: /* write cache disable */
case 0xaa: /* read look-ahead enable */
case 0x55: /* read look-ahead disable */
case 0x05: /* set advanced power management mode */
@@ -2117,6 +2127,18 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
+ case 0x02: /* write cache enable */
+ s->write_cache = 1;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
+ case 0x82: /* write cache disable */
+ s->write_cache = 0;
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) goto abort_cmd;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0x03: { /* set transfer mode */
uint8_t val = s->nsector & 0x07;
@@ -2629,6 +2651,7 @@ static void ide_init2(IDEState *ide_state,
s->irq = irq;
s->sector_write_timer = qemu_new_timer(vm_clock,
ide_sector_write_timer_cb, s);
+ s->write_cache = 0;
ide_reset(s);
}
}
@@ -2657,6 +2680,7 @@ static void ide_save(QEMUFile* f, IDEState *s)
if (s->identify_set) {
qemu_put_buffer(f, (const uint8_t *)s->identify_data, 512);
}
+ qemu_put_8s(f, &s->write_cache);
qemu_put_8s(f, &s->feature);
qemu_put_8s(f, &s->error);
qemu_put_be32s(f, &s->nsector);
@@ -2685,6 +2709,8 @@ static void ide_load(QEMUFile* f, IDEState *s)
if (s->identify_set) {
qemu_get_buffer(f, (uint8_t *)s->identify_data, 512);
}
+ if (version_id >= 2)
+ qemu_get_8s(f, &s->write_cache);
qemu_get_8s(f, &s->feature);
qemu_get_8s(f, &s->error);
qemu_get_be32s(f, &s->nsector);
@@ -3029,7 +3055,7 @@ static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
PCIIDEState *d = opaque;
int ret, i;
- if (version_id != 1)
+ if (version_id != 1 && version_id != 2)
return -EINVAL;
ret = pci_device_load(&d->dev, f);
if (ret < 0)
@@ -3105,7 +3131,7 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/* hd_table must contain 4 block drivers */
@@ -3143,7 +3169,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/***********************************************************/
^ permalink raw reply related [flat|nested] 12+ messages in thread
end of thread, other threads:[~2008-03-28 9:39 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-02-25 18:13 [Qemu-devel] [PATCH] ide.c make write cacheing controllable by guest Ian Jackson
2008-02-25 20:50 ` Jamie Lokier
2008-02-26 1:16 ` Chris Wedgwood
2008-02-26 7:32 ` Jamie Lokier
2008-02-26 12:15 ` Ian Jackson
2008-02-26 12:49 ` Jamie Lokier
2008-02-26 16:57 ` Ian Jackson
2008-02-26 17:25 ` Jamie Lokier
2008-02-26 18:11 ` Ian Jackson
-- strict thread matches above, loose matches on Subject: below --
2008-03-27 18:02 Ian Jackson
2008-03-27 18:16 ` Paul Brook
2008-03-28 9:38 ` Ian Jackson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).