linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
@ 2010-01-08  9:36 Michal Novotny
  2010-01-11 20:06 ` Ric Wheeler
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-08  9:36 UTC (permalink / raw)
  To: linux-ext4

[-- Attachment #1: Type: text/plain, Size: 911 bytes --]

This patch extends functionality of e{2|4}fsprogs to add 
EXT2_FLAG_DIRECT flag to be passed to ext2fs_open2() function. This 
internally calls open() function with O_DIRECT and handles the memory 
alignment for both read and write operations.
In some cases direct access to devices is necessary and that was the 
main reason for this patch to be done.

The main reason why this was done is that pygrub (used by xen 
virtualization user-space package, it's a python version of grub for 
paravirtualized guests) sometimes uses outdated version of grub.conf 
file. Modifications to xen package were *not* enough because e2fsprogs 
doesn't open the files directly. That's why I added EXT2_FLAG_DIRECT 
support to make read/write operations work directly when passed. It's 
been tested with pygrub like mentioned above for read operation and it's 
working fine.

Signed-off-by: Michal Novotny <minovotn@redhat.com>

[-- Attachment #2: mig-e4fsprogs-direct-read-write.patch --]
[-- Type: text/x-patch, Size: 5618 bytes --]

Author: Michal Novotny <minovotn@redhat.com>
Date:   Wed Dec 23 11:25:40 2009 +0200

    extend e{2|4}fsprogs functionality to add EXT2_FLAG_DIRECT option
    
    This patch extends functionality of e{2|4}fsprogs to add EXT2_FLAG_DIRECT
    flag to be passed to ext2fs_open2() function. This internally calls open()
    function with O_DIRECT and handles the memory alignment for both read and
    write operations.
    In some cases direct access to devices is necessary and that was the main
    reason for this patch to be done.
    
    Signed-off-by: Michal Novotny <minovotn@redhat.com>

diff -up e2fsprogs-1.41.9/lib/ext2fs/ext2fs.h.opendirect e2fsprogs-1.41.9/lib/ext2fs/ext2fs.h
--- e2fsprogs-1.41.9/lib/ext2fs/ext2fs.h.opendirect	2009-12-23 10:04:05.000000000 +0100
+++ e2fsprogs-1.41.9/lib/ext2fs/ext2fs.h	2009-12-23 10:04:05.000000000 +0100
@@ -172,6 +172,7 @@ typedef struct ext2_file *ext2_file_t;
 #define EXT2_FLAG_EXCLUSIVE		0x4000
 #define EXT2_FLAG_SOFTSUPP_FEATURES	0x8000
 #define EXT2_FLAG_NOFREE_ON_ERROR	0x10000
+#define EXT2_FLAG_DIRECT		0x20000
 
 /*
  * Special flag in the ext2 inode i_flag field that means that this is
diff -up e2fsprogs-1.41.9/lib/ext2fs/ext2_io.h.opendirect e2fsprogs-1.41.9/lib/ext2fs/ext2_io.h
--- e2fsprogs-1.41.9/lib/ext2fs/ext2_io.h.opendirect	2009-08-23 04:44:35.000000000 +0200
+++ e2fsprogs-1.41.9/lib/ext2fs/ext2_io.h	2009-12-23 10:04:05.000000000 +0100
@@ -29,6 +29,7 @@ typedef struct struct_io_channel *io_cha
 typedef struct struct_io_stats *io_stats;
 
 #define CHANNEL_FLAGS_WRITETHROUGH	0x01
+#define CHANNEL_FLAGS_DIRECT		0x02
 
 struct struct_io_channel {
 	errcode_t	magic;
@@ -88,6 +89,7 @@ struct struct_io_manager {
 
 #define IO_FLAG_RW		0x0001
 #define IO_FLAG_EXCLUSIVE	0x0002
+#define IO_FLAG_DIRECT		0x0004
 
 /*
  * Convenience functions....
diff -up e2fsprogs-1.41.9/lib/ext2fs/openfs.c.opendirect e2fsprogs-1.41.9/lib/ext2fs/openfs.c
--- e2fsprogs-1.41.9/lib/ext2fs/openfs.c.opendirect	2009-08-23 04:44:35.000000000 +0200
+++ e2fsprogs-1.41.9/lib/ext2fs/openfs.c	2009-12-23 10:04:05.000000000 +0100
@@ -121,6 +121,8 @@ errcode_t ext2fs_open2(const char *name,
 		io_flags |= IO_FLAG_RW;
 	if (flags & EXT2_FLAG_EXCLUSIVE)
 		io_flags |= IO_FLAG_EXCLUSIVE;
+	if (flags & EXT2_FLAG_DIRECT)
+		io_flags |= IO_FLAG_DIRECT;
 	retval = manager->open(fs->device_name, io_flags, &fs->io);
 	if (retval)
 		goto cleanup;
@@ -296,6 +298,7 @@ errcode_t ext2fs_open2(const char *name,
        }
 	fs->desc_blocks = ext2fs_div_ceil(fs->group_desc_count,
 					  EXT2_DESC_PER_BLOCK(fs->super));
+
 	retval = ext2fs_get_array(fs->desc_blocks, fs->blocksize,
 				&fs->group_desc);
 	if (retval)
diff -up e2fsprogs-1.41.9/lib/ext2fs/unix_io.c.opendirect e2fsprogs-1.41.9/lib/ext2fs/unix_io.c
--- e2fsprogs-1.41.9/lib/ext2fs/unix_io.c.opendirect	2009-08-13 03:39:57.000000000 +0200
+++ e2fsprogs-1.41.9/lib/ext2fs/unix_io.c	2009-12-23 10:16:26.000000000 +0100
@@ -17,6 +17,7 @@
 
 #define _LARGEFILE_SOURCE
 #define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
 
 #include <stdio.h>
 #include <string.h>
@@ -163,15 +164,37 @@ static errcode_t raw_read_blk(io_channel
 	ssize_t		size;
 	ext2_loff_t	location;
 	int		actual = 0;
+	int		memret;
+	void		*aligned_buffer;
+	long		pagesize;
 
 	size = (count < 0) ? -count : count * channel->block_size;
+
 	data->io_stats.bytes_read += size;
 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 		goto error_out;
 	}
-	actual = read(data->dev, buf, size);
+
+	if (channel->flags & CHANNEL_FLAGS_DIRECT) {
+		pagesize = sysconf(_SC_PAGE_SIZE);
+		if (pagesize < 0)
+			goto error_out;
+
+		memret = posix_memalign(&aligned_buffer, pagesize, size);
+		if (memret != 0) {
+			errno = memret;
+			goto error_out;
+		}
+		actual = read(data->dev, aligned_buffer, size);
+		if (actual > 0)
+			memcpy(buf, aligned_buffer, size);
+		free(aligned_buffer);
+	}
+	else
+		actual = read(data->dev, buf, size);
+
 	if (actual != size) {
 		if (actual < 0)
 			actual = 0;
@@ -252,6 +275,9 @@ static errcode_t raw_write_blk(io_channe
 	ext2_loff_t	location;
 	int		actual = 0;
 	errcode_t	retval;
+        int		memret;
+        void		*aligned_buffer;
+        long		pagesize;
 
 	if (count == 1)
 		size = channel->block_size;
@@ -269,7 +295,23 @@ static errcode_t raw_write_blk(io_channe
 		goto error_out;
 	}
 
-	actual = write(data->dev, buf, size);
+	if (channel->flags & CHANNEL_FLAGS_DIRECT) {
+		pagesize = sysconf(_SC_PAGE_SIZE);
+		if (pagesize < 0)
+			goto error_out;
+
+		memret = posix_memalign(&aligned_buffer, pagesize, size);
+		if (memret != 0) {
+			errno = memret;
+			goto error_out;
+		}
+		memcpy(aligned_buffer, buf, size);
+		actual = write(data->dev, aligned_buffer, size);
+		free(aligned_buffer);
+	}
+	else
+		actual = write(data->dev, buf, size);
+
 	if (actual != size) {
 		retval = EXT2_ET_SHORT_WRITE;
 		goto error_out;
@@ -443,6 +485,9 @@ static errcode_t unix_open(const char *n
 	io->write_error = 0;
 	io->refcount = 1;
 
+	if (flags & IO_FLAG_DIRECT)
+		io->flags = CHANNEL_FLAGS_DIRECT;
+
 	memset(data, 0, sizeof(struct unix_private_data));
 	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
 	data->io_stats.num_fields = 2;
@@ -453,6 +498,8 @@ static errcode_t unix_open(const char *n
 	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
 	if (flags & IO_FLAG_EXCLUSIVE)
 		open_flags |= O_EXCL;
+        if (flags & IO_FLAG_DIRECT)
+                open_flags |= O_DIRECT;
 #ifdef HAVE_OPEN64
 	data->dev = open64(io->name, open_flags);
 #else

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-08  9:36 [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option Michal Novotny
@ 2010-01-11 20:06 ` Ric Wheeler
  2010-01-12 10:54   ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Ric Wheeler @ 2010-01-11 20:06 UTC (permalink / raw)
  To: Michal Novotny; +Cc: linux-ext4

On 01/08/2010 04:36 AM, Michal Novotny wrote:
> This patch extends functionality of e{2|4}fsprogs to add 
> EXT2_FLAG_DIRECT flag to be passed to ext2fs_open2() function. This 
> internally calls open() function with O_DIRECT and handles the memory 
> alignment for both read and write operations.
> In some cases direct access to devices is necessary and that was the 
> main reason for this patch to be done.
>
> The main reason why this was done is that pygrub (used by xen 
> virtualization user-space package, it's a python version of grub for 
> paravirtualized guests) sometimes uses outdated version of grub.conf 
> file. Modifications to xen package were *not* enough because e2fsprogs 
> doesn't open the files directly. That's why I added EXT2_FLAG_DIRECT 
> support to make read/write operations work directly when passed. It's 
> been tested with pygrub like mentioned above for read operation and 
> it's working fine.
>
> Signed-off-by: Michal Novotny <minovotn@redhat.com>

Can you add to this some kind of data flow overview?  Seems like a 
really odd way to update this file system...

ric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-11 20:06 ` Ric Wheeler
@ 2010-01-12 10:54   ` Michal Novotny
  2010-01-12 11:59     ` Ric Wheeler
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 10:54 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: linux-ext4

On 01/11/2010 09:06 PM, Ric Wheeler wrote:
> On 01/08/2010 04:36 AM, Michal Novotny wrote:
>> This patch extends functionality of e{2|4}fsprogs to add 
>> EXT2_FLAG_DIRECT flag to be passed to ext2fs_open2() function. This 
>> internally calls open() function with O_DIRECT and handles the memory 
>> alignment for both read and write operations.
>> In some cases direct access to devices is necessary and that was the 
>> main reason for this patch to be done.
>>
>> The main reason why this was done is that pygrub (used by xen 
>> virtualization user-space package, it's a python version of grub for 
>> paravirtualized guests) sometimes uses outdated version of grub.conf 
>> file. Modifications to xen package were *not* enough because 
>> e2fsprogs doesn't open the files directly. That's why I added 
>> EXT2_FLAG_DIRECT support to make read/write operations work directly 
>> when passed. It's been tested with pygrub like mentioned above for 
>> read operation and it's working fine.
>>
>> Signed-off-by: Michal Novotny <minovotn@redhat.com>
>
> Can you add to this some kind of data flow overview?  Seems like a 
> really odd way to update this file system...
>
> ric
>
What do you mean by adding some kind of data flow overview? I am not 
working on file systems but I needed option  open files directly to be 
added to e2fsprogs to make pygrub working right with not using 
cached/outdated data so this was the first patch I did for file system 
so I don't realy know what do you mean by data flow overview ...

Thanks,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 10:54   ` Michal Novotny
@ 2010-01-12 11:59     ` Ric Wheeler
  2010-01-12 12:15       ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Ric Wheeler @ 2010-01-12 11:59 UTC (permalink / raw)
  To: Michal Novotny; +Cc: linux-ext4, Christoph Hellwig

On 01/12/2010 05:54 AM, Michal Novotny wrote:
> On 01/11/2010 09:06 PM, Ric Wheeler wrote:
>> On 01/08/2010 04:36 AM, Michal Novotny wrote:
>>> This patch extends functionality of e{2|4}fsprogs to add
>>> EXT2_FLAG_DIRECT flag to be passed to ext2fs_open2() function. This
>>> internally calls open() function with O_DIRECT and handles the memory
>>> alignment for both read and write operations.
>>> In some cases direct access to devices is necessary and that was the
>>> main reason for this patch to be done.
>>>
>>> The main reason why this was done is that pygrub (used by xen
>>> virtualization user-space package, it's a python version of grub for
>>> paravirtualized guests) sometimes uses outdated version of grub.conf
>>> file. Modifications to xen package were *not* enough because
>>> e2fsprogs doesn't open the files directly. That's why I added
>>> EXT2_FLAG_DIRECT support to make read/write operations work directly
>>> when passed. It's been tested with pygrub like mentioned above for
>>> read operation and it's working fine.
>>>
>>> Signed-off-by: Michal Novotny <minovotn@redhat.com>
>>
>> Can you add to this some kind of data flow overview? Seems like a
>> really odd way to update this file system...
>>
>> ric
>>
> What do you mean by adding some kind of data flow overview? I am not
> working on file systems but I needed option open files directly to be
> added to e2fsprogs to make pygrub working right with not using
> cached/outdated data so this was the first patch I did for file system
> so I don't realy know what do you mean by data flow overview ...
>
> Thanks,
> Michal

Hi Michael,

What would be useful is where the data is written (guest/host) and the state of 
the file system (mounted/unmounted). Which app it is that writes the data that 
gets cached, who needs to read it and misses that cached data.

 From the above, it is really not clear that you should not be able to use 
existing interfaces to flush any cached data to disk....

ric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 11:59     ` Ric Wheeler
@ 2010-01-12 12:15       ` Michal Novotny
  2010-01-12 12:23         ` Christoph Hellwig
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 12:15 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: linux-ext4, Christoph Hellwig

On 01/12/2010 12:59 PM, Ric Wheeler wrote:
> On 01/12/2010 05:54 AM, Michal Novotny wrote:
>> On 01/11/2010 09:06 PM, Ric Wheeler wrote:
>>> On 01/08/2010 04:36 AM, Michal Novotny wrote:
>>>> This patch extends functionality of e{2|4}fsprogs to add
>>>> EXT2_FLAG_DIRECT flag to be passed to ext2fs_open2() function. This
>>>> internally calls open() function with O_DIRECT and handles the memory
>>>> alignment for both read and write operations.
>>>> In some cases direct access to devices is necessary and that was the
>>>> main reason for this patch to be done.
>>>>
>>>> The main reason why this was done is that pygrub (used by xen
>>>> virtualization user-space package, it's a python version of grub for
>>>> paravirtualized guests) sometimes uses outdated version of grub.conf
>>>> file. Modifications to xen package were *not* enough because
>>>> e2fsprogs doesn't open the files directly. That's why I added
>>>> EXT2_FLAG_DIRECT support to make read/write operations work directly
>>>> when passed. It's been tested with pygrub like mentioned above for
>>>> read operation and it's working fine.
>>>>
>>>> Signed-off-by: Michal Novotny <minovotn@redhat.com>
>>>
>>> Can you add to this some kind of data flow overview? Seems like a
>>> really odd way to update this file system...
>>>
>>> ric
>>>
>> What do you mean by adding some kind of data flow overview? I am not
>> working on file systems but I needed option open files directly to be
>> added to e2fsprogs to make pygrub working right with not using
>> cached/outdated data so this was the first patch I did for file system
>> so I don't realy know what do you mean by data flow overview ...
>>
>> Thanks,
>> Michal
>
> Hi Michael,
>
> What would be useful is where the data is written (guest/host) and the 
> state of the file system (mounted/unmounted). Which app it is that 
> writes the data that gets cached, who needs to read it and misses that 
> cached data.
>
> From the above, it is really not clear that you should not be able to 
> use existing interfaces to flush any cached data to disk....
>
> ric
>
Hi Ric,
I don't really know if I see your point but the thing here is that there 
was no way to open a file directly (ie. using O_DIRECT). The direct 
write support has been added only to make it possible to use both read 
and write directly. The main reason to create this patch was to add 
direct read support and flush capability won't help me at all. I am 
working in Red Hat, Virtualization team on Xen so I am really not that 
much familiar with file systems but what I needed was an option to read 
the data directly (using O_DIRECT) in e2fsprogs. One bug was about 
pygrub (Python version of GRUB of Xen PV guests that is internally using 
e2fsprogs functionality to access data on ext2/3/4 partition to boot the 
PV guests) uses outdated/cached data so some modifications were 
necessary to open everything directly... It was done to Xen by a 
collegue but there was some open() call with no O_DIRECT set so I 
investigated this further and it turned out that this call was coming 
from e2fsprogs library. That was the reason why I needed to add O_DIRECT 
read support and O_DIRECT write support was added just to make it for 
both read and write. For write, yes, there may be the option to flush 
the disk but not for read. To workaround this but dropping caches is a 
really ugly idea so finally patching e2fsprogs too (along with Xen) 
turned out to the best idea. Once again, this was done to support 
O_DIRECT for read function only - it is *not* used for write function 
and it was added just to make it complete for both read and write 
functions. Regarding the functionality of read function, this is done in 
the host machine before the PV guest boots. It reads grub.conf, then it 
reads initrd and kernel and boots from it... Some incidents of booting 
outdated grub.conf/initrd and kernel has been reported to us and this 
patch is to make it don't use caches so outdated data can't be used...

Is the scenario clear or do you need some other information about that?

Thanks,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:15       ` Michal Novotny
@ 2010-01-12 12:23         ` Christoph Hellwig
  2010-01-12 12:30           ` Michal Novotny
  2010-01-12 15:16           ` Eric Sandeen
  0 siblings, 2 replies; 31+ messages in thread
From: Christoph Hellwig @ 2010-01-12 12:23 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Ric Wheeler, linux-ext4, Christoph Hellwig

On Tue, Jan 12, 2010 at 01:15:49PM +0100, Michal Novotny wrote:
> I don't really know if I see your point but the thing here is that there  
> was no way to open a file directly (ie. using O_DIRECT). The direct  
> write support has been added only to make it possible to use both read  
> and write directly. The main reason to create this patch was to add  
> direct read support and flush capability won't help me at all. I am  
> working in Red Hat, Virtualization team on Xen so I am really not that  
> much familiar with file systems but what I needed was an option to read  
> the data directly (using O_DIRECT) in e2fsprogs. One bug was about  
> pygrub (Python version of GRUB of Xen PV guests that is internally using  
> e2fsprogs functionality to access data on ext2/3/4 partition to boot the  
> PV guests) uses outdated/cached data so some modifications were  
> necessary to open everything directly...

So to get things staigt:  you're using e2fsprogs to manipulate a life
filesystem and thing using O_DIRECT saves your ass?  I think you need to
rething your model of operation fundamentally in that case. 


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:23         ` Christoph Hellwig
@ 2010-01-12 12:30           ` Michal Novotny
  2010-01-12 12:46             ` Christoph Hellwig
  2010-01-12 12:47             ` Andreas Dilger
  2010-01-12 15:16           ` Eric Sandeen
  1 sibling, 2 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 12:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ric Wheeler, linux-ext4

On 01/12/2010 01:23 PM, Christoph Hellwig wrote:
> On Tue, Jan 12, 2010 at 01:15:49PM +0100, Michal Novotny wrote:
>    
>> I don't really know if I see your point but the thing here is that there
>> was no way to open a file directly (ie. using O_DIRECT). The direct
>> write support has been added only to make it possible to use both read
>> and write directly. The main reason to create this patch was to add
>> direct read support and flush capability won't help me at all. I am
>> working in Red Hat, Virtualization team on Xen so I am really not that
>> much familiar with file systems but what I needed was an option to read
>> the data directly (using O_DIRECT) in e2fsprogs. One bug was about
>> pygrub (Python version of GRUB of Xen PV guests that is internally using
>> e2fsprogs functionality to access data on ext2/3/4 partition to boot the
>> PV guests) uses outdated/cached data so some modifications were
>> necessary to open everything directly...
>>      
> So to get things staigt:  you're using e2fsprogs to manipulate a life
> filesystem and thing using O_DIRECT saves your ass?  I think you need to
> rething your model of operation fundamentally in that case.
>
>    
Not really, pygrub doesn't do any manipulation with file system and 
also, it's not working on a life file system. It's called before the 
guest boots up to read information about grub.conf/initrd and kernel for 
PV guest and after this is read and selected in pygrub then the guest is 
booted using the kernel and initrd extracted from the image (after which 
the file is closed). Once again, nothing uses write support and it was 
added just to make it use O_DIRECT for both read and write operations 
but only pygrub uses only read support and O_DIRECT passed here is the 
only way to make it use non-cached data.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:30           ` Michal Novotny
@ 2010-01-12 12:46             ` Christoph Hellwig
  2010-01-12 13:01               ` Michal Novotny
  2010-01-12 12:47             ` Andreas Dilger
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2010-01-12 12:46 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, Ric Wheeler, linux-ext4

On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
> Not really, pygrub doesn't do any manipulation with file system and  
> also, it's not working on a life file system. It's called before the  
> guest boots up to read information about grub.conf/initrd and kernel for  
> PV guest and after this is read and selected in pygrub then the guest is  
> booted using the kernel and initrd extracted from the image (after which  
> the file is closed). Once again, nothing uses write support and it was  
> added just to make it use O_DIRECT for both read and write operations  
> but only pygrub uses only read support and O_DIRECT passed here is the  
> only way to make it use non-cached data.

So what caches get in the way?  From the above it seems the situation
is the following:

 - filesystem N is a guest filesystem.  It's not usually mounted on the
   host, except for initial setup long time ago
 - before booting a guest your "pygrub" tools needs to read files on
   it, and it's doing so using e2fsprogs
 - once the guest is life it uses the extN kernel driver to access the
   filesystem

nowhere in this cycle you should have any stale cached data.  The kernel
always makes sure to write back data on umount/reboot, as does e2fsprogs
if actually used to write data (which you said is not the case anyway).

The only data that may be in the cache are unmodified data from reads
on the block device from either e2fsprogs or a suboptimal virtual block
device implementation, but these can't cause any problems.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:30           ` Michal Novotny
  2010-01-12 12:46             ` Christoph Hellwig
@ 2010-01-12 12:47             ` Andreas Dilger
  2010-01-12 13:04               ` Michal Novotny
  1 sibling, 1 reply; 31+ messages in thread
From: Andreas Dilger @ 2010-01-12 12:47 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, Ric Wheeler, linux-ext4

On 2010-01-12, at 08:30, Michal Novotny wrote:
> On 01/12/2010 01:23 PM, Christoph Hellwig wrote:
>> So to get things staigt:  you're using e2fsprogs to manipulate a life
>> filesystem and thing using O_DIRECT saves your ass?  I think you  
>> need to
>> rething your model of operation fundamentally in that case.
>>
>
> Not really, pygrub doesn't do any manipulation with file system and  
> also, it's not working on a life file system. It's called before the  
> guest boots up to read information about grub.conf/initrd and kernel  
> for PV guest and after this is read and selected in pygrub then the  
> guest is booted using the kernel and initrd extracted from the image  
> (after which the file is closed). Once again, nothing uses write  
> support and it was added just to make it use O_DIRECT for both read  
> and write operations but only pygrub uses only read support and  
> O_DIRECT passed here is the only way to make it use non-cached data.


Michal, I think the thing that is confusing everyone is that if you  
are not accessing a live filesystem, and you are not doing the writes  
yourself, then why is it bad to read cached data?  How is it that the  
cached data becomes stale if pygrub isn't modifying it, and there is  
nothing else mounting the filesystem?

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:46             ` Christoph Hellwig
@ 2010-01-12 13:01               ` Michal Novotny
  2010-01-12 13:04                 ` Ric Wheeler
  2010-01-12 16:38                 ` Christoph Hellwig
  0 siblings, 2 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 13:01 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ric Wheeler, linux-ext4

On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>    
>> Not really, pygrub doesn't do any manipulation with file system and
>> also, it's not working on a life file system. It's called before the
>> guest boots up to read information about grub.conf/initrd and kernel for
>> PV guest and after this is read and selected in pygrub then the guest is
>> booted using the kernel and initrd extracted from the image (after which
>> the file is closed). Once again, nothing uses write support and it was
>> added just to make it use O_DIRECT for both read and write operations
>> but only pygrub uses only read support and O_DIRECT passed here is the
>> only way to make it use non-cached data.
>>      
> So what caches get in the way?  From the above it seems the situation
> is the following:
>
>   - filesystem N is a guest filesystem.  It's not usually mounted on the
>     host, except for initial setup long time ago
>    

Yes, it is really a guest file system. This is not mounted in the host 
and the reason is to get actual version of grub.conf, initrd and kernel 
to be booted...

>   - before booting a guest your "pygrub" tools needs to read files on
>     it, and it's doing so using e2fsprogs
>    

Correct.

>   - once the guest is life it uses the extN kernel driver to access the
>     filesystem
>    

That's right. So this is no longer pygrub responsibility...

> nowhere in this cycle you should have any stale cached data.  The kernel
> always makes sure to write back data on umount/reboot, as does e2fsprogs
> if actually used to write data (which you said is not the case anyway).
>    

In fact I was unable to run into those problems myself but 
reporter/customer did.

> The only data that may be in the cache are unmodified data from reads
> on the block device from either e2fsprogs or a suboptimal virtual block
> device implementation, but these can't cause any problems.
>    
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:47             ` Andreas Dilger
@ 2010-01-12 13:04               ` Michal Novotny
  0 siblings, 0 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 13:04 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Christoph Hellwig, Ric Wheeler, linux-ext4

On 01/12/2010 01:47 PM, Andreas Dilger wrote:
> On 2010-01-12, at 08:30, Michal Novotny wrote:
>> On 01/12/2010 01:23 PM, Christoph Hellwig wrote:
>>> So to get things staigt:  you're using e2fsprogs to manipulate a life
>>> filesystem and thing using O_DIRECT saves your ass?  I think you 
>>> need to
>>> rething your model of operation fundamentally in that case.
>>>
>>
>> Not really, pygrub doesn't do any manipulation with file system and 
>> also, it's not working on a life file system. It's called before the 
>> guest boots up to read information about grub.conf/initrd and kernel 
>> for PV guest and after this is read and selected in pygrub then the 
>> guest is booted using the kernel and initrd extracted from the image 
>> (after which the file is closed). Once again, nothing uses write 
>> support and it was added just to make it use O_DIRECT for both read 
>> and write operations but only pygrub uses only read support and 
>> O_DIRECT passed here is the only way to make it use non-cached data.
>
>
> Michal, I think the thing that is confusing everyone is that if you 
> are not accessing a live filesystem, and you are not doing the writes 
> yourself, then why is it bad to read cached data?  How is it that the 
> cached data becomes stale if pygrub isn't modifying it, and there is 
> nothing else mounting the filesystem?
>
Hi Andreas,
it really is bad because the host have data about grub.conf, kernel and 
initrd cached but if you run the guest, update kernel or grub.conf etc., 
you need to see the changes here to boot the new/updated kernel now. Not 
the old, cached one. This is why it is bad to read cached data. Pygrub 
isn't modifying it but the guest can modify the data itself...

Thanks,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:01               ` Michal Novotny
@ 2010-01-12 13:04                 ` Ric Wheeler
  2010-01-12 13:12                   ` Michal Novotny
  2010-01-12 16:38                 ` Christoph Hellwig
  1 sibling, 1 reply; 31+ messages in thread
From: Ric Wheeler @ 2010-01-12 13:04 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, linux-ext4

On 01/12/2010 08:01 AM, Michal Novotny wrote:
> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>> Not really, pygrub doesn't do any manipulation with file system and
>>> also, it's not working on a life file system. It's called before the
>>> guest boots up to read information about grub.conf/initrd and kernel for
>>> PV guest and after this is read and selected in pygrub then the guest is
>>> booted using the kernel and initrd extracted from the image (after which
>>> the file is closed). Once again, nothing uses write support and it was
>>> added just to make it use O_DIRECT for both read and write operations
>>> but only pygrub uses only read support and O_DIRECT passed here is the
>>> only way to make it use non-cached data.
>> So what caches get in the way? From the above it seems the situation
>> is the following:
>>
>> - filesystem N is a guest filesystem. It's not usually mounted on the
>> host, except for initial setup long time ago
>
> Yes, it is really a guest file system. This is not mounted in the host
> and the reason is to get actual version of grub.conf, initrd and kernel
> to be booted...
>
>> - before booting a guest your "pygrub" tools needs to read files on
>> it, and it's doing so using e2fsprogs
>
> Correct.
>
>> - once the guest is life it uses the extN kernel driver to access the
>> filesystem
>
> That's right. So this is no longer pygrub responsibility...
>
>> nowhere in this cycle you should have any stale cached data. The kernel
>> always makes sure to write back data on umount/reboot, as does e2fsprogs
>> if actually used to write data (which you said is not the case anyway).
>
> In fact I was unable to run into those problems myself but
> reporter/customer did.
>
>> The only data that may be in the cache are unmodified data from reads
>> on the block device from either e2fsprogs or a suboptimal virtual block
>> device implementation, but these can't cause any problems.
> Michal

If the guest is the only one (when running) that installs a new grub.conf file 
and kernel and it shuts down properly, you should be good. It if does not shut 
down cleanly, it could have a stale grub.conf file (or worse, a partially 
written one), but using O_DIRECT to bypass the file system cache should not help.

If we cannot reproduce this failure, sounds like we need to go back and get a 
better understanding of what the customer saw?

ric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:04                 ` Ric Wheeler
@ 2010-01-12 13:12                   ` Michal Novotny
  2010-01-12 13:23                     ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 13:12 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: Christoph Hellwig, linux-ext4

On 01/12/2010 02:04 PM, Ric Wheeler wrote:
> On 01/12/2010 08:01 AM, Michal Novotny wrote:
>> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>>> Not really, pygrub doesn't do any manipulation with file system and
>>>> also, it's not working on a life file system. It's called before the
>>>> guest boots up to read information about grub.conf/initrd and 
>>>> kernel for
>>>> PV guest and after this is read and selected in pygrub then the 
>>>> guest is
>>>> booted using the kernel and initrd extracted from the image (after 
>>>> which
>>>> the file is closed). Once again, nothing uses write support and it was
>>>> added just to make it use O_DIRECT for both read and write operations
>>>> but only pygrub uses only read support and O_DIRECT passed here is the
>>>> only way to make it use non-cached data.
>>> So what caches get in the way? From the above it seems the situation
>>> is the following:
>>>
>>> - filesystem N is a guest filesystem. It's not usually mounted on the
>>> host, except for initial setup long time ago
>>
>> Yes, it is really a guest file system. This is not mounted in the host
>> and the reason is to get actual version of grub.conf, initrd and kernel
>> to be booted...
>>
>>> - before booting a guest your "pygrub" tools needs to read files on
>>> it, and it's doing so using e2fsprogs
>>
>> Correct.
>>
>>> - once the guest is life it uses the extN kernel driver to access the
>>> filesystem
>>
>> That's right. So this is no longer pygrub responsibility...
>>
>>> nowhere in this cycle you should have any stale cached data. The kernel
>>> always makes sure to write back data on umount/reboot, as does 
>>> e2fsprogs
>>> if actually used to write data (which you said is not the case anyway).
>>
>> In fact I was unable to run into those problems myself but
>> reporter/customer did.
>>
>>> The only data that may be in the cache are unmodified data from reads
>>> on the block device from either e2fsprogs or a suboptimal virtual block
>>> device implementation, but these can't cause any problems.
>> Michal
>
> If the guest is the only one (when running) that installs a new 
> grub.conf file and kernel and it shuts down properly, you should be 
> good. It if does not shut down cleanly, it could have a stale 
> grub.conf file (or worse, a partially written one), but using O_DIRECT 
> to bypass the file system cache should not help.
>
> If we cannot reproduce this failure, sounds like we need to go back 
> and get a better understanding of what the customer saw?
>
> ric
>
That's right. I am going write an e-mail regarding this information to 
the reproducer if this bug and tell him that I need more information 
about what's happening at the customer side.

Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:12                   ` Michal Novotny
@ 2010-01-12 13:23                     ` Michal Novotny
  2010-01-12 13:29                       ` Ric Wheeler
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 13:23 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: Christoph Hellwig, linux-ext4

On 01/12/2010 02:12 PM, Michal Novotny wrote:
> On 01/12/2010 02:04 PM, Ric Wheeler wrote:
>> On 01/12/2010 08:01 AM, Michal Novotny wrote:
>>> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>>>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>>>> Not really, pygrub doesn't do any manipulation with file system and
>>>>> also, it's not working on a life file system. It's called before the
>>>>> guest boots up to read information about grub.conf/initrd and 
>>>>> kernel for
>>>>> PV guest and after this is read and selected in pygrub then the 
>>>>> guest is
>>>>> booted using the kernel and initrd extracted from the image (after 
>>>>> which
>>>>> the file is closed). Once again, nothing uses write support and it 
>>>>> was
>>>>> added just to make it use O_DIRECT for both read and write operations
>>>>> but only pygrub uses only read support and O_DIRECT passed here is 
>>>>> the
>>>>> only way to make it use non-cached data.
>>>> So what caches get in the way? From the above it seems the situation
>>>> is the following:
>>>>
>>>> - filesystem N is a guest filesystem. It's not usually mounted on the
>>>> host, except for initial setup long time ago
>>>
>>> Yes, it is really a guest file system. This is not mounted in the host
>>> and the reason is to get actual version of grub.conf, initrd and kernel
>>> to be booted...
>>>
>>>> - before booting a guest your "pygrub" tools needs to read files on
>>>> it, and it's doing so using e2fsprogs
>>>
>>> Correct.
>>>
>>>> - once the guest is life it uses the extN kernel driver to access the
>>>> filesystem
>>>
>>> That's right. So this is no longer pygrub responsibility...
>>>
>>>> nowhere in this cycle you should have any stale cached data. The 
>>>> kernel
>>>> always makes sure to write back data on umount/reboot, as does 
>>>> e2fsprogs
>>>> if actually used to write data (which you said is not the case 
>>>> anyway).
>>>
>>> In fact I was unable to run into those problems myself but
>>> reporter/customer did.
>>>
>>>> The only data that may be in the cache are unmodified data from reads
>>>> on the block device from either e2fsprogs or a suboptimal virtual 
>>>> block
>>>> device implementation, but these can't cause any problems.
>>> Michal
>>
>> If the guest is the only one (when running) that installs a new 
>> grub.conf file and kernel and it shuts down properly, you should be 
>> good. It if does not shut down cleanly, it could have a stale 
>> grub.conf file (or worse, a partially written one), but using 
>> O_DIRECT to bypass the file system cache should not help.
>>
>> If we cannot reproduce this failure, sounds like we need to go back 
>> and get a better understanding of what the customer saw?
>>
>> ric
>>
> That's right. I am going write an e-mail regarding this information to 
> the reproducer if this bug and tell him that I need more information 
> about what's happening at the customer side.
>
One more thing to point out, let's have a look at: 
https://bugzilla.redhat.com/show_bug.cgi?id=466681#c15 .This is about 
workaround to drop caches to be added to pygrub in the host machine 
using this command:

echo 1>  /proc/sys/vm/drop_caches

So this really looks like the caching issue if it's working fine after 
dropping the caches. That may be the reason why this could be fine with 
this patch present in e2fsprogs.

  Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:23                     ` Michal Novotny
@ 2010-01-12 13:29                       ` Ric Wheeler
  2010-01-12 13:33                         ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Ric Wheeler @ 2010-01-12 13:29 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, linux-ext4

On 01/12/2010 08:23 AM, Michal Novotny wrote:
> On 01/12/2010 02:12 PM, Michal Novotny wrote:
>> On 01/12/2010 02:04 PM, Ric Wheeler wrote:
>>> On 01/12/2010 08:01 AM, Michal Novotny wrote:
>>>> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>>>>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>>>>> Not really, pygrub doesn't do any manipulation with file system and
>>>>>> also, it's not working on a life file system. It's called before the
>>>>>> guest boots up to read information about grub.conf/initrd and
>>>>>> kernel for
>>>>>> PV guest and after this is read and selected in pygrub then the
>>>>>> guest is
>>>>>> booted using the kernel and initrd extracted from the image (after
>>>>>> which
>>>>>> the file is closed). Once again, nothing uses write support and it
>>>>>> was
>>>>>> added just to make it use O_DIRECT for both read and write operations
>>>>>> but only pygrub uses only read support and O_DIRECT passed here is
>>>>>> the
>>>>>> only way to make it use non-cached data.
>>>>> So what caches get in the way? From the above it seems the situation
>>>>> is the following:
>>>>>
>>>>> - filesystem N is a guest filesystem. It's not usually mounted on the
>>>>> host, except for initial setup long time ago
>>>>
>>>> Yes, it is really a guest file system. This is not mounted in the host
>>>> and the reason is to get actual version of grub.conf, initrd and kernel
>>>> to be booted...
>>>>
>>>>> - before booting a guest your "pygrub" tools needs to read files on
>>>>> it, and it's doing so using e2fsprogs
>>>>
>>>> Correct.
>>>>
>>>>> - once the guest is life it uses the extN kernel driver to access the
>>>>> filesystem
>>>>
>>>> That's right. So this is no longer pygrub responsibility...
>>>>
>>>>> nowhere in this cycle you should have any stale cached data. The
>>>>> kernel
>>>>> always makes sure to write back data on umount/reboot, as does
>>>>> e2fsprogs
>>>>> if actually used to write data (which you said is not the case
>>>>> anyway).
>>>>
>>>> In fact I was unable to run into those problems myself but
>>>> reporter/customer did.
>>>>
>>>>> The only data that may be in the cache are unmodified data from reads
>>>>> on the block device from either e2fsprogs or a suboptimal virtual
>>>>> block
>>>>> device implementation, but these can't cause any problems.
>>>> Michal
>>>
>>> If the guest is the only one (when running) that installs a new
>>> grub.conf file and kernel and it shuts down properly, you should be
>>> good. It if does not shut down cleanly, it could have a stale
>>> grub.conf file (or worse, a partially written one), but using
>>> O_DIRECT to bypass the file system cache should not help.
>>>
>>> If we cannot reproduce this failure, sounds like we need to go back
>>> and get a better understanding of what the customer saw?
>>>
>>> ric
>>>
>> That's right. I am going write an e-mail regarding this information to
>> the reproducer if this bug and tell him that I need more information
>> about what's happening at the customer side.
>>
> One more thing to point out, let's have a look at:
> https://bugzilla.redhat.com/show_bug.cgi?id=466681#c15 .This is about
> workaround to drop caches to be added to pygrub in the host machine
> using this command:
>
> echo 1> /proc/sys/vm/drop_caches
>
> So this really looks like the caching issue if it's working fine after
> dropping the caches. That may be the reason why this could be fine with
> this patch present in e2fsprogs.
>
> Michal

That BZ has a pretty long and twisted history, but after a quick read, I still 
don't see why a cleanly shutdown guest would have issues with caching that using 
O_DIRECT on read would help.

We will need to dig into a bit more...

ric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:29                       ` Ric Wheeler
@ 2010-01-12 13:33                         ` Michal Novotny
  2010-01-12 14:33                           ` Chris Lee
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 13:33 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: Christoph Hellwig, linux-ext4

On 01/12/2010 02:29 PM, Ric Wheeler wrote:
> On 01/12/2010 08:23 AM, Michal Novotny wrote:
>> On 01/12/2010 02:12 PM, Michal Novotny wrote:
>>> On 01/12/2010 02:04 PM, Ric Wheeler wrote:
>>>> On 01/12/2010 08:01 AM, Michal Novotny wrote:
>>>>> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>>>>>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>>>>>> Not really, pygrub doesn't do any manipulation with file system and
>>>>>>> also, it's not working on a life file system. It's called before 
>>>>>>> the
>>>>>>> guest boots up to read information about grub.conf/initrd and
>>>>>>> kernel for
>>>>>>> PV guest and after this is read and selected in pygrub then the
>>>>>>> guest is
>>>>>>> booted using the kernel and initrd extracted from the image (after
>>>>>>> which
>>>>>>> the file is closed). Once again, nothing uses write support and it
>>>>>>> was
>>>>>>> added just to make it use O_DIRECT for both read and write 
>>>>>>> operations
>>>>>>> but only pygrub uses only read support and O_DIRECT passed here is
>>>>>>> the
>>>>>>> only way to make it use non-cached data.
>>>>>> So what caches get in the way? From the above it seems the situation
>>>>>> is the following:
>>>>>>
>>>>>> - filesystem N is a guest filesystem. It's not usually mounted on 
>>>>>> the
>>>>>> host, except for initial setup long time ago
>>>>>
>>>>> Yes, it is really a guest file system. This is not mounted in the 
>>>>> host
>>>>> and the reason is to get actual version of grub.conf, initrd and 
>>>>> kernel
>>>>> to be booted...
>>>>>
>>>>>> - before booting a guest your "pygrub" tools needs to read files on
>>>>>> it, and it's doing so using e2fsprogs
>>>>>
>>>>> Correct.
>>>>>
>>>>>> - once the guest is life it uses the extN kernel driver to access 
>>>>>> the
>>>>>> filesystem
>>>>>
>>>>> That's right. So this is no longer pygrub responsibility...
>>>>>
>>>>>> nowhere in this cycle you should have any stale cached data. The
>>>>>> kernel
>>>>>> always makes sure to write back data on umount/reboot, as does
>>>>>> e2fsprogs
>>>>>> if actually used to write data (which you said is not the case
>>>>>> anyway).
>>>>>
>>>>> In fact I was unable to run into those problems myself but
>>>>> reporter/customer did.
>>>>>
>>>>>> The only data that may be in the cache are unmodified data from 
>>>>>> reads
>>>>>> on the block device from either e2fsprogs or a suboptimal virtual
>>>>>> block
>>>>>> device implementation, but these can't cause any problems.
>>>>> Michal
>>>>
>>>> If the guest is the only one (when running) that installs a new
>>>> grub.conf file and kernel and it shuts down properly, you should be
>>>> good. It if does not shut down cleanly, it could have a stale
>>>> grub.conf file (or worse, a partially written one), but using
>>>> O_DIRECT to bypass the file system cache should not help.
>>>>
>>>> If we cannot reproduce this failure, sounds like we need to go back
>>>> and get a better understanding of what the customer saw?
>>>>
>>>> ric
>>>>
>>> That's right. I am going write an e-mail regarding this information to
>>> the reproducer if this bug and tell him that I need more information
>>> about what's happening at the customer side.
>>>
>> One more thing to point out, let's have a look at:
>> https://bugzilla.redhat.com/show_bug.cgi?id=466681#c15 .This is about
>> workaround to drop caches to be added to pygrub in the host machine
>> using this command:
>>
>> echo 1> /proc/sys/vm/drop_caches
>>
>> So this really looks like the caching issue if it's working fine after
>> dropping the caches. That may be the reason why this could be fine with
>> this patch present in e2fsprogs.
>>
>> Michal
>
> That BZ has a pretty long and twisted history, but after a quick read, 
> I still don't see why a cleanly shutdown guest would have issues with 
> caching that using O_DIRECT on read would help.
>
> We will need to dig into a bit more...
>
> ric
>
I am not saying we don't  need to dig a little bit more, we surely do 
but unfortunately I am waiting for information from reporter. But I am 
also thinking that this O_DIRECT functionality support to bypass caches 
could be useful...

Thanks,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:33                         ` Michal Novotny
@ 2010-01-12 14:33                           ` Chris Lee
  2010-01-12 14:37                             ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Chris Lee @ 2010-01-12 14:33 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Ric Wheeler, Christoph Hellwig, linux-ext4



Michal Novotny wrote:
> On 01/12/2010 02:29 PM, Ric Wheeler wrote:
>> On 01/12/2010 08:23 AM, Michal Novotny wrote:
>>> On 01/12/2010 02:12 PM, Michal Novotny wrote:
>>>> On 01/12/2010 02:04 PM, Ric Wheeler wrote:
>>>>> On 01/12/2010 08:01 AM, Michal Novotny wrote:
>>>>>> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>>>>>>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>>>>>>> Not really, pygrub doesn't do any manipulation with file system 
>>>>>>>> and
>>>>>>>> also, it's not working on a life file system. It's called 
>>>>>>>> before the
>>>>>>>> guest boots up to read information about grub.conf/initrd and
>>>>>>>> kernel for
>>>>>>>> PV guest and after this is read and selected in pygrub then the
>>>>>>>> guest is
>>>>>>>> booted using the kernel and initrd extracted from the image (after
>>>>>>>> which
>>>>>>>> the file is closed). Once again, nothing uses write support and it
>>>>>>>> was
>>>>>>>> added just to make it use O_DIRECT for both read and write 
>>>>>>>> operations
>>>>>>>> but only pygrub uses only read support and O_DIRECT passed here is
>>>>>>>> the
>>>>>>>> only way to make it use non-cached data.
>>>>>>> So what caches get in the way? From the above it seems the 
>>>>>>> situation
>>>>>>> is the following:
>>>>>>>
>>>>>>> - filesystem N is a guest filesystem. It's not usually mounted 
>>>>>>> on the
>>>>>>> host, except for initial setup long time ago
>>>>>>
>>>>>> Yes, it is really a guest file system. This is not mounted in the 
>>>>>> host
>>>>>> and the reason is to get actual version of grub.conf, initrd and 
>>>>>> kernel
>>>>>> to be booted...
>>>>>>
>>>>>>> - before booting a guest your "pygrub" tools needs to read files on
>>>>>>> it, and it's doing so using e2fsprogs
>>>>>>
>>>>>> Correct.
>>>>>>
>>>>>>> - once the guest is life it uses the extN kernel driver to 
>>>>>>> access the
>>>>>>> filesystem
>>>>>>
>>>>>> That's right. So this is no longer pygrub responsibility...
>>>>>>
>>>>>>> nowhere in this cycle you should have any stale cached data. The
>>>>>>> kernel
>>>>>>> always makes sure to write back data on umount/reboot, as does
>>>>>>> e2fsprogs
>>>>>>> if actually used to write data (which you said is not the case
>>>>>>> anyway).
>>>>>>
>>>>>> In fact I was unable to run into those problems myself but
>>>>>> reporter/customer did.
>>>>>>
>>>>>>> The only data that may be in the cache are unmodified data from 
>>>>>>> reads
>>>>>>> on the block device from either e2fsprogs or a suboptimal virtual
>>>>>>> block
>>>>>>> device implementation, but these can't cause any problems.
>>>>>> Michal
>>>>>
>>>>> If the guest is the only one (when running) that installs a new
>>>>> grub.conf file and kernel and it shuts down properly, you should be
>>>>> good. It if does not shut down cleanly, it could have a stale
>>>>> grub.conf file (or worse, a partially written one), but using
>>>>> O_DIRECT to bypass the file system cache should not help.
>>>>>
>>>>> If we cannot reproduce this failure, sounds like we need to go back
>>>>> and get a better understanding of what the customer saw?
>>>>>
>>>>> ric
>>>>>
>>>> That's right. I am going write an e-mail regarding this information to
>>>> the reproducer if this bug and tell him that I need more information
>>>> about what's happening at the customer side.
>>>>
>>> One more thing to point out, let's have a look at:
>>> https://bugzilla.redhat.com/show_bug.cgi?id=466681#c15 .This is about
>>> workaround to drop caches to be added to pygrub in the host machine
>>> using this command:
>>>
>>> echo 1> /proc/sys/vm/drop_caches
>>>
>>> So this really looks like the caching issue if it's working fine after
>>> dropping the caches. That may be the reason why this could be fine with
>>> this patch present in e2fsprogs.
>>>
>>> Michal
>>
>> That BZ has a pretty long and twisted history, but after a quick 
>> read, I still don't see why a cleanly shutdown guest would have 
>> issues with caching that using O_DIRECT on read would help.
>>
>> We will need to dig into a bit more...
>>
>> ric
>>
> I am not saying we don't  need to dig a little bit more, we surely do 
> but unfortunately I am waiting for information from reporter. But I am 
> also thinking that this O_DIRECT functionality support to bypass 
> caches could be useful...
>
> Thanks,
> Michal
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
I can not see where the cache could cause this problem but is it 
possible that it is in the Host file system rather than than the guest 
where it is causing a problem; If a guest shuts down  it writes to it's 
file system and all is good only it's file system is a file on another 
file system. So the cache looking after those writes as managed by the 
hyper visor or whatever could hold that data un-flushed for whatever reason.
But when the guest starts up it should get the cached or most recent 
version, this should not be stale data unless more than one guest is 
using this file system and they are overwriting each others files, then 
a cached version might be newer and different from what is expected and 
the on disk version might be the old and expected version.
End user needs to provide more information.

Chris.
 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 14:33                           ` Chris Lee
@ 2010-01-12 14:37                             ` Michal Novotny
  0 siblings, 0 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 14:37 UTC (permalink / raw)
  To: Chris Lee; +Cc: Ric Wheeler, Christoph Hellwig, linux-ext4

On 01/12/2010 03:33 PM, Chris Lee wrote:
>
>
> Michal Novotny wrote:
>> On 01/12/2010 02:29 PM, Ric Wheeler wrote:
>>> On 01/12/2010 08:23 AM, Michal Novotny wrote:
>>>> On 01/12/2010 02:12 PM, Michal Novotny wrote:
>>>>> On 01/12/2010 02:04 PM, Ric Wheeler wrote:
>>>>>> On 01/12/2010 08:01 AM, Michal Novotny wrote:
>>>>>>> On 01/12/2010 01:46 PM, Christoph Hellwig wrote:
>>>>>>>> On Tue, Jan 12, 2010 at 01:30:40PM +0100, Michal Novotny wrote:
>>>>>>>>> Not really, pygrub doesn't do any manipulation with file 
>>>>>>>>> system and
>>>>>>>>> also, it's not working on a life file system. It's called 
>>>>>>>>> before the
>>>>>>>>> guest boots up to read information about grub.conf/initrd and
>>>>>>>>> kernel for
>>>>>>>>> PV guest and after this is read and selected in pygrub then the
>>>>>>>>> guest is
>>>>>>>>> booted using the kernel and initrd extracted from the image 
>>>>>>>>> (after
>>>>>>>>> which
>>>>>>>>> the file is closed). Once again, nothing uses write support 
>>>>>>>>> and it
>>>>>>>>> was
>>>>>>>>> added just to make it use O_DIRECT for both read and write 
>>>>>>>>> operations
>>>>>>>>> but only pygrub uses only read support and O_DIRECT passed 
>>>>>>>>> here is
>>>>>>>>> the
>>>>>>>>> only way to make it use non-cached data.
>>>>>>>> So what caches get in the way? From the above it seems the 
>>>>>>>> situation
>>>>>>>> is the following:
>>>>>>>>
>>>>>>>> - filesystem N is a guest filesystem. It's not usually mounted 
>>>>>>>> on the
>>>>>>>> host, except for initial setup long time ago
>>>>>>>
>>>>>>> Yes, it is really a guest file system. This is not mounted in 
>>>>>>> the host
>>>>>>> and the reason is to get actual version of grub.conf, initrd and 
>>>>>>> kernel
>>>>>>> to be booted...
>>>>>>>
>>>>>>>> - before booting a guest your "pygrub" tools needs to read 
>>>>>>>> files on
>>>>>>>> it, and it's doing so using e2fsprogs
>>>>>>>
>>>>>>> Correct.
>>>>>>>
>>>>>>>> - once the guest is life it uses the extN kernel driver to 
>>>>>>>> access the
>>>>>>>> filesystem
>>>>>>>
>>>>>>> That's right. So this is no longer pygrub responsibility...
>>>>>>>
>>>>>>>> nowhere in this cycle you should have any stale cached data. The
>>>>>>>> kernel
>>>>>>>> always makes sure to write back data on umount/reboot, as does
>>>>>>>> e2fsprogs
>>>>>>>> if actually used to write data (which you said is not the case
>>>>>>>> anyway).
>>>>>>>
>>>>>>> In fact I was unable to run into those problems myself but
>>>>>>> reporter/customer did.
>>>>>>>
>>>>>>>> The only data that may be in the cache are unmodified data from 
>>>>>>>> reads
>>>>>>>> on the block device from either e2fsprogs or a suboptimal virtual
>>>>>>>> block
>>>>>>>> device implementation, but these can't cause any problems.
>>>>>>> Michal
>>>>>>
>>>>>> If the guest is the only one (when running) that installs a new
>>>>>> grub.conf file and kernel and it shuts down properly, you should be
>>>>>> good. It if does not shut down cleanly, it could have a stale
>>>>>> grub.conf file (or worse, a partially written one), but using
>>>>>> O_DIRECT to bypass the file system cache should not help.
>>>>>>
>>>>>> If we cannot reproduce this failure, sounds like we need to go back
>>>>>> and get a better understanding of what the customer saw?
>>>>>>
>>>>>> ric
>>>>>>
>>>>> That's right. I am going write an e-mail regarding this 
>>>>> information to
>>>>> the reproducer if this bug and tell him that I need more information
>>>>> about what's happening at the customer side.
>>>>>
>>>> One more thing to point out, let's have a look at:
>>>> https://bugzilla.redhat.com/show_bug.cgi?id=466681#c15 .This is about
>>>> workaround to drop caches to be added to pygrub in the host machine
>>>> using this command:
>>>>
>>>> echo 1> /proc/sys/vm/drop_caches
>>>>
>>>> So this really looks like the caching issue if it's working fine after
>>>> dropping the caches. That may be the reason why this could be fine 
>>>> with
>>>> this patch present in e2fsprogs.
>>>>
>>>> Michal
>>>
>>> That BZ has a pretty long and twisted history, but after a quick 
>>> read, I still don't see why a cleanly shutdown guest would have 
>>> issues with caching that using O_DIRECT on read would help.
>>>
>>> We will need to dig into a bit more...
>>>
>>> ric
>>>
>> I am not saying we don't  need to dig a little bit more, we surely do 
>> but unfortunately I am waiting for information from reporter. But I 
>> am also thinking that this O_DIRECT functionality support to bypass 
>> caches could be useful...
>>
>> Thanks,
>> Michal
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> I can not see where the cache could cause this problem but is it 
> possible that it is in the Host file system rather than than the guest 
> where it is causing a problem;

This may be right because drop caches in the host is a working 
workaround. Also, I am having some information about it. Scott wrote 
that he was able to reproduce it but with my patches applied it is 
working fine. I am waiting for more information about that and customer 
test results...

Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 12:23         ` Christoph Hellwig
  2010-01-12 12:30           ` Michal Novotny
@ 2010-01-12 15:16           ` Eric Sandeen
  2010-01-12 15:46             ` Michal Novotny
  1 sibling, 1 reply; 31+ messages in thread
From: Eric Sandeen @ 2010-01-12 15:16 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Michal Novotny, Ric Wheeler, linux-ext4

Christoph Hellwig wrote:
> On Tue, Jan 12, 2010 at 01:15:49PM +0100, Michal Novotny wrote:
>> I don't really know if I see your point but the thing here is that there  
>> was no way to open a file directly (ie. using O_DIRECT). The direct  
>> write support has been added only to make it possible to use both read  
>> and write directly. The main reason to create this patch was to add  
>> direct read support and flush capability won't help me at all. I am  
>> working in Red Hat, Virtualization team on Xen so I am really not that  
>> much familiar with file systems but what I needed was an option to read  
>> the data directly (using O_DIRECT) in e2fsprogs. One bug was about  
>> pygrub (Python version of GRUB of Xen PV guests that is internally using  
>> e2fsprogs functionality to access data on ext2/3/4 partition to boot the  
>> PV guests) uses outdated/cached data so some modifications were  
>> necessary to open everything directly...
> 
> So to get things staigt:  you're using e2fsprogs to manipulate a life
> filesystem and thing using O_DIRECT saves your ass?  I think you need to
> rething your model of operation fundamentally in that case. 
> 

Christoph - 

It's my understanding that nobody is doing concurrent access.

If the host reads the block device via pygrub to boot the guest,
the guest while running updates the same device when installing
a new kernel (either through the fs, or by writing to the bdev;
probably the former...), and then the guest shuts down - is there
something which will sync the host's cache (cached from the
prior read) with the updates from the guest?

I don't know nearly enough about how the virt IO goes ...

But if access is sequential between host & guest, I'd hope that
this would all just work.

If that is all supposed to work, then maybe the description of the
data flow (as Ric asked for) is not correct.

-Eric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 15:16           ` Eric Sandeen
@ 2010-01-12 15:46             ` Michal Novotny
  2010-01-12 20:01               ` Ric Wheeler
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 15:46 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Christoph Hellwig, Ric Wheeler, linux-ext4

On 01/12/2010 04:16 PM, Eric Sandeen wrote:
> Christoph Hellwig wrote:
>    
>> On Tue, Jan 12, 2010 at 01:15:49PM +0100, Michal Novotny wrote:
>>      
>>> I don't really know if I see your point but the thing here is that there
>>> was no way to open a file directly (ie. using O_DIRECT). The direct
>>> write support has been added only to make it possible to use both read
>>> and write directly. The main reason to create this patch was to add
>>> direct read support and flush capability won't help me at all. I am
>>> working in Red Hat, Virtualization team on Xen so I am really not that
>>> much familiar with file systems but what I needed was an option to read
>>> the data directly (using O_DIRECT) in e2fsprogs. One bug was about
>>> pygrub (Python version of GRUB of Xen PV guests that is internally using
>>> e2fsprogs functionality to access data on ext2/3/4 partition to boot the
>>> PV guests) uses outdated/cached data so some modifications were
>>> necessary to open everything directly...
>>>        
>> So to get things staigt:  you're using e2fsprogs to manipulate a life
>> filesystem and thing using O_DIRECT saves your ass?  I think you need to
>> rething your model of operation fundamentally in that case.
>>
>>      
> Christoph -
>
> It's my understanding that nobody is doing concurrent access.
>
>    

Eric,
indeed, no concurrent accesses are happening that time. The guests file 
system seems to be cached since after dropping caches in host (dom0) it 
is reported to work correctly.

> If the host reads the block device via pygrub to boot the guest,
> the guest while running updates the same device when installing
> a new kernel (either through the fs, or by writing to the bdev;
> probably the former...), and then the guest shuts down - is there
> something which will sync the host's cache (cached from the
> prior read) with the updates from the guest?
>    

It appears that the data are cached in host (dom0) and another access 
(after guest update/shutdown sequence) makes dom0 still provide old 
(cached) data and not the current (newest) data.

Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 13:01               ` Michal Novotny
  2010-01-12 13:04                 ` Ric Wheeler
@ 2010-01-12 16:38                 ` Christoph Hellwig
  2010-01-12 16:43                   ` Michal Novotny
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2010-01-12 16:38 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, Ric Wheeler, linux-ext4, linux-kernel

Ok, I looked at the issue.  The problem is that the Xen backend drivers
are (as expected) utterly braindead and submit bios directly from the
virtualization backed without using proper abstractions and thus
bypassing all the cache coherency features in the fileystems (the block
device nodes are just another mini-filesystem in that respect).  So
when you first have buffered access in the host pages may stay in cache
and get overwritten directly on disk by a Xen guest, and once the guest
is down the host may still use the now stale cached data.

I would recommend to migrate your cutomers to KVM which uses the proper
abtractions and thus doesn't have this problem.  There's a reason after
all why all the Xen dom0 mess never got merged to mainline.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:38                 ` Christoph Hellwig
@ 2010-01-12 16:43                   ` Michal Novotny
  2010-01-12 16:47                     ` Christoph Hellwig
  2010-01-12 16:50                     ` Ric Wheeler
  0 siblings, 2 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 16:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ric Wheeler, linux-ext4, linux-kernel

On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
> Ok, I looked at the issue.  The problem is that the Xen backend drivers
> are (as expected) utterly braindead and submit bios directly from the
> virtualization backed without using proper abstractions and thus
> bypassing all the cache coherency features in the fileystems (the block
> device nodes are just another mini-filesystem in that respect).  So
> when you first have buffered access in the host pages may stay in cache
> and get overwritten directly on disk by a Xen guest, and once the guest
> is down the host may still use the now stale cached data.
>
> I would recommend to migrate your cutomers to KVM which uses the proper
> abtractions and thus doesn't have this problem.  There's a reason after
> all why all the Xen dom0 mess never got merged to mainline.
>    
So, do you think the problem is in the Xen backend drivers and to make 
it working right in Xen the driver fix is needed?

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:43                   ` Michal Novotny
@ 2010-01-12 16:47                     ` Christoph Hellwig
  2010-01-12 16:51                       ` Michal Novotny
  2010-01-12 16:50                     ` Ric Wheeler
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2010-01-12 16:47 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, Ric Wheeler, linux-ext4, linux-kernel

On Tue, Jan 12, 2010 at 05:43:21PM +0100, Michal Novotny wrote:
> So, do you think the problem is in the Xen backend drivers and to make  
> it working right in Xen the driver fix is needed?

Yes, the Xen blkback driver just submits I/O directly without using
the right interfaces to force cache coherency.  It might be relatively
easy to hack a call in to flush all caches when it starts up, but given
how it bypasses all abstractions it's almost impossible to give full
coherency as if using the normal block device interfaces.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:43                   ` Michal Novotny
  2010-01-12 16:47                     ` Christoph Hellwig
@ 2010-01-12 16:50                     ` Ric Wheeler
  2010-01-12 16:53                       ` Michal Novotny
  1 sibling, 1 reply; 31+ messages in thread
From: Ric Wheeler @ 2010-01-12 16:50 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Christoph Hellwig, linux-ext4, linux-kernel

On 01/12/2010 11:43 AM, Michal Novotny wrote:
> On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
>> Ok, I looked at the issue. The problem is that the Xen backend drivers
>> are (as expected) utterly braindead and submit bios directly from the
>> virtualization backed without using proper abstractions and thus
>> bypassing all the cache coherency features in the fileystems (the block
>> device nodes are just another mini-filesystem in that respect). So
>> when you first have buffered access in the host pages may stay in cache
>> and get overwritten directly on disk by a Xen guest, and once the guest
>> is down the host may still use the now stale cached data.
>>
>> I would recommend to migrate your cutomers to KVM which uses the proper
>> abtractions and thus doesn't have this problem. There's a reason after
>> all why all the Xen dom0 mess never got merged to mainline.
> So, do you think the problem is in the Xen backend drivers and to make
> it working right in Xen the driver fix is needed?

If XEN drivers by pass the normal IO and FS stack on the host, then I can 
understand why the hack to e2fsprogs works but it does not seem like a good fix.

Specifically, the data will continue to be cached (and if dirty, might be 
written back to the storage eventually).

If we need a work around, you need to drop VM caches for that device before you 
update the guest's files and possibly again afterwards (and make sure that 
nothing pulls the data into cache during the operation).

Basically, this sounds like the backend drivers are doing something really, 
really dangerous....

ric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:47                     ` Christoph Hellwig
@ 2010-01-12 16:51                       ` Michal Novotny
  0 siblings, 0 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 16:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ric Wheeler, linux-ext4, linux-kernel

On 01/12/2010 05:47 PM, Christoph Hellwig wrote:
> On Tue, Jan 12, 2010 at 05:43:21PM +0100, Michal Novotny wrote:
>    
>> So, do you think the problem is in the Xen backend drivers and to make
>> it working right in Xen the driver fix is needed?
>>      
> Yes, the Xen blkback driver just submits I/O directly without using
> the right interfaces to force cache coherency.  It might be relatively
> easy to hack a call in to flush all caches when it starts up, but given
> how it bypasses all abstractions it's almost impossible to give full
> coherency as if using the normal block device interfaces.
>
>    
Ok, this way the fix to the drivers should be done. Thanks for your 
investigation. I just forwarded your reply to other members of my team.

Thanks,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:50                     ` Ric Wheeler
@ 2010-01-12 16:53                       ` Michal Novotny
  2010-01-12 16:56                         ` Eric Sandeen
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 16:53 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: Christoph Hellwig, linux-ext4, linux-kernel

On 01/12/2010 05:50 PM, Ric Wheeler wrote:
> On 01/12/2010 11:43 AM, Michal Novotny wrote:
>> On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
>>> Ok, I looked at the issue. The problem is that the Xen backend drivers
>>> are (as expected) utterly braindead and submit bios directly from the
>>> virtualization backed without using proper abstractions and thus
>>> bypassing all the cache coherency features in the fileystems (the block
>>> device nodes are just another mini-filesystem in that respect). So
>>> when you first have buffered access in the host pages may stay in cache
>>> and get overwritten directly on disk by a Xen guest, and once the guest
>>> is down the host may still use the now stale cached data.
>>>
>>> I would recommend to migrate your cutomers to KVM which uses the proper
>>> abtractions and thus doesn't have this problem. There's a reason after
>>> all why all the Xen dom0 mess never got merged to mainline.
>> So, do you think the problem is in the Xen backend drivers and to make
>> it working right in Xen the driver fix is needed?
>
> If XEN drivers by pass the normal IO and FS stack on the host, then I 
> can understand why the hack to e2fsprogs works but it does not seem 
> like a good fix.
>
> Specifically, the data will continue to be cached (and if dirty, might 
> be written back to the storage eventually).
>
> If we need a work around, you need to drop VM caches for that device 
> before you update the guest's files and possibly again afterwards (and 
> make sure that nothing pulls the data into cache during the operation).
>
> Basically, this sounds like the backend drivers are doing something 
> really, really dangerous....
>
> ric
>
Ok, so you think this is not good to do this patch for e2fsprogs for 
direct access support? The only things we could do now is to fix the 
backend drivers or create a workaround to drop caches? I need to discuss 
this further with guys in my team...

Thanks,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:53                       ` Michal Novotny
@ 2010-01-12 16:56                         ` Eric Sandeen
  2010-01-12 16:59                           ` Ric Wheeler
  0 siblings, 1 reply; 31+ messages in thread
From: Eric Sandeen @ 2010-01-12 16:56 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Ric Wheeler, Christoph Hellwig, linux-ext4, linux-kernel

Michal Novotny wrote:
> On 01/12/2010 05:50 PM, Ric Wheeler wrote:
>> On 01/12/2010 11:43 AM, Michal Novotny wrote:
>>> On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
>>>> Ok, I looked at the issue. The problem is that the Xen backend drivers
>>>> are (as expected) utterly braindead and submit bios directly from the
>>>> virtualization backed without using proper abstractions and thus
>>>> bypassing all the cache coherency features in the fileystems (the block
>>>> device nodes are just another mini-filesystem in that respect). So
>>>> when you first have buffered access in the host pages may stay in cache
>>>> and get overwritten directly on disk by a Xen guest, and once the guest
>>>> is down the host may still use the now stale cached data.
>>>>
>>>> I would recommend to migrate your cutomers to KVM which uses the proper
>>>> abtractions and thus doesn't have this problem. There's a reason after
>>>> all why all the Xen dom0 mess never got merged to mainline.
>>> So, do you think the problem is in the Xen backend drivers and to make
>>> it working right in Xen the driver fix is needed?
>>
>> If XEN drivers by pass the normal IO and FS stack on the host, then I
>> can understand why the hack to e2fsprogs works but it does not seem
>> like a good fix.
>>
>> Specifically, the data will continue to be cached (and if dirty, might
>> be written back to the storage eventually).
>>
>> If we need a work around, you need to drop VM caches for that device
>> before you update the guest's files and possibly again afterwards (and
>> make sure that nothing pulls the data into cache during the operation).
>>
>> Basically, this sounds like the backend drivers are doing something
>> really, really dangerous....
>>
>> ric
>>
> Ok, so you think this is not good to do this patch for e2fsprogs for
> direct access support? The only things we could do now is to fix the
> backend drivers or create a workaround to drop caches? I need to discuss
> this further with guys in my team...

I do think that patching it up in e2fsprogs is unnecessarily invasive;
it's fixing it at the wrong spot.

Any block dev IO from the host is dangerous; fixing it only in e2fsprogs
for this one case doesn't seem like the right course of action.

-Eric

> Thanks,
> Michal
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:56                         ` Eric Sandeen
@ 2010-01-12 16:59                           ` Ric Wheeler
  2010-01-12 17:00                             ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Ric Wheeler @ 2010-01-12 16:59 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Michal Novotny, Christoph Hellwig, linux-ext4, linux-kernel

On 01/12/2010 11:56 AM, Eric Sandeen wrote:
> Michal Novotny wrote:
>> On 01/12/2010 05:50 PM, Ric Wheeler wrote:
>>> On 01/12/2010 11:43 AM, Michal Novotny wrote:
>>>> On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
>>>>> Ok, I looked at the issue. The problem is that the Xen backend drivers
>>>>> are (as expected) utterly braindead and submit bios directly from the
>>>>> virtualization backed without using proper abstractions and thus
>>>>> bypassing all the cache coherency features in the fileystems (the block
>>>>> device nodes are just another mini-filesystem in that respect). So
>>>>> when you first have buffered access in the host pages may stay in cache
>>>>> and get overwritten directly on disk by a Xen guest, and once the guest
>>>>> is down the host may still use the now stale cached data.
>>>>>
>>>>> I would recommend to migrate your cutomers to KVM which uses the proper
>>>>> abtractions and thus doesn't have this problem. There's a reason after
>>>>> all why all the Xen dom0 mess never got merged to mainline.
>>>> So, do you think the problem is in the Xen backend drivers and to make
>>>> it working right in Xen the driver fix is needed?
>>>
>>> If XEN drivers by pass the normal IO and FS stack on the host, then I
>>> can understand why the hack to e2fsprogs works but it does not seem
>>> like a good fix.
>>>
>>> Specifically, the data will continue to be cached (and if dirty, might
>>> be written back to the storage eventually).
>>>
>>> If we need a work around, you need to drop VM caches for that device
>>> before you update the guest's files and possibly again afterwards (and
>>> make sure that nothing pulls the data into cache during the operation).
>>>
>>> Basically, this sounds like the backend drivers are doing something
>>> really, really dangerous....
>>>
>>> ric
>>>
>> Ok, so you think this is not good to do this patch for e2fsprogs for
>> direct access support? The only things we could do now is to fix the
>> backend drivers or create a workaround to drop caches? I need to discuss
>> this further with guys in my team...
>
> I do think that patching it up in e2fsprogs is unnecessarily invasive;
> it's fixing it at the wrong spot.
>
> Any block dev IO from the host is dangerous; fixing it only in e2fsprogs
> for this one case doesn't seem like the right course of action.
>
> -Eric

It actually could produce some nastier issues where it would work a bit, the bad 
data gets flushed back to the backing store and then your O_DIRECT read would be 
broken.

Also, for normal users of e2fsprogs, they should never bypass the cache...

ric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 16:59                           ` Ric Wheeler
@ 2010-01-12 17:00                             ` Michal Novotny
  2010-01-14 13:46                               ` Michal Novotny
  0 siblings, 1 reply; 31+ messages in thread
From: Michal Novotny @ 2010-01-12 17:00 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: Eric Sandeen, Christoph Hellwig, linux-ext4, linux-kernel

On 01/12/2010 05:59 PM, Ric Wheeler wrote:
> On 01/12/2010 11:56 AM, Eric Sandeen wrote:
>> Michal Novotny wrote:
>>> On 01/12/2010 05:50 PM, Ric Wheeler wrote:
>>>> On 01/12/2010 11:43 AM, Michal Novotny wrote:
>>>>> On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
>>>>>> Ok, I looked at the issue. The problem is that the Xen backend 
>>>>>> drivers
>>>>>> are (as expected) utterly braindead and submit bios directly from 
>>>>>> the
>>>>>> virtualization backed without using proper abstractions and thus
>>>>>> bypassing all the cache coherency features in the fileystems (the 
>>>>>> block
>>>>>> device nodes are just another mini-filesystem in that respect). So
>>>>>> when you first have buffered access in the host pages may stay in 
>>>>>> cache
>>>>>> and get overwritten directly on disk by a Xen guest, and once the 
>>>>>> guest
>>>>>> is down the host may still use the now stale cached data.
>>>>>>
>>>>>> I would recommend to migrate your cutomers to KVM which uses the 
>>>>>> proper
>>>>>> abtractions and thus doesn't have this problem. There's a reason 
>>>>>> after
>>>>>> all why all the Xen dom0 mess never got merged to mainline.
>>>>> So, do you think the problem is in the Xen backend drivers and to 
>>>>> make
>>>>> it working right in Xen the driver fix is needed?
>>>>
>>>> If XEN drivers by pass the normal IO and FS stack on the host, then I
>>>> can understand why the hack to e2fsprogs works but it does not seem
>>>> like a good fix.
>>>>
>>>> Specifically, the data will continue to be cached (and if dirty, might
>>>> be written back to the storage eventually).
>>>>
>>>> If we need a work around, you need to drop VM caches for that device
>>>> before you update the guest's files and possibly again afterwards (and
>>>> make sure that nothing pulls the data into cache during the 
>>>> operation).
>>>>
>>>> Basically, this sounds like the backend drivers are doing something
>>>> really, really dangerous....
>>>>
>>>> ric
>>>>
>>> Ok, so you think this is not good to do this patch for e2fsprogs for
>>> direct access support? The only things we could do now is to fix the
>>> backend drivers or create a workaround to drop caches? I need to 
>>> discuss
>>> this further with guys in my team...
>>
>> I do think that patching it up in e2fsprogs is unnecessarily invasive;
>> it's fixing it at the wrong spot.
>>
>> Any block dev IO from the host is dangerous; fixing it only in e2fsprogs
>> for this one case doesn't seem like the right course of action.
>>
>> -Eric
>
> It actually could produce some nastier issues where it would work a 
> bit, the bad data gets flushed back to the backing store and then your 
> O_DIRECT read would be broken.
>
> Also, for normal users of e2fsprogs, they should never bypass the 
> cache...
>
> ric
Ok, good to know the normal users should never bypass the cache. That 
way this seems to be some kind of kernel bug...

Thank you all for your input,
Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 15:46             ` Michal Novotny
@ 2010-01-12 20:01               ` Ric Wheeler
  0 siblings, 0 replies; 31+ messages in thread
From: Ric Wheeler @ 2010-01-12 20:01 UTC (permalink / raw)
  To: Michal Novotny; +Cc: Eric Sandeen, Christoph Hellwig, linux-ext4


Just for completeness, this is what we think we have going on:

1) First boot: Running on the host, pygrub reads the device (unmounted) 
to bootstrap the guest with image A (kernel + grub.conf)

(2) guest image updates the kernel/grub.conf using weird Xen IO path 
(bypassing the host page cache, creating BIO's directly in the host 
memory).

Note that at this point in time, the image from (1) is still possibly in 
the page cache of the host

(3) reboot of guest - host pygrub uses page cache (stale pages) when 
bootstrapping the guest who occasionally boots into the stale image.

This doesn't happen all of the time - if the pages are dropped before 
(3) happens, it won't happen.

Uses O_DIRECT has a side effect of invalidating the page cache while 
reading.

Dropping VM caches just for the devices in question would fix this or 
(as Christoph mentioned) use kvm which does this more sanely :-)

ric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option
  2010-01-12 17:00                             ` Michal Novotny
@ 2010-01-14 13:46                               ` Michal Novotny
  0 siblings, 0 replies; 31+ messages in thread
From: Michal Novotny @ 2010-01-14 13:46 UTC (permalink / raw)
  To: Ric Wheeler; +Cc: Eric Sandeen, Christoph Hellwig, linux-ext4, linux-kernel

On 01/12/2010 06:00 PM, Michal Novotny wrote:
> On 01/12/2010 05:59 PM, Ric Wheeler wrote:
>> On 01/12/2010 11:56 AM, Eric Sandeen wrote:
>>> Michal Novotny wrote:
>>>> On 01/12/2010 05:50 PM, Ric Wheeler wrote:
>>>>> On 01/12/2010 11:43 AM, Michal Novotny wrote:
>>>>>> On 01/12/2010 05:38 PM, Christoph Hellwig wrote:
>>>>>>> Ok, I looked at the issue. The problem is that the Xen backend 
>>>>>>> drivers
>>>>>>> are (as expected) utterly braindead and submit bios directly 
>>>>>>> from the
>>>>>>> virtualization backed without using proper abstractions and thus
>>>>>>> bypassing all the cache coherency features in the fileystems 
>>>>>>> (the block
>>>>>>> device nodes are just another mini-filesystem in that respect). So
>>>>>>> when you first have buffered access in the host pages may stay 
>>>>>>> in cache
>>>>>>> and get overwritten directly on disk by a Xen guest, and once 
>>>>>>> the guest
>>>>>>> is down the host may still use the now stale cached data.
>>>>>>>
>>>>>>> I would recommend to migrate your cutomers to KVM which uses the 
>>>>>>> proper
>>>>>>> abtractions and thus doesn't have this problem. There's a reason 
>>>>>>> after
>>>>>>> all why all the Xen dom0 mess never got merged to mainline.
>>>>>> So, do you think the problem is in the Xen backend drivers and to 
>>>>>> make
>>>>>> it working right in Xen the driver fix is needed?
>>>>>
>>>>> If XEN drivers by pass the normal IO and FS stack on the host, then I
>>>>> can understand why the hack to e2fsprogs works but it does not seem
>>>>> like a good fix.
>>>>>
>>>>> Specifically, the data will continue to be cached (and if dirty, 
>>>>> might
>>>>> be written back to the storage eventually).
>>>>>
>>>>> If we need a work around, you need to drop VM caches for that device
>>>>> before you update the guest's files and possibly again afterwards 
>>>>> (and
>>>>> make sure that nothing pulls the data into cache during the 
>>>>> operation).
>>>>>
>>>>> Basically, this sounds like the backend drivers are doing something
>>>>> really, really dangerous....
>>>>>
>>>>> ric
>>>>>
>>>> Ok, so you think this is not good to do this patch for e2fsprogs for
>>>> direct access support? The only things we could do now is to fix the
>>>> backend drivers or create a workaround to drop caches? I need to 
>>>> discuss
>>>> this further with guys in my team...
>>>
>>> I do think that patching it up in e2fsprogs is unnecessarily invasive;
>>> it's fixing it at the wrong spot.
>>>
>>> Any block dev IO from the host is dangerous; fixing it only in 
>>> e2fsprogs
>>> for this one case doesn't seem like the right course of action.
>>>
>>> -Eric
>>
>> It actually could produce some nastier issues where it would work a 
>> bit, the bad data gets flushed back to the backing store and then 
>> your O_DIRECT read would be broken.
>>
>> Also, for normal users of e2fsprogs, they should never bypass the 
>> cache...
>>
>> ric
> Ok, good to know the normal users should never bypass the cache. That 
> way this seems to be some kind of kernel bug...
>
Hi,
finally, after discussing this with several people we decided it's 
better to solve it in kernel path since Xen is using some weird IO path 
like mentioned. Since this patch is about using O_DIRECT in e2fsprogs 
which I've been told that normal e2fsprogs user should never do I'd like 
to thanks you all for your help and tell you to ignore this patch since 
we're going to solve it some other way, ie. fix this in the kernel itself.

Thanks for your help!

Michal

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2010-01-14 13:47 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-01-08  9:36 [PATCH] extend e2fsprogs functionality to add EXT2_FLAG_DIRECT option Michal Novotny
2010-01-11 20:06 ` Ric Wheeler
2010-01-12 10:54   ` Michal Novotny
2010-01-12 11:59     ` Ric Wheeler
2010-01-12 12:15       ` Michal Novotny
2010-01-12 12:23         ` Christoph Hellwig
2010-01-12 12:30           ` Michal Novotny
2010-01-12 12:46             ` Christoph Hellwig
2010-01-12 13:01               ` Michal Novotny
2010-01-12 13:04                 ` Ric Wheeler
2010-01-12 13:12                   ` Michal Novotny
2010-01-12 13:23                     ` Michal Novotny
2010-01-12 13:29                       ` Ric Wheeler
2010-01-12 13:33                         ` Michal Novotny
2010-01-12 14:33                           ` Chris Lee
2010-01-12 14:37                             ` Michal Novotny
2010-01-12 16:38                 ` Christoph Hellwig
2010-01-12 16:43                   ` Michal Novotny
2010-01-12 16:47                     ` Christoph Hellwig
2010-01-12 16:51                       ` Michal Novotny
2010-01-12 16:50                     ` Ric Wheeler
2010-01-12 16:53                       ` Michal Novotny
2010-01-12 16:56                         ` Eric Sandeen
2010-01-12 16:59                           ` Ric Wheeler
2010-01-12 17:00                             ` Michal Novotny
2010-01-14 13:46                               ` Michal Novotny
2010-01-12 12:47             ` Andreas Dilger
2010-01-12 13:04               ` Michal Novotny
2010-01-12 15:16           ` Eric Sandeen
2010-01-12 15:46             ` Michal Novotny
2010-01-12 20:01               ` Ric Wheeler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).