From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from aserp1040.oracle.com ([141.146.126.69]:46298 "EHLO aserp1040.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933321AbbFJNif (ORCPT ); Wed, 10 Jun 2015 09:38:35 -0400 Date: Wed, 10 Jun 2015 21:38:29 +0800 From: Liu Bo To: Filipe David Manana Cc: "linux-btrfs@vger.kernel.org" Subject: Re: [RFC PATCH 2/2] Btrfs: improve fsync for nocow file Message-ID: <20150610133827.GG3561@localhost.localdomain> Reply-To: bo.li.liu@oracle.com References: <1433851471-11543-1-git-send-email-bo.li.liu@oracle.com> <1433851471-11543-2-git-send-email-bo.li.liu@oracle.com> <20150610020906.GA2849@localhost.localdomain> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii In-Reply-To: Sender: linux-btrfs-owner@vger.kernel.org List-ID: On Wed, Jun 10, 2015 at 01:45:29PM +0100, Filipe David Manana wrote: > On Wed, Jun 10, 2015 at 9:26 AM, Filipe David Manana wrote: > > On Wed, Jun 10, 2015 at 3:09 AM, Liu Bo wrote: > >> On Tue, Jun 09, 2015 at 01:56:41PM +0100, Filipe David Manana wrote: > >>> On Tue, Jun 9, 2015 at 1:04 PM, Liu Bo wrote: > >>> > If we're overwriting an allocated file without changing timestamp > >>> > and inode version, and the file is with NODATACOW, we don't have any metadata to > >>> > commit, thus we can just flush the data device cache and go forward. > >>> > > >>> > However, if there's have any change on extents' disk bytenr, inode size, > >>> > timestamp or inode version, we need to go through the normal btrfs_log_inode > >>> > path. > >>> > > >>> > Test: > >>> > ---------------------------- > >>> > 1. sysbench test of > >>> > "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite + > >>> > fsync_on_each_write", > >>> > 2. loop device associated with tmpfs file > >>> > 3. > >>> > - For btrfs, "-o nodatacow" and "-o noi_version" option > >>> > - For ext4 and xfs, no extra mount options > >>> > ---------------------------- > >>> > > >>> > Results: > >>> > ---------------------------- > >>> > - btrfs: > >>> > w/o: ~30Mb/sec > >>> > w: ~181Mb/sec > >>> > > >>> > - other filesystems: (both don't enable i_version by default) > >>> > ext4: 203Mb/sec > >>> > xfs: 212Mb/sec > >>> > ---------------------------- > >>> > > >>> > Signed-off-by: Liu Bo > >>> > --- > >>> > fs/btrfs/btrfs_inode.h | 2 ++ > >>> > fs/btrfs/disk-io.c | 2 +- > >>> > fs/btrfs/disk-io.h | 1 + > >>> > fs/btrfs/file.c | 39 ++++++++++++++++++++++++++++++++++----- > >>> > fs/btrfs/inode.c | 3 +++ > >>> > 5 files changed, 41 insertions(+), 6 deletions(-) > >>> > > >>> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h > >>> > index 0ef5cc1..b36d87a 100644 > >>> > --- a/fs/btrfs/btrfs_inode.h > >>> > +++ b/fs/btrfs/btrfs_inode.h > >>> > @@ -44,6 +44,8 @@ > >>> > #define BTRFS_INODE_IN_DELALLOC_LIST 9 > >>> > #define BTRFS_INODE_READDIO_NEED_LOCK 10 > >>> > #define BTRFS_INODE_HAS_PROPS 11 > >>> > +#define BTRFS_INODE_NOTIMESTAMP 12 > >>> > +#define BTRFS_INODE_NOISIZE 13 > >>> > /* > >>> > * The following 3 bits are meant only for the btree inode. > >>> > * When any of them is set, it means an error happened while writing an > >>> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > >>> > index 2ef9a4b..de7fd94 100644 > >>> > --- a/fs/btrfs/disk-io.c > >>> > +++ b/fs/btrfs/disk-io.c > >>> > @@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) > >>> > * send an empty flush down to each device in parallel, > >>> > * then wait for them > >>> > */ > >>> > -static int barrier_all_devices(struct btrfs_fs_info *info) > >>> > +int barrier_all_devices(struct btrfs_fs_info *info) > >>> > { > >>> > struct list_head *head; > >>> > struct btrfs_device *dev; > >>> > diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h > >>> > index d4cbfee..2bc91fe 100644 > >>> > --- a/fs/btrfs/disk-io.h > >>> > +++ b/fs/btrfs/disk-io.h > >>> > @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root); > >>> > int write_ctree_super(struct btrfs_trans_handle *trans, > >>> > struct btrfs_root *root, int max_mirrors); > >>> > struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); > >>> > +int barrier_all_devices(struct btrfs_fs_info *info); > >>> > int btrfs_commit_super(struct btrfs_root *root); > >>> > struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, > >>> > u64 bytenr); > >>> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > >>> > index 23b6e03..861c29f 100644 > >>> > --- a/fs/btrfs/file.c > >>> > +++ b/fs/btrfs/file.c > >>> > @@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, > >>> > * the disk i_size. There is no need to log the inode > >>> > * at this time. > >>> > */ > >>> > - if (end_pos > isize) > >>> > + if (end_pos > isize) { > >>> > i_size_write(inode, end_pos); > >>> > + clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags); > >>> > + } else { > >>> > + set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags); > >>> > + } > >>> > return 0; > >>> > } > >>> > > >>> > @@ -1711,19 +1715,33 @@ out: > >>> > static void update_time_for_write(struct inode *inode) > >>> > { > >>> > struct timespec now; > >>> > + int sync_it = 0; > >>> > > >>> > - if (IS_NOCMTIME(inode)) > >>> > + if (IS_NOCMTIME(inode)) { > >>> > + set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags); > >>> > return; > >>> > + } > >>> > > >>> > now = current_fs_time(inode->i_sb); > >>> > - if (!timespec_equal(&inode->i_mtime, &now)) > >>> > + if (!timespec_equal(&inode->i_mtime, &now)) { > >>> > inode->i_mtime = now; > >>> > + sync_it = S_MTIME; > >>> > + } > >>> > > >>> > - if (!timespec_equal(&inode->i_ctime, &now)) > >>> > + if (!timespec_equal(&inode->i_ctime, &now)) { > >>> > inode->i_ctime = now; > >>> > + sync_it |= S_CTIME; > >>> > + } > >>> > > >>> > - if (IS_I_VERSION(inode)) > >>> > + if (IS_I_VERSION(inode)) { > >>> > inode_inc_iversion(inode); > >>> > + sync_it |= S_VERSION; > >>> > + } > >>> > + > >>> > + if (!sync_it) > >>> > + set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags); > >>> > + else > >>> > + clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags); > >>> > } > >>> > > >>> > static ssize_t btrfs_file_write_iter(struct kiocb *iocb, > >>> > @@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) > >>> > goto out; > >>> > } > >>> > > >>> > + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { > >>> > + if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP, > >>> > + &BTRFS_I(inode)->runtime_flags) && > >>> > + test_and_clear_bit(BTRFS_INODE_NOISIZE, > >>> > + &BTRFS_I(inode)->runtime_flags)) { > >>> > + barrier_all_devices(root->fs_info); > >>> > + mutex_unlock(&inode->i_mutex); > >>> > + goto out; > >>> > >>> Hi Liu, > >>> > >>> For the non-full sync case, what happens if an IO error happened > >>> during writeback? > >>> I don't see anything here that checks if an IO error happened and > >>> return -EIO to user space if such error happened. > >>> In other words, testing for the bit AS_EIO in the inode->i_mapping->flags. > >> > >> Good point, I missed that part. > >> > >> Besides that, is the whole "noi_version and nocow" idea acceptable to you? > > > > Yes. > > I haven't tested it however, just eyeballed the patches. > > I forgot to ask before, but any special reason to use > barrier_all_devices() instead of waiting for the ordered extents in > the given range to get the BTRFS_ORDERED_IO_DONE set? That reminds me, yes, both are needed -- flushing/waiting data and flushing all devices' cache, but IMO we don't have to wait for BTRFS_ORDERED_IO_DONE but only filemap_fdatawait_range is good enough. I'll fix that along with data error. > > Using the barrier, wouldn't it wait for all writeback, including those > for other files or for ranges outside the range given to > btrfs_sync_file() (important for msync for e.g.). This barrier is for flushing device cache, I think we have to do that since we skip log commit part. (But maybe it's not needed if we specify NOBARRIER option.) Thanks, -liubo > > thanks > > > > > Thanks. > > > >> > >> Thanks, > >> > >> -liubo > >> > >>> > >>> thanks > >>> > >>> > + } > >>> > + } > >>> > + > >>> > /* > >>> > * ok we haven't committed the transaction yet, lets do a commit > >>> > */ > >>> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > >>> > index 0020b56..3d230e6 100644 > >>> > --- a/fs/btrfs/inode.c > >>> > +++ b/fs/btrfs/inode.c > >>> > @@ -1384,6 +1384,7 @@ out_check: > >>> > > >>> > btrfs_release_path(path); > >>> > if (cow_start != (u64)-1) { > >>> > + clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags); > >>> > ret = cow_file_range(inode, locked_page, > >>> > cow_start, found_key.offset - 1, > >>> > page_started, nr_written, 1); > >>> > @@ -1426,6 +1427,7 @@ out_check: > >>> > em->start + em->len - 1, 0); > >>> > } > >>> > type = BTRFS_ORDERED_PREALLOC; > >>> > + clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags); > >>> > } else { > >>> > type = BTRFS_ORDERED_NOCOW; > >>> > } > >>> > @@ -1464,6 +1466,7 @@ out_check: > >>> > } > >>> > > >>> > if (cow_start != (u64)-1) { > >>> > + clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags); > >>> > ret = cow_file_range(inode, locked_page, cow_start, end, > >>> > page_started, nr_written, 1); > >>> > if (ret) > >>> > -- > >>> > 2.1.0 > >>> > > >>> > -- > >>> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > >>> > the body of a message to majordomo@vger.kernel.org > >>> > More majordomo info at http://vger.kernel.org/majordomo-info.html > >>> > >>> > >>> > >>> -- > >>> Filipe David Manana, > >>> > >>> "Reasonable men adapt themselves to the world. > >>> Unreasonable men adapt the world to themselves. > >>> That's why all progress depends on unreasonable men." > > > > > > > > -- > > Filipe David Manana, > > > > "Reasonable men adapt themselves to the world. > > Unreasonable men adapt the world to themselves. > > That's why all progress depends on unreasonable men." > > > > -- > Filipe David Manana, > > "Reasonable men adapt themselves to the world. > Unreasonable men adapt the world to themselves. > That's why all progress depends on unreasonable men."