All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
To: Jan Kara <jack@suse.cz>, Ted Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org,
	Gao Xiang <hsiangkao@linux.alibaba.com>, Jan Kara <jack@suse.cz>
Subject: Re: [PATCH] ext4: Properly sync file size update after O_SYNC direct IO
Date: Thu, 12 Oct 2023 20:55:29 +0530	[thread overview]
Message-ID: <87il7bj21i.fsf@doe.com> (raw)
In-Reply-To: <20231011142155.19328-1-jack@suse.cz>

Jan Kara <jack@suse.cz> writes:

> Gao Xiang has reported that on ext4 O_SYNC direct IO does not properly
> sync file size update and thus if we crash at unfortunate moment, the
> file can have smaller size although O_SYNC IO has reported successful
> completion. The problem happens because update of on-disk inode size is
> handled in ext4_dio_write_iter() *after* iomap_dio_rw() (and thus
> dio_complete() in particular) has returned and generic_file_sync() gets
> called by dio_complete(). Fix the problem by handling on-disk inode size
> update directly in our ->end_io completion handler.
>
> References: https://lore.kernel.org/all/02d18236-26ef-09b0-90ad-030c4fe3ee20@linux.alibaba.com
> Reported-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> Signed-off-by: Jan Kara <jack@suse.cz>

Guess we need a fixes tag.

> ---
>  fs/ext4/file.c | 139 ++++++++++++++++++-------------------------------
>  1 file changed, 52 insertions(+), 87 deletions(-)
>
> So finally I've hopefully got all the corner cases right ;) At least fstest
> pass now.
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 1492b1ae21f4..d0711c1a9b06 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -306,80 +306,34 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  }
>  
>  static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> -					   ssize_t written, size_t count)
> +					   ssize_t count)
>  {
>  	handle_t *handle;
> -	bool truncate = false;
> -	u8 blkbits = inode->i_blkbits;
> -	ext4_lblk_t written_blk, end_blk;
> -	int ret;
> -
> -	/*
> -	 * Note that EXT4_I(inode)->i_disksize can get extended up to
> -	 * inode->i_size while the I/O was running due to writeback of delalloc
> -	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
> -	 * zeroed/unwritten extents if this is possible; thus we won't leave
> -	 * uninitialized blocks in a file even if we didn't succeed in writing
> -	 * as much as we intended.
> -	 */
> -	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
> -	if (offset + count <= EXT4_I(inode)->i_disksize) {
> -		/*
> -		 * We need to ensure that the inode is removed from the orphan
> -		 * list if it has been added prematurely, due to writeback of
> -		 * delalloc blocks.
> -		 */
> -		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
> -			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> -
> -			if (IS_ERR(handle)) {
> -				ext4_orphan_del(NULL, inode);
> -				return PTR_ERR(handle);
> -			}
> -
> -			ext4_orphan_del(handle, inode);
> -			ext4_journal_stop(handle);
> -		}
> -
> -		return written;
> -	}
> -
> -	if (written < 0)
> -		goto truncate;
>  
> +	lockdep_assert_held_write(&inode->i_rwsem);
>  	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> -	if (IS_ERR(handle)) {
> -		written = PTR_ERR(handle);
> -		goto truncate;
> -	}
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
>  
> -	if (ext4_update_inode_size(inode, offset + written)) {
> -		ret = ext4_mark_inode_dirty(handle, inode);
> +	if (ext4_update_inode_size(inode, offset + count)) {
> +		int ret = ext4_mark_inode_dirty(handle, inode);
>  		if (unlikely(ret)) {
> -			written = ret;
>  			ext4_journal_stop(handle);
> -			goto truncate;
> +			return ret;
>  		}
>  	}
>  
> -	/*
> -	 * We may need to truncate allocated but not written blocks beyond EOF.
> -	 */
> -	written_blk = ALIGN(offset + written, 1 << blkbits);
> -	end_blk = ALIGN(offset + count, 1 << blkbits);
> -	if (written_blk < end_blk && ext4_can_truncate(inode))
> -		truncate = true;
> -
> -	/*
> -	 * Remove the inode from the orphan list if it has been extended and
> -	 * everything went OK.
> -	 */
> -	if (!truncate && inode->i_nlink)
> +	if (inode->i_nlink)
>  		ext4_orphan_del(handle, inode);
>  	ext4_journal_stop(handle);
>  
> -	if (truncate) {
> -truncate:
> +	return count;
> +}
> +
> +static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
> +{
> +	lockdep_assert_held_write(&inode->i_rwsem);
> +	if (count < 0) {
>  		ext4_truncate_failed_write(inode);
>  		/*
>  		 * If the truncate operation failed early, then the inode may
> @@ -388,9 +342,28 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
>  		 */
>  		if (inode->i_nlink)
>  			ext4_orphan_del(NULL, inode);
> +		return;
>  	}
> +	/*
> +	 * If i_disksize got extended due to writeback of delalloc blocks while
> +	 * the DIO was running we could fail to cleanup the orphan list in
> +	 * ext4_handle_inode_extension(). Do it now.
> +	 */
> +	if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
> +		handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
>  
> -	return written;
> +		if (IS_ERR(handle)) {
> +			/*
> +			 * The write has successfully completed. Not much to
> +			 * do with the error here so just cleanup the orphan
> +			 * list and hope for the best.
> +			 */
> +			ext4_orphan_del(NULL, inode);
> +			return;
> +		}
> +		ext4_orphan_del(handle, inode);
> +		ext4_journal_stop(handle);
> +	}
>  }
>  
>  static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
> @@ -399,31 +372,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
>  	loff_t pos = iocb->ki_pos;
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  
> +	if (!error && size && flags & IOMAP_DIO_END_UNWRITTEN)

Do we have IOMAP_DIO_END_UNWRITTEN? or should it be IOMAP_DIO_UNWRITTEN?
Also we don't need to check !error case if we return early in case of an error.

> +		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
>  	if (error)
>  		return error;
> -
> -	if (size && flags & IOMAP_DIO_END_UNWRITTEN) {

ditto.

> -		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
> -		if (error < 0)
> -			return error;
> -	}
>  	/*
> -	 * If we are extending the file, we have to update i_size here before
> -	 * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
> -	 * buffered reads could zero out too much from page cache pages. Update
> -	 * of on-disk size will happen later in ext4_dio_write_iter() where
> -	 * we have enough information to also perform orphan list handling etc.
> -	 * Note that we perform all extending writes synchronously under
> -	 * i_rwsem held exclusively so i_size update is safe here in that case.
> -	 * If the write was not extending, we cannot see pos > i_size here
> -	 * because operations reducing i_size like truncate wait for all
> -	 * outstanding DIO before updating i_size.
> +	 * Note that EXT4_I(inode)->i_disksize can get extended up to
> +	 * inode->i_size while the I/O was running due to writeback of delalloc
> +	 * blocks. But the code in ext4_iomap_alloc() is careful to use
> +	 * zeroed/unwritten extents if this is possible; thus we won't leave
> +	 * uninitialized blocks in a file even if we didn't succeed in writing
> +	 * as much as we intended.
>  	 */
> -	pos += size;
> -	if (pos > i_size_read(inode))
> -		i_size_write(inode, pos);
> -
> -	return 0;
> +	WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize));
> +	if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize))
> +		return 0;
> +	return ext4_handle_inode_extension(inode, pos, size);
>  }

Although it is not a problem, but we are sometimes returning 0 and
sometimes count here.

>  
>  static const struct iomap_dio_ops ext4_dio_write_ops = {
> @@ -606,9 +570,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  			   dio_flags, NULL, 0);
>  	if (ret == -ENOTBLK)
>  		ret = 0;
> -
>  	if (extend)
> -		ret = ext4_handle_inode_extension(inode, offset, ret, count);
> +		ext4_inode_extension_cleanup(inode, ret);
>  
>  out:
>  	if (ilock_shared)
> @@ -689,8 +652,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  
>  	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
>  
> -	if (extend)
> -		ret = ext4_handle_inode_extension(inode, offset, ret, count);
> +	if (extend) {
> +		ret = ext4_handle_inode_extension(inode, offset, ret);
> +		ext4_inode_extension_cleanup(inode, ret);

ok, looks like we are using that return value here.

> +	}
>  out:
>  	inode_unlock(inode);
>  	if (ret > 0)
> -- 
> 2.35.3

Otherwise it looks good to me.

-ritesh

  parent reply	other threads:[~2023-10-12 15:25 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-11 14:21 [PATCH] ext4: Properly sync file size update after O_SYNC direct IO Jan Kara
2023-10-12  0:26 ` Dave Chinner
2023-10-12  2:21   ` Gao Xiang
2023-10-12  8:59   ` Jan Kara
2023-10-12 23:33     ` Dave Chinner
2023-10-13 10:14       ` Jan Kara
2023-10-12 15:25 ` Ritesh Harjani [this message]
2023-10-12 16:17   ` Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87il7bj21i.fsf@doe.com \
    --to=ritesh.list@gmail.com \
    --cc=hsiangkao@linux.alibaba.com \
    --cc=jack@suse.cz \
    --cc=linux-ext4@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.