* [PATCH v2] xfs: update i_size after unwritten conversion in dio completion
@ 2017-09-21 15:52 Eryu Guan
  2017-09-26 15:00 ` Christoph Hellwig
  0 siblings, 1 reply; 2+ messages in thread
From: Eryu Guan @ 2017-09-21 15:52 UTC (permalink / raw)
  To: linux-xfs; +Cc: Eryu Guan
Since commit d531d91d6990 ("xfs: always use unwritten extents for
direct I/O writes"), we start allocating unwritten extents for all
direct writes to allow appending aio in XFS.
But for dio writes that could extend file size we update the in-core
inode size first, then convert the unwritten extents to real
allocations at dio completion time in xfs_dio_write_end_io(). Thus a
racing direct read could see the new i_size and find the unwritten
extents first and read zeros instead of actual data, if the direct
writer also takes a shared iolock.
Fix it by updating the in-core inode size after the unwritten extent
conversion. To do this, introduce a new boolean argument to
xfs_iomap_write_unwritten() to tell if we want to update in-core
i_size or not.
Suggested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Eryu Guan <eguan@redhat.com>
---
v2:
- fix comments by copying Brian's words :)
- fix code style, remove unnecessary blank lines and braces
v1: https://marc.info/?l=linux-xfs&m=150599032124565&w=2
 fs/xfs/xfs_aops.c  |  3 ++-
 fs/xfs/xfs_file.c  | 33 +++++++++++++++++++--------------
 fs/xfs/xfs_iomap.c |  7 +++++--
 fs/xfs/xfs_iomap.h |  2 +-
 fs/xfs/xfs_pnfs.c  |  2 +-
 5 files changed, 28 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 29172609f2a3..f18e5932aec4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -343,7 +343,8 @@ xfs_end_io(
 		error = xfs_reflink_end_cow(ip, offset, size);
 		break;
 	case XFS_IO_UNWRITTEN:
-		error = xfs_iomap_write_unwritten(ip, offset, size);
+		/* writeback should never update isize */
+		error = xfs_iomap_write_unwritten(ip, offset, size, false);
 		break;
 	default:
 		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 350b6d43ba23..309e26c9dddb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -434,7 +434,6 @@ xfs_dio_write_end_io(
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			offset = iocb->ki_pos;
-	bool			update_size = false;
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
@@ -445,6 +444,21 @@ xfs_dio_write_end_io(
 	if (size <= 0)
 		return size;
 
+	if (flags & IOMAP_DIO_COW) {
+		error = xfs_reflink_end_cow(ip, offset, size);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Unwritten conversion updates the in-core isize after extent
+	 * conversion but before updating the on-disk size. Updating isize any
+	 * earlier allows a racing dio read to find unwritten extents before
+	 * they are converted.
+	 */
+	if (flags & IOMAP_DIO_UNWRITTEN)
+		return xfs_iomap_write_unwritten(ip, offset, size, true);
+
 	/*
 	 * We need to update the in-core inode size here so that we don't end up
 	 * with the on-disk inode size being outside the in-core inode size. We
@@ -459,20 +473,11 @@ xfs_dio_write_end_io(
 	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode)) {
 		i_size_write(inode, offset + size);
-		update_size = true;
-	}
-	spin_unlock(&ip->i_flags_lock);
-
-	if (flags & IOMAP_DIO_COW) {
-		error = xfs_reflink_end_cow(ip, offset, size);
-		if (error)
-			return error;
-	}
-
-	if (flags & IOMAP_DIO_UNWRITTEN)
-		error = xfs_iomap_write_unwritten(ip, offset, size);
-	else if (update_size)
+		spin_unlock(&ip->i_flags_lock);
 		error = xfs_setfilesize(ip, offset, size);
+	} else {
+		spin_unlock(&ip->i_flags_lock);
+	}
 
 	return error;
 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a1909bc064e9..f179bdf1644d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -829,7 +829,8 @@ int
 xfs_iomap_write_unwritten(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
-	xfs_off_t	count)
+	xfs_off_t	count,
+	bool		update_isize)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -840,6 +841,7 @@ xfs_iomap_write_unwritten(
 	xfs_trans_t	*tp;
 	xfs_bmbt_irec_t imap;
 	struct xfs_defer_ops dfops;
+	struct inode	*inode = VFS_I(ip);
 	xfs_fsize_t	i_size;
 	uint		resblks;
 	int		error;
@@ -899,7 +901,8 @@ xfs_iomap_write_unwritten(
 		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
 		if (i_size > offset + count)
 			i_size = offset + count;
-
+		if (update_isize && i_size > i_size_read(inode))
+			i_size_write(inode, i_size);
 		i_size = xfs_new_eof(ip, i_size);
 		if (i_size) {
 			ip->i_d.di_size = i_size;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 00db3ecea084..ee535065c5d0 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
 int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
 			struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 2f2dc3c09ad0..4246876df7b7 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -274,7 +274,7 @@ xfs_fs_commit_blocks(
 					(end - 1) >> PAGE_SHIFT);
 		WARN_ON_ONCE(error);
 
-		error = xfs_iomap_write_unwritten(ip, start, length);
+		error = xfs_iomap_write_unwritten(ip, start, length, false);
 		if (error)
 			goto out_drop_iolock;
 	}
-- 
2.13.5
^ permalink raw reply related	[flat|nested] 2+ messages in thread
* Re: [PATCH v2] xfs: update i_size after unwritten conversion in dio completion
  2017-09-21 15:52 [PATCH v2] xfs: update i_size after unwritten conversion in dio completion Eryu Guan
@ 2017-09-26 15:00 ` Christoph Hellwig
  0 siblings, 0 replies; 2+ messages in thread
From: Christoph Hellwig @ 2017-09-26 15:00 UTC (permalink / raw)
  To: Eryu Guan; +Cc: linux-xfs
On Thu, Sep 21, 2017 at 11:52:46PM +0800, Eryu Guan wrote:
> Since commit d531d91d6990 ("xfs: always use unwritten extents for
> direct I/O writes"), we start allocating unwritten extents for all
> direct writes to allow appending aio in XFS.
> 
> But for dio writes that could extend file size we update the in-core
> inode size first, then convert the unwritten extents to real
> allocations at dio completion time in xfs_dio_write_end_io(). Thus a
> racing direct read could see the new i_size and find the unwritten
> extents first and read zeros instead of actual data, if the direct
> writer also takes a shared iolock.
> 
> Fix it by updating the in-core inode size after the unwritten extent
> conversion. To do this, introduce a new boolean argument to
> xfs_iomap_write_unwritten() to tell if we want to update in-core
> i_size or not.
> 
> Suggested-by: Brian Foster <bfoster@redhat.com>
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> Signed-off-by: Eryu Guan <eguan@redhat.com>
> ---
> v2:
> - fix comments by copying Brian's words :)
> - fix code style, remove unnecessary blank lines and braces
> 
> v1: https://marc.info/?l=linux-xfs&m=150599032124565&w=2
> 
>  fs/xfs/xfs_aops.c  |  3 ++-
>  fs/xfs/xfs_file.c  | 33 +++++++++++++++++++--------------
>  fs/xfs/xfs_iomap.c |  7 +++++--
>  fs/xfs/xfs_iomap.h |  2 +-
>  fs/xfs/xfs_pnfs.c  |  2 +-
>  5 files changed, 28 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 29172609f2a3..f18e5932aec4 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -343,7 +343,8 @@ xfs_end_io(
>  		error = xfs_reflink_end_cow(ip, offset, size);
>  		break;
>  	case XFS_IO_UNWRITTEN:
> -		error = xfs_iomap_write_unwritten(ip, offset, size);
> +		/* writeback should never update isize */
> +		error = xfs_iomap_write_unwritten(ip, offset, size, false);
Can we expand this to
	/* writeback should never update the in-core inode size */
> @@ -459,20 +473,11 @@ xfs_dio_write_end_io(
>  	spin_lock(&ip->i_flags_lock);
>  	if (offset + size > i_size_read(inode)) {
>  		i_size_write(inode, offset + size);
> +		spin_unlock(&ip->i_flags_lock);
>  		error = xfs_setfilesize(ip, offset, size);
> +	} else {
> +		spin_unlock(&ip->i_flags_lock);
> +	}
I find the old update_isize scheme a little easier to read, but it
shouldn't matter in the end:
  	spin_lock(&ip->i_flags_lock);
  	if (offset + size > i_size_read(inode)) {
  		i_size_write(inode, offset + size);
		update_isize = true;
	}
	spin_unlock(&ip->i_flags_lock);
	if (!update_isize)
		return 0;
	return xfs_setfilesize(ip, offset, size);
>  	xfs_bmbt_irec_t imap;
>  	struct xfs_defer_ops dfops;
> +	struct inode	*inode = VFS_I(ip);
>  	xfs_fsize_t	i_size;
>  	uint		resblks;
>  	int		error;
> @@ -899,7 +901,8 @@ xfs_iomap_write_unwritten(
>  		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
>  		if (i_size > offset + count)
>  			i_size = offset + count;
> -
> +		if (update_isize && i_size > i_size_read(inode))
> +			i_size_write(inode, i_size);
>  		i_size = xfs_new_eof(ip, i_size);
>  		if (i_size) {
>  			ip->i_d.di_size = i_size;
I wonder if this might be cleaner if we expand xfs_new_eof for
the update_isize case.  Something like the untested helper below:
static bool
xfs_alloc_update_isize(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_filblks_t		count_fsb,
	bool			update_incore_isize)
{
	xfs_fsize_t		i_size;
	i_size = XFS_FSB_TO_B(ip->i_mount, offset_fsb + count_fsb);
	if (i_size > offset + count)
		i_size = offset + count;
	if (update_isize) {
		if (i_size <= i_size_read(inode))
			return;
		i_size_write(inode, i_size);
	} else {
		i_size = xfs_new_eof(ip, i_size);
		if (!i_size)
			return;
	}
	ip->i_d.di_size = i_size;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
^ permalink raw reply	[flat|nested] 2+ messages in thread
end of thread, other threads:[~2017-09-26 15:00 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-09-21 15:52 [PATCH v2] xfs: update i_size after unwritten conversion in dio completion Eryu Guan
2017-09-26 15:00 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).