linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <djwong@kernel.org>
To: Dave Chinner <david@fromorbit.com>
Cc: linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH 6/7] xfs: use iomap_valid method to detect stale cached iomaps
Date: Wed, 2 Nov 2022 10:19:33 -0700	[thread overview]
Message-ID: <Y2KmpcD5ioig6n/O@magnolia> (raw)
In-Reply-To: <20221101003412.3842572-7-david@fromorbit.com>

On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that iomap supports a mechanism to validate cached iomaps for
> buffered write operations, hook it up to the XFS buffered write ops
> so that we can avoid data corruptions that result from stale cached
> iomaps. See:
> 
> https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
> 
> or the ->iomap_valid() introduction commit for exact details of the
> corruption vector.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_bmap.c |  4 +--
>  fs/xfs/xfs_aops.c        |  2 +-
>  fs/xfs/xfs_iomap.c       | 69 ++++++++++++++++++++++++++++++----------
>  fs/xfs/xfs_iomap.h       |  4 +--
>  fs/xfs/xfs_pnfs.c        |  5 +--
>  5 files changed, 61 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 49d0d4ea63fc..db225130618c 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc(
>  	 * the extent.  Just return the real extent at this offset.
>  	 */
>  	if (!isnullstartblock(bma.got.br_startblock)) {
> -		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
>  		*seq = READ_ONCE(ifp->if_seq);
> +		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
>  		goto out_trans_cancel;
>  	}
>  
> @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc(
>  	XFS_STATS_INC(mp, xs_xstrat_quick);
>  
>  	ASSERT(!isnullstartblock(bma.got.br_startblock));
> -	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
>  	*seq = READ_ONCE(ifp->if_seq);
> +	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
>  
>  	if (whichfork == XFS_COW_FORK)

Hmm.  @ifp here could be the cow fork, which means *seq will be sampled
from the cow fork.  I think this is going to cause problems in
xfs_buffered_write_iomap_valid, which compares the cached value against
the data fork's sequence counter...

>  		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 5d1a995b15f8..ca5a9e45a48c 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -373,7 +373,7 @@ xfs_map_blocks(
>  	    isnullstartblock(imap.br_startblock))
>  		goto allocate_blocks;
>  
> -	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
> +	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
>  	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
>  	return 0;
>  allocate_blocks:
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 2d48fcc7bd6f..5053ffcf10fe 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -54,7 +54,8 @@ xfs_bmbt_to_iomap(
>  	struct iomap		*iomap,
>  	struct xfs_bmbt_irec	*imap,
>  	unsigned int		mapping_flags,
> -	u16			iomap_flags)
> +	u16			iomap_flags,
> +	int			sequence)

Nit: The sequence numbers are unsigned int, not signed int.

>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> @@ -91,6 +92,9 @@ xfs_bmbt_to_iomap(
>  	if (xfs_ipincount(ip) &&
>  	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
>  		iomap->flags |= IOMAP_F_DIRTY;
> +
> +	/* The extent tree sequence is needed for iomap validity checking. */
> +	*((int *)&iomap->private) = sequence;
>  	return 0;
>  }
>  
> @@ -195,7 +199,8 @@ xfs_iomap_write_direct(
>  	xfs_fileoff_t		offset_fsb,
>  	xfs_fileoff_t		count_fsb,
>  	unsigned int		flags,
> -	struct xfs_bmbt_irec	*imap)
> +	struct xfs_bmbt_irec	*imap,
> +	int			*seq)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_trans	*tp;
> @@ -285,6 +290,8 @@ xfs_iomap_write_direct(
>  		error = xfs_alert_fsblock_zero(ip, imap);
>  
>  out_unlock:
> +	if (seq)
> +		*seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
>  
> @@ -743,6 +750,7 @@ xfs_direct_write_iomap_begin(
>  	bool			shared = false;
>  	u16			iomap_flags = 0;
>  	unsigned int		lockmode = XFS_ILOCK_SHARED;
> +	int			seq;
>  
>  	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
>  
> @@ -811,9 +819,10 @@ xfs_direct_write_iomap_begin(
>  			goto out_unlock;
>  	}
>  
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
>  
>  allocate_blocks:
>  	error = -EAGAIN;
> @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin(
>  	xfs_iunlock(ip, lockmode);
>  
>  	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
> -			flags, &imap);
> +			flags, &imap, &seq);
>  	if (error)
>  		return error;
>  
>  	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
>  	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> -				 iomap_flags | IOMAP_F_NEW);
> +				 iomap_flags | IOMAP_F_NEW, seq);
>  
>  out_found_cow:
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
>  	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
>  	if (imap.br_startblock != HOLESTARTBLOCK) {
> -		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
>  		if (error)
>  			return error;
>  	}
> -	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
> +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);

Here we've found a mapping from the COW fork and we're encoding it into
the struct iomap.  Why is the sequence number from the *data* fork and
not the COW fork?

>  
>  out_unlock:
>  	if (lockmode)
> @@ -915,6 +925,7 @@ xfs_buffered_write_iomap_begin(
>  	int			allocfork = XFS_DATA_FORK;
>  	int			error = 0;
>  	unsigned int		lockmode = XFS_ILOCK_EXCL;
> +	int			seq;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -1094,26 +1105,29 @@ xfs_buffered_write_iomap_begin(
>  	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
>  	 * them out if the write happens to fail.
>  	 */
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
>  
>  found_imap:
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
>  
>  found_cow:
> +	seq = READ_ONCE(ip->i_df.if_seq);

Same question here.

>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	if (imap.br_startoff <= offset_fsb) {
> -		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
>  		if (error)
>  			return error;
>  		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> -					 IOMAP_F_SHARED);
> +					 IOMAP_F_SHARED, seq);
>  	}
>  
>  	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
> -	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
>  
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end(
>  	return 0;
>  }
>  
> +/*
> + * Check that the iomap passed to us is still valid for the given offset and
> + * length.
> + */
> +static bool
> +xfs_buffered_write_iomap_valid(
> +	struct inode		*inode,
> +	const struct iomap	*iomap)
> +{
> +	int			seq = *((int *)&iomap->private);
> +
> +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))

Why is it ok to sample the data fork's sequence number even if the
mapping came from the COW fork?  That doesn't make sense to me,
conceptually.  I definitely think it's buggy that the revalidation
might not sample from the same sequence counter as the original
measurement.

This makes me even more curious: why shouldn't we return the sampled u32
counter value and a pointer to the u32 counter in the struct iomap, so
that XFS can keep the sample/revalidation pairing straight and iomap can
avoid a second indirect call?

> +		return false;
> +	return true;
> +}
> +
>  const struct iomap_ops xfs_buffered_write_iomap_ops = {
>  	.iomap_begin		= xfs_buffered_write_iomap_begin,
>  	.iomap_end		= xfs_buffered_write_iomap_end,
> +	.iomap_valid		= xfs_buffered_write_iomap_valid,
>  };
>  
>  /*
> @@ -1359,6 +1390,7 @@ xfs_read_iomap_begin(
>  	int			nimaps = 1, error = 0;
>  	bool			shared = false;
>  	unsigned int		lockmode = XFS_ILOCK_SHARED;
> +	int			seq;
>  
>  	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
>  
> @@ -1372,13 +1404,14 @@ xfs_read_iomap_begin(
>  			       &nimaps, 0);
>  	if (!error && (flags & IOMAP_REPORT))
>  		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  
>  	if (error)
>  		return error;
>  	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
>  	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> -				 shared ? IOMAP_F_SHARED : 0);
> +				 shared ? IOMAP_F_SHARED : 0, seq);
>  }
>  
>  const struct iomap_ops xfs_read_iomap_ops = {
> @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin(
>  			end_fsb = min(end_fsb, data_fsb);
>  		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
>  		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> -					  IOMAP_F_SHARED);
> +				IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq));

Here we explicitly sample from the cow fork but the comparison is done
against the data fork.  That /probably/ doesn't matter for reads since
you're not proposing that we revalidate on those.  Should we?  What
happens if userspace hands us a large read() from an unwritten extent
and the same dirty/writeback/reclaim sequence happens to the last folio
in that read() range before read_iter actually gets there?

>  		/*
>  		 * This is a COW extent, so we must probe the page cache
>  		 * because there could be dirty page cache being backed
> @@ -1460,7 +1493,8 @@ xfs_seek_iomap_begin(
>  	imap.br_state = XFS_EXT_NORM;
>  done:
>  	xfs_trim_extent(&imap, offset_fsb, end_fsb);
> -	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0,
> +			READ_ONCE(ip->i_df.if_seq));
>  out_unlock:
>  	xfs_iunlock(ip, lockmode);
>  	return error;
> @@ -1486,6 +1520,7 @@ xfs_xattr_iomap_begin(
>  	struct xfs_bmbt_irec	imap;
>  	int			nimaps = 1, error = 0;
>  	unsigned		lockmode;
> +	int			seq;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -1502,12 +1537,14 @@ xfs_xattr_iomap_begin(
>  	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>  			       &nimaps, XFS_BMAPI_ATTRFORK);
>  out_unlock:
> +
> +	seq = READ_ONCE(ip->i_af.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  
>  	if (error)
>  		return error;
>  	ASSERT(nimaps);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
>  }
>  
>  const struct iomap_ops xfs_xattr_iomap_ops = {
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 0f62ab633040..792fed2a9072 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -13,14 +13,14 @@ struct xfs_bmbt_irec;
>  
>  int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
>  		xfs_fileoff_t count_fsb, unsigned int flags,
> -		struct xfs_bmbt_irec *imap);
> +		struct xfs_bmbt_irec *imap, int *sequence);
>  int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
>  xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
>  		xfs_fileoff_t end_fsb);
>  
>  int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
>  		struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
> -		u16 iomap_flags);
> +		u16 iomap_flags, int sequence);
>  
>  int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
>  		bool *did_zero);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 37a24f0f7cd4..eea507a80c5c 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -125,6 +125,7 @@ xfs_fs_map_blocks(
>  	int			nimaps = 1;
>  	uint			lock_flags;
>  	int			error = 0;
> +	int			seq;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -189,7 +190,7 @@ xfs_fs_map_blocks(
>  		xfs_iunlock(ip, lock_flags);
>  
>  		error = xfs_iomap_write_direct(ip, offset_fsb,
> -				end_fsb - offset_fsb, 0, &imap);
> +				end_fsb - offset_fsb, 0, &imap, &seq);

I got a compiler warning about seq not being set in the case where we
don't call xfs_iomap_write_direct.

--D

>  		if (error)
>  			goto out_unlock;
>  
> @@ -209,7 +210,7 @@ xfs_fs_map_blocks(
>  	}
>  	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
>  
> -	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
>  	*device_generation = mp->m_generation;
>  	return error;
>  out_unlock:
> -- 
> 2.37.2
> 

  parent reply	other threads:[~2022-11-02 17:19 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-01  0:34 xfs, iomap: fix data corrupton due to stale cached iomaps Dave Chinner
2022-11-01  0:34 ` [PATCH 1/7] xfs: write page faults in iomap are not buffered writes Dave Chinner
2022-11-02  7:17   ` Christoph Hellwig
2022-11-02 16:12   ` Darrick J. Wong
2022-11-02 21:11     ` Dave Chinner
2022-11-01  0:34 ` [PATCH 2/7] xfs: punching delalloc extents on write failure is racy Dave Chinner
2022-11-02  7:18   ` Christoph Hellwig
2022-11-02 16:22   ` Darrick J. Wong
2022-11-01  0:34 ` [PATCH 3/7] xfs: use byte ranges for write cleanup ranges Dave Chinner
2022-11-02  7:20   ` Christoph Hellwig
2022-11-02 16:32   ` Darrick J. Wong
2022-11-04  5:40     ` Dave Chinner
2022-11-07 23:53       ` Darrick J. Wong
2022-11-01  0:34 ` [PATCH 4/7] xfs: buffered write failure should not truncate the page cache Dave Chinner
2022-11-01 11:57   ` kernel test robot
2022-11-02  7:24   ` Christoph Hellwig
2022-11-02 20:57     ` Dave Chinner
2022-11-02 16:41   ` Darrick J. Wong
2022-11-02 21:04     ` Dave Chinner
2022-11-02 22:26       ` Darrick J. Wong
2022-11-04  8:08   ` Christoph Hellwig
2022-11-04 23:10     ` Dave Chinner
2022-11-07 23:48       ` Darrick J. Wong
2022-11-01  0:34 ` [PATCH 5/7] iomap: write iomap validity checks Dave Chinner
2022-11-02  8:36   ` Christoph Hellwig
2022-11-02 16:43     ` Darrick J. Wong
2022-11-02 16:58       ` Darrick J. Wong
2022-11-03  0:35         ` Dave Chinner
2022-11-04  8:12           ` Christoph Hellwig
2022-11-02 16:57   ` Darrick J. Wong
2022-11-01  0:34 ` [PATCH 6/7] xfs: use iomap_valid method to detect stale cached iomaps Dave Chinner
2022-11-01  9:15   ` kernel test robot
2022-11-02  8:41   ` Christoph Hellwig
2022-11-02 21:39     ` Dave Chinner
2022-11-04  8:14       ` Christoph Hellwig
2022-11-02 17:19   ` Darrick J. Wong [this message]
2022-11-02 22:36     ` Dave Chinner
2022-11-08  0:00       ` Darrick J. Wong
2022-11-01  0:34 ` [PATCH 7/7] xfs: drop write error injection is unfixable, remove it Dave Chinner
2022-11-01  3:39 ` xfs, iomap: fix data corrupton due to stale cached iomaps Darrick J. Wong
2022-11-01  4:21   ` Dave Chinner
2022-11-02 17:23     ` Darrick J. Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Y2KmpcD5ioig6n/O@magnolia \
    --to=djwong@kernel.org \
    --cc=david@fromorbit.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).