Re: [PATCH] xfs: rewrite getbmap using the xfs_iext_* helpers

linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: Christoph Hellwig <hch@lst.de>
Cc: linux-xfs@vger.kernel.org
Subject: Re: [PATCH] xfs: rewrite getbmap using the xfs_iext_* helpers
Date: Mon, 28 Aug 2017 14:20:24 -0700	[thread overview]
Message-ID: <20170828212024.GI4757@magnolia> (raw)
In-Reply-To: <20170828150612.16437-1-hch@lst.de>

On Mon, Aug 28, 2017 at 05:06:12PM +0200, Christoph Hellwig wrote:
> Currently getbmap uses xfs_bmapi_read to query the extent map, and then
> fixes up various bits that are eventually reported to userspace.
> 
> This patch instead rewrites it to use xfs_iext_lookup_extent and
> xfs_iext_get_extent to iteratively process the extent map.  This not
> only avoids the need to allocate a map for the returned xfs_bmbt_irec
> structures but also greatly simplifies the code.
> 
> There are two intentional behavior changes compared to the old code:
> 
>  - the current code reports unwritten extents that don't directly border
>    a written one as unwritten even when not passing the BMV_IF_PREALLOC
>    option, contrary to the documentation.  The new code requires the
>    BMV_IF_PREALLOC flag to report the unwritten extent bit.
>  - The new code never merges consecutive extents, unlike the old
>    code that sometimes does it based on the boundaries of the
>    xfs_bmapi_read calls.  Note that the extent merging behavior was
>    entirely undocumented.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_bmap_util.c | 499 ++++++++++++++++++-------------------------------
>  1 file changed, 185 insertions(+), 314 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 93e955262d07..5962f119d4ff 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -404,125 +404,69 @@ xfs_bmap_count_blocks(
>  	return 0;
>  }
>  
> -/*
> - * returns 1 for success, 0 if we failed to map the extent.
> - */
> -STATIC int
> -xfs_getbmapx_fix_eof_hole(
> -	xfs_inode_t		*ip,		/* xfs incore inode pointer */
> -	int			whichfork,
> -	struct getbmapx		*out,		/* output structure */
> -	int			prealloced,	/* this is a file with
> -						 * preallocated data space */
> -	int64_t			end,		/* last block requested */
> -	xfs_fsblock_t		startblock,
> -	bool			moretocome)
> +static void
> +xfs_getbmap_report_one(
> +	struct xfs_inode		*ip,
> +	struct getbmapx			*bmv,
> +	int64_t				bmv_end,
> +	struct xfs_bmbt_irec		*got,
> +	struct getbmapx			*p)
>  {
> -	int64_t			fixlen;
> -	xfs_mount_t		*mp;		/* file system mount point */
> -	xfs_ifork_t		*ifp;		/* inode fork pointer */
> -	xfs_extnum_t		lastx;		/* last extent pointer */
> -	xfs_fileoff_t		fileblock;
> -
> -	if (startblock == HOLESTARTBLOCK) {
> -		mp = ip->i_mount;
> -		out->bmv_block = -1;
> -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
> -		fixlen -= out->bmv_offset;
> -		if (prealloced && out->bmv_offset + out->bmv_length == end) {
> -			/* Came to hole at EOF. Trim it. */
> -			if (fixlen <= 0)
> -				return 0;
> -			out->bmv_length = fixlen;
> -		}
> +	if (isnullstartblock(got->br_startblock) ||
> +	    got->br_startblock == DELAYSTARTBLOCK) {
> +		/*
> +		 * Delalloc extents that start beyond EOF can occur due to
> +		 * speculative EOF allocation when the delalloc extent is larger
> +		 * than the largest freespace extent at conversion time.  These
> +		 * extents cannot be converted by data writeback, so can exist
> +		 * here even if we are not supposed to be finding delalloc
> +		 * extents.
> +		 */
> +		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
> +			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
> +
> +		p->bmv_oflags |= BMV_OF_DELALLOC;
> +		p->bmv_block = -2;
>  	} else {
> -		if (startblock == DELAYSTARTBLOCK)
> -			out->bmv_block = -2;
> -		else
> -			out->bmv_block = xfs_fsb_to_db(ip, startblock);
> -		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
> -		ifp = XFS_IFORK_PTR(ip, whichfork);
> -		if (!moretocome &&
> -		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
> -		   (lastx == xfs_iext_count(ifp) - 1))
> -			out->bmv_oflags |= BMV_OF_LAST;
> +		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
>  	}
>  
> -	return 1;
> +	if (got->br_state == XFS_EXT_UNWRITTEN &&
> +	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
> +		p->bmv_oflags |= BMV_OF_PREALLOC;
> +
> +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
> +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
> +
> +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> +	bmv->bmv_entries++;
>  }
>  
> -/* Adjust the reported bmap around shared/unshared extent transitions. */
> -STATIC int
> -xfs_getbmap_adjust_shared(
> +static void
> +xfs_getbmap_report_hole(
>  	struct xfs_inode		*ip,
> -	int				whichfork,
> -	struct xfs_bmbt_irec		*map,
> -	struct getbmapx			*out,
> -	struct xfs_bmbt_irec		*next_map)
> +	struct getbmapx			*bmv,
> +	int64_t				bmv_end,
> +	xfs_fileoff_t			bno,
> +	xfs_fileoff_t			end,
> +	struct getbmapx			*p)
>  {
> -	struct xfs_mount		*mp = ip->i_mount;
> -	xfs_agnumber_t			agno;
> -	xfs_agblock_t			agbno;
> -	xfs_agblock_t			ebno;
> -	xfs_extlen_t			elen;
> -	xfs_extlen_t			nlen;
> -	int				error;
> +	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
> +		return;
>  
> -	next_map->br_startblock = NULLFSBLOCK;
> -	next_map->br_startoff = NULLFILEOFF;
> -	next_map->br_blockcount = 0;
> +	p->bmv_block = -1;
> +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
> +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
>  
> -	/* Only written data blocks can be shared. */
> -	if (!xfs_is_reflink_inode(ip) ||
> -	    whichfork != XFS_DATA_FORK ||
> -	    !xfs_bmap_is_real_extent(map))
> -		return 0;
> -
> -	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
> -	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
> -	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
> -			map->br_blockcount, &ebno, &elen, true);
> -	if (error)
> -		return error;
> -
> -	if (ebno == NULLAGBLOCK) {
> -		/* No shared blocks at all. */
> -		return 0;
> -	} else if (agbno == ebno) {
> -		/*
> -		 * Shared extent at (agbno, elen).  Shrink the reported
> -		 * extent length and prepare to move the start of map[i]
> -		 * to agbno+elen, with the aim of (re)formatting the new
> -		 * map[i] the next time through the inner loop.
> -		 */
> -		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
> -		out->bmv_oflags |= BMV_OF_SHARED;
> -		if (elen != map->br_blockcount) {
> -			*next_map = *map;
> -			next_map->br_startblock += elen;
> -			next_map->br_startoff += elen;
> -			next_map->br_blockcount -= elen;
> -		}
> -		map->br_blockcount -= elen;
> -	} else {
> -		/*
> -		 * There's an unshared extent (agbno, ebno - agbno)
> -		 * followed by shared extent at (ebno, elen).  Shrink
> -		 * the reported extent length to cover only the unshared
> -		 * extent and prepare to move up the start of map[i] to
> -		 * ebno, with the aim of (re)formatting the new map[i]
> -		 * the next time through the inner loop.
> -		 */
> -		*next_map = *map;
> -		nlen = ebno - agbno;
> -		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
> -		next_map->br_startblock += nlen;
> -		next_map->br_startoff += nlen;
> -		next_map->br_blockcount -= nlen;
> -		map->br_blockcount -= nlen;
> -	}
> +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> +	bmv->bmv_entries++;
> +}
>  
> -	return 0;
> +static inline bool xfs_getbmap_full(struct getbmapx *bmv, int nr_entries)
> +{
> +	return bmv->bmv_length == 0 || nr_entries >= bmv->bmv_count - 1;
>  }
>  
>  /*
> @@ -539,119 +483,72 @@ xfs_getbmap(
>  	xfs_bmap_format_t	formatter,	/* format to user */
>  	void			*arg)		/* formatter arg */
>  {
> -	int64_t			bmvend;		/* last block requested */
> -	int			error = 0;	/* return value */
> -	int64_t			fixlen;		/* length for -1 case */
> -	int			i;		/* extent number */
> -	int			lock;		/* lock state */
> -	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
> -	xfs_mount_t		*mp;		/* file system mount point */
> -	int			nex;		/* # of user extents can do */
> -	int			subnex;		/* # of bmapi's can do */
> -	int			nmap;		/* number of map entries */
> -	struct getbmapx		*out;		/* output structure */
> -	int			whichfork;	/* data or attr fork */
> -	int			prealloced;	/* this is a file with
> -						 * preallocated data space */
> -	int			iflags;		/* interface flags */
> -	int			bmapi_flags;	/* flags for xfs_bmapi */
> -	int			cur_ext = 0;
> -	struct xfs_bmbt_irec	inject_map;
> -
> -	mp = ip->i_mount;
> -	iflags = bmv->bmv_iflags;
> +	struct xfs_mount	*mp = ip->i_mount;
> +	int			iflags = bmv->bmv_iflags;
> +	int			whichfork, lock, i, nr_entries = 0, error = 0;
> +	int64_t			bmv_end, max_len;
> +	xfs_fileoff_t		bno, first_bno;
> +	struct xfs_ifork	*ifp;
> +	struct getbmapx		*out;
> +	struct xfs_bmbt_irec	got, rec;
> +	xfs_filblks_t		len;
> +	xfs_extnum_t		idx;
>  
>  #ifndef DEBUG
>  	/* Only allow CoW fork queries if we're debugging. */
>  	if (iflags & BMV_IF_COWFORK)
>  		return -EINVAL;
>  #endif
> +
>  	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
>  		return -EINVAL;
>  
> +	if (bmv->bmv_count <= 1)
> +		return -EINVAL;
> +	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> +		return -ENOMEM;
> +
> +	if (bmv->bmv_length < -1)
> +		return -EINVAL;
> +
> +	bmv->bmv_entries = 0;
> +	if (bmv->bmv_length == 0)
> +		return 0;
> +
> +	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> +	if (!out)
> +		return -ENOMEM;
> +
>  	if (iflags & BMV_IF_ATTRFORK)
>  		whichfork = XFS_ATTR_FORK;
>  	else if (iflags & BMV_IF_COWFORK)
>  		whichfork = XFS_COW_FORK;
>  	else
>  		whichfork = XFS_DATA_FORK;
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
>  
> +	xfs_ilock(ip, XFS_IOLOCK_SHARED);
>  	switch (whichfork) {
>  	case XFS_ATTR_FORK:
> -		if (XFS_IFORK_Q(ip)) {
> -			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
> -			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
> -			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
> -				return -EINVAL;
> -		} else if (unlikely(
> -			   ip->i_d.di_aformat != 0 &&
> -			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
> -			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
> -					 ip->i_mount);
> -			return -EFSCORRUPTED;
> -		}
> +		if (!XFS_IFORK_Q(ip))
> +			goto out_unlock_iolock;
>  
> -		prealloced = 0;
> -		fixlen = 1LL << 32;
> +		max_len = 1LL << 32;
> +		lock = xfs_ilock_attr_map_shared(ip);
>  		break;
>  	case XFS_COW_FORK:
> -		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
> -			return -EINVAL;
> +		/* No CoW fork? Just return */
> +		if (!ifp)
> +			goto out_unlock_iolock;
>  
> -		if (xfs_get_cowextsz_hint(ip)) {
> -			prealloced = 1;
> -			fixlen = mp->m_super->s_maxbytes;
> -		} else {
> -			prealloced = 0;
> -			fixlen = XFS_ISIZE(ip);
> -		}
> -		break;
> -	default:
> -		/* Local format data forks report no extents. */
> -		if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
> -			bmv->bmv_entries = 0;
> -			return 0;
> -		}
> -		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
> -		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
> -			return -EINVAL;
> +		if (xfs_get_cowextsz_hint(ip))
> +			max_len = mp->m_super->s_maxbytes;
> +		else
> +			max_len = XFS_ISIZE(ip);
>  
> -		if (xfs_get_extsz_hint(ip) ||
> -		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
> -			prealloced = 1;
> -			fixlen = mp->m_super->s_maxbytes;
> -		} else {
> -			prealloced = 0;
> -			fixlen = XFS_ISIZE(ip);
> -		}
> +		lock = XFS_ILOCK_SHARED;
> +		xfs_ilock(ip, lock);
>  		break;
> -	}
> -
> -	if (bmv->bmv_length == -1) {
> -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
> -		bmv->bmv_length =
> -			max_t(int64_t, fixlen - bmv->bmv_offset, 0);
> -	} else if (bmv->bmv_length == 0) {
> -		bmv->bmv_entries = 0;
> -		return 0;
> -	} else if (bmv->bmv_length < 0) {
> -		return -EINVAL;
> -	}
> -
> -	nex = bmv->bmv_count - 1;
> -	if (nex <= 0)
> -		return -EINVAL;
> -	bmvend = bmv->bmv_offset + bmv->bmv_length;
> -
> -
> -	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> -		return -ENOMEM;
> -	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> -	if (!out)
> -		return -ENOMEM;
> -
> -	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	switch (whichfork) {
>  	case XFS_DATA_FORK:
>  		if (!(iflags & BMV_IF_DELALLOC) &&
>  		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
> @@ -669,147 +566,121 @@ xfs_getbmap(
>  			 */
>  		}
>  
> +		if (xfs_get_extsz_hint(ip) ||
> +		    (ip->i_d.di_flags &
> +		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
> +			max_len = mp->m_super->s_maxbytes;
> +		else
> +			max_len = XFS_ISIZE(ip);
> +
>  		lock = xfs_ilock_data_map_shared(ip);
>  		break;
> -	case XFS_COW_FORK:
> -		lock = XFS_ILOCK_SHARED;
> -		xfs_ilock(ip, lock);
> -		break;
> -	case XFS_ATTR_FORK:
> -		lock = xfs_ilock_attr_map_shared(ip);
> +	}
> +
> +	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
> +	case XFS_DINODE_FMT_EXTENTS:
> +	case XFS_DINODE_FMT_BTREE:
>  		break;
> +	case XFS_DINODE_FMT_LOCAL:
> +		/* Local format inode forks report no extents. */
> +		goto out_unlock_ilock;
> +	default:
> +		error = -EINVAL;
> +		goto out_unlock_ilock;
>  	}
>  
> -	/*
> -	 * Don't let nex be bigger than the number of extents
> -	 * we can have assuming alternating holes and real extents.
> -	 */
> -	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
> -		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
> +	if (bmv->bmv_length == -1) {
> +		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
> +		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
> +	}
>  
> -	bmapi_flags = xfs_bmapi_aflag(whichfork);
> -	if (!(iflags & BMV_IF_PREALLOC))
> -		bmapi_flags |= XFS_BMAPI_IGSTATE;
> +	bmv_end = bmv->bmv_offset + bmv->bmv_length;
>  
> -	/*
> -	 * Allocate enough space to handle "subnex" maps at a time.
> -	 */
> -	error = -ENOMEM;
> -	subnex = 16;
> -	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
> -	if (!map)
> +	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
> +	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
> +
> +	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +		error = xfs_iread_extents(NULL, ip, whichfork);
> +		if (error)
> +			goto out_unlock_ilock;
> +	}
> +
> +	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
>  		goto out_unlock_ilock;
>  
> -	bmv->bmv_entries = 0;
> +	while (!xfs_getbmap_full(bmv, nr_entries)) {
> +		struct getbmapx		*p = &out[nr_entries];
>  
> -	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
> -	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
> -		error = 0;
> -		goto out_free_map;
> -	}
> +		xfs_trim_extent(&got, first_bno, len);
>  
> -	do {
> -		nmap = (nex> subnex) ? subnex : nex;
> -		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
> -				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
> -				       map, &nmap, bmapi_flags);
> -		if (error)
> -			goto out_free_map;
> -		ASSERT(nmap <= subnex);
> -
> -		for (i = 0; i < nmap && bmv->bmv_length &&
> -				cur_ext < bmv->bmv_count - 1; i++) {
> -			out[cur_ext].bmv_oflags = 0;
> -			if (map[i].br_state == XFS_EXT_UNWRITTEN)
> -				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
> -			else if (map[i].br_startblock == DELAYSTARTBLOCK)
> -				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
> -			out[cur_ext].bmv_offset =
> -				XFS_FSB_TO_BB(mp, map[i].br_startoff);
> -			out[cur_ext].bmv_length =
> -				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
> -			out[cur_ext].bmv_unused1 = 0;
> -			out[cur_ext].bmv_unused2 = 0;
> +		/*
> +		 * Report an entry for a hole if this extent doesn't directly
> +		 * follow the previous one.
> +		 */
> +		if (got.br_startoff > bno) {
> +			xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> +					got.br_startoff, p++);
> +			if (xfs_getbmap_full(bmv, ++nr_entries))
> +				break;
> +		}
>  
> -			/*
> -			 * delayed allocation extents that start beyond EOF can
> -			 * occur due to speculative EOF allocation when the
> -			 * delalloc extent is larger than the largest freespace
> -			 * extent at conversion time. These extents cannot be
> -			 * converted by data writeback, so can exist here even
> -			 * if we are not supposed to be finding delalloc
> -			 * extents.
> -			 */
> -			if (map[i].br_startblock == DELAYSTARTBLOCK &&
> -			    map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
> -				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
> -
> -                        if (map[i].br_startblock == HOLESTARTBLOCK &&
> -			    whichfork == XFS_ATTR_FORK) {
> -				/* came to the end of attribute fork */
> -				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
> -				goto out_free_map;
> -			}
> +		/*
> +		 * In order to report shared extents accurately, we report each
> +		 * distinct shared / unshared part of a single bmbt record with
> +		 * an individual getbmapx record.
> +		 */
> +		rec = got;
> +		for (;;) {

while (!xfs_getbmap_full()) ?

> +			bool			shared = false, trimmed = false;
> +			xfs_fileoff_t		len;
>  
> -			/* Is this a shared block? */
> -			error = xfs_getbmap_adjust_shared(ip, whichfork,
> -					&map[i], &out[cur_ext], &inject_map);
> +			error = xfs_reflink_trim_around_shared(ip, &rec,
> +					&shared, &trimmed);
>  			if (error)
> -				goto out_free_map;
> +				goto out_unlock_ilock;
>  
> -			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
> -					&out[cur_ext], prealloced, bmvend,
> -					map[i].br_startblock,
> -					inject_map.br_startblock != NULLFSBLOCK))
> -				goto out_free_map;
> +			xfs_getbmap_report_one(ip, bmv, bmv_end, &rec, p);
> +			if (shared)
> +				p->bmv_oflags |= BMV_OF_SHARED;

Shouldn't we advance p/nr_entries?  What if we have a single partially
shared extent?  Also, what's the difference between nr_entries and
bmv->bmv_entries?  They both track the number of bmv entries we've
filled out, right?

I tried a simple test (which I guess we should turn into an xfstests test case, sigh):

$ xfs_io -c 'pwrite 0 1m' /opt/a -f
wrote 1048576/1048576 bytes at offset 0
1 MiB, 256 ops; 0.0000 sec (180.571 MiB/sec and 46226.0744 ops/sec)
$ cp --reflink=always /opt/a /opt/b
$ xfs_io -c 'bmap -elpv' /opt/a
/opt/a:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..2047]:       192..2239         0 (192..2239)       2048 100000
$ xfs_io -c 'bmap -elpv' /opt/b
/opt/b:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..2047]:       192..2239         0 (192..2239)       2048 100000

Then tried to CoW the middle of the extent:

$ xfs_io -c 'pwrite 200k 4k' -c 'pwrite 700k 4k' -c fsync /opt/b
wrote 4096/4096 bytes at offset 204800
4 KiB, 1 ops; 0.0000 sec (5.964 MiB/sec and 1526.7176 ops/sec)
wrote 4096/4096 bytes at offset 716800
4 KiB, 1 ops; 0.0000 sec (19.829 MiB/sec and 5076.1421 ops/sec)
$ xfs_io -c 'bmap -elpv' /opt/a
/opt/a:
 EXT: FILE-OFFSET                     BLOCK-RANGE             AG AG-OFFSET                                TOTAL FLAGS
   0: [1408..2047]:                   1600..2239               0 (1600..2239)                               640 100000
   1: [4289600..8495807]:             delalloc                                                          4206208
   2: [4289432..8591138]:             4206160..8507866         2 (929360..5231066)                      4301707 000000
   3: [4611686027017322496..461168602 4301714..8602913         2 (1024914..5326113)                     4301200 000000
   4: [4226304..-4611686009833227009] 0..-4611686009837453313  0 (0..-4611686009837453313) -4611686009837453312 000000
$ xfs_io -c 'bmap -elpv' /opt/b
/opt/b:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..399]:        192..591          0 (192..591)         400 100000
   1: [400..407]:      2384..2391        0 (2384..2391)         8 000000
   2: [408..1399]:     600..1591         0 (600..1591)        992 100000
   3: [1400..1407]:    3384..3391        0 (3384..3391)         8 000000
   4: [1408..2047]:    1600..2239        0 (1600..2239)       640 100000

Ugh, something is seriously messed up here.  By comparison, FIEMAP works fine:

$ filefrag -v /opt/a
Filesystem type is: 58465342
File size of /opt/a is 1048576 (256 blocks of 4096 bytes)
 ext:     logical_offset:        physical_offset: length:   expected: flags:
   0:        0..      49:         24..        73:     50:             shared
   1:       50..      50:         74..        74:      1:            
   2:       51..     174:         75..       198:    124:             shared
   3:      175..     175:        199..       199:      1:            
   4:      176..     255:        200..       279:     80:             last,shared,eof
/opt/a: 1 extent found
$ filefrag -v /opt/b
Filesystem type is: 58465342
File size of /opt/b is 1048576 (256 blocks of 4096 bytes)
 ext:     logical_offset:        physical_offset: length:   expected: flags:
   0:        0..      49:         24..        73:     50:             shared
   1:       50..      50:        298..       298:      1:         74:
   2:       51..     174:         75..       198:    124:        299: shared
   3:      175..     175:        423..       423:      1:        199:
   4:      176..     255:        200..       279:     80:        424: last,shared,eof
/opt/b: 5 extents found

> +			if (!trimmed)
> +				break;
>  
> -			bmv->bmv_offset =
> -				out[cur_ext].bmv_offset +
> -				out[cur_ext].bmv_length;
> -			bmv->bmv_length =
> -				max_t(int64_t, 0, bmvend - bmv->bmv_offset);
> +			len = got.br_startoff + got.br_blockcount -
> +				(rec.br_startoff + rec.br_blockcount);
>  
> -			/*
> -			 * In case we don't want to return the hole,
> -			 * don't increase cur_ext so that we can reuse
> -			 * it in the next loop.
> -			 */
> -			if ((iflags & BMV_IF_NO_HOLES) &&
> -			    map[i].br_startblock == HOLESTARTBLOCK) {
> -				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
> -				continue;
> -			}
> +			rec.br_startoff += rec.br_blockcount;
> +			if (rec.br_startblock != DELAYSTARTBLOCK)
> +				rec.br_startblock += rec.br_blockcount;
> +			rec.br_blockcount = len;
> +		}
>  
> -			/*
> -			 * In order to report shared extents accurately,
> -			 * we report each distinct shared/unshared part
> -			 * of a single bmbt record using multiple bmap
> -			 * extents.  To make that happen, we iterate the
> -			 * same map array item multiple times, each
> -			 * time trimming out the subextent that we just
> -			 * reported.
> -			 *
> -			 * Because of this, we must check the out array
> -			 * index (cur_ext) directly against bmv_count-1
> -			 * to avoid overflows.
> -			 */
> -			if (inject_map.br_startblock != NULLFSBLOCK) {
> -				map[i] = inject_map;
> -				i--;
> +		bno = got.br_startoff + got.br_blockcount;
> +		nr_entries++;
> +
> +		if (!xfs_iext_get_extent(ifp, ++idx, &got)) {
> +			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
> +
> +			p->bmv_oflags |= BMV_OF_LAST;

Isn't BMV_OF_LAST supposed to be set only on the last extent of the
dataset returned?  If I ask for the mappings for blocks 100-200 and
there's a hole from 190-200, shouldn't OF_LAST be set on the 190-200
extent?

--D

> +
> +			if (whichfork != XFS_ATTR_FORK && bno < end &&
> +			    !xfs_getbmap_full(bmv, nr_entries)) {
> +				xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> +						end, ++p);
> +				nr_entries++;
>  			}
> -			bmv->bmv_entries++;
> -			cur_ext++;
> +			break;
>  		}
> -	} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
>  
> - out_free_map:
> -	kmem_free(map);
> - out_unlock_ilock:
> +		if (bno >= first_bno + len)
> +			break;
> +	}
> +
> +out_unlock_ilock:
>  	xfs_iunlock(ip, lock);
> - out_unlock_iolock:
> +out_unlock_iolock:
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
> -	for (i = 0; i < cur_ext; i++) {
> +	for (i = 0; i < nr_entries; i++) {
>  		/* format results & advance arg */
>  		error = formatter(&arg, &out[i]);
>  		if (error)
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

next prev parent reply	other threads:[~2017-08-28 21:20 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-08-28 15:06 [PATCH] xfs: rewrite getbmap using the xfs_iext_* helpers Christoph Hellwig
2017-08-28 18:31 ` Darrick J. Wong
2017-08-28 19:35   ` Christoph Hellwig
2017-08-28 21:01     ` Darrick J. Wong
2017-08-29 14:41       ` Christoph Hellwig
2017-08-28 21:20 ` Darrick J. Wong [this message]
2017-08-29 14:38   ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170828212024.GI4757@magnolia \
    --to=darrick.wong@oracle.com \
    --cc=hch@lst.de \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).