Re: [PATCH 3/3] repair: fix discontiguous directory block support

From: Brian Foster <bfoster@redhat.com>
To: Dave Chinner <david@fromorbit.com>, xfs@oss.sgi.com
Subject: Re: [PATCH 3/3] repair: fix discontiguous directory block support
Date: Thu, 23 Jan 2014 12:15:22 -0500	[thread overview]
Message-ID: <52E14E2A.7000902@redhat.com> (raw)
In-Reply-To: <1390375033-10483-4-git-send-email-david@fromorbit.com>

On 01/22/2014 02:17 AM, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> xfs/291 tests fragmented multi-block directories, and xfs_repair
> throws lots of lookup badness errors in phase 3:
> 
>         - agno = 1
> 7fa3d2758740: Badness in key lookup (length)
> bp=(bno 0x1e040, len 4096 bytes) key=(bno 0x1e040, len 16384 bytes)
>         - agno = 2
> 7fa3d2758740: Badness in key lookup (length)
> bp=(bno 0x2d0e8, len 4096 bytes) key=(bno 0x2d0e8, len 16384 bytes)
> 7fa3d2758740: Badness in key lookup (length)
> bp=(bno 0x2d068, len 4096 bytes) key=(bno 0x2d068, len 16384 bytes)
>         - agno = 3
> 7fa3d2758740: Badness in key lookup (length)
> bp=(bno 0x39130, len 4096 bytes) key=(bno 0x39130, len 16384 bytes)
> 7fa3d2758740: Badness in key lookup (length)
> bp=(bno 0x391b0, len 4096 bytes) key=(bno 0x391b0, len 16384 bytes)
> 7fa3d2758740: Badness in key lookup (length)
> 
> This is because it is trying to read a directory buffer in full
> (16k), but is finding a single 4k block in the cache instead.
> 
> The opposite is happening in phase 6 - phase 6 is trying to read 4k
> buffers but is finding a 16k buffer there instead.
> 
> The problem is caused by the fact that directory buffers can be
> represented as compound buffers or as individual buffers depending
> on the code reading the directory blocks. The main problem is that
> the IO prefetch code doesn't understand compound buffers, so teach
> it about compound buffers to make the problem go away.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---

I think I get the gist of what's going on here. One potential issue
noted below along with notes to self to confirm whether I follow the
code correctly.

>  repair/prefetch.c | 121 ++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 100 insertions(+), 21 deletions(-)
> 
> diff --git a/repair/prefetch.c b/repair/prefetch.c
> index d3491da..ab90b92 100644
> --- a/repair/prefetch.c
> +++ b/repair/prefetch.c
> @@ -105,11 +105,12 @@ pf_start_io_workers(
>  static void
>  pf_queue_io(
>  	prefetch_args_t		*args,
> -	xfs_fsblock_t		fsbno,
> -	int			blen,
> +	struct xfs_buf_map	*map,
> +	int			nmaps,
>  	int			flag)
>  {
> -	xfs_buf_t		*bp;
> +	struct xfs_buf		*bp;
> +	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);
>  
>  	/*
>  	 * Never block on a buffer lock here, given that the actual repair
> @@ -117,8 +118,7 @@ pf_queue_io(
>  	 * the lock holder is either reading it from disk himself or
>  	 * completely overwriting it this behaviour is perfectly fine.
>  	 */
> -	bp = libxfs_getbuf_flags(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
> -			XFS_FSB_TO_BB(mp, blen), LIBXFS_GETBUF_TRYLOCK);
> +	bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
>  	if (!bp)
>  		return;
>  

Interface change to support queuing a discontig buffer.

> @@ -167,6 +167,14 @@ pf_read_bmbt_reclist(
>  	xfs_bmbt_irec_t		irec;
>  	xfs_dfilblks_t		cp = 0;		/* prev count */
>  	xfs_dfiloff_t		op = 0;		/* prev offset */
> +#define MAP_ARRAY_SZ 4
> +	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
> +	struct xfs_buf_map	*map = map_array;
> +	int			max_extents = MAP_ARRAY_SZ;
> +	int			nmaps = 0;;
> +	unsigned int		len = 0;
> +	int			ret = 0;
> +

So if I understand correctly, the idea here is to now batch extent reads
into buffers of the directory block size, quieting the messages
described in the commit log.

>  
>  	for (i = 0; i < numrecs; i++) {
>  		libxfs_bmbt_disk_get_all(rp + i, &irec);
> @@ -174,11 +182,11 @@ pf_read_bmbt_reclist(
>  		if (((i > 0) && (op + cp > irec.br_startoff)) ||
>  				(irec.br_blockcount == 0) ||
>  				(irec.br_startoff >= fs_max_file_offset))
> -			return 0;
> +			goto out_free;
>  
>  		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
>  				irec.br_startblock + irec.br_blockcount - 1))
> -			return 0;
> +			goto out_free;
>  
>  		if (!args->dirs_only && ((irec.br_startoff +
>  				irec.br_blockcount) >= mp->m_dirfreeblk))
> @@ -188,18 +196,60 @@ pf_read_bmbt_reclist(
>  		cp = irec.br_blockcount;
>  
>  		while (irec.br_blockcount) {
> -			unsigned int	len;
> +			unsigned int	bm_len;
>  
>  			pftrace("queuing dir extent in AG %d", args->agno);
>  
> -			len = (irec.br_blockcount > mp->m_dirblkfsbs) ?
> -					mp->m_dirblkfsbs : irec.br_blockcount;
> -			pf_queue_io(args, irec.br_startblock, len, B_DIR_META);
> -			irec.br_blockcount -= len;
> -			irec.br_startblock += len;
> +			if (len + irec.br_blockcount >= mp->m_dirblkfsbs) {
> +				bm_len = mp->m_dirblkfsbs - len;
> +				len = 0;
> +			} else {
> +				len += irec.br_blockcount;
> +				bm_len = irec.br_blockcount;
> +			}

So len represents the total length of the maps attached to the current
array...

> +
> +			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
> +							irec.br_startblock);
> +			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
> +			nmaps++;
> +
> +			if (len == 0) {
> +				pf_queue_io(args, map, nmaps, B_DIR_META);
> +				nmaps = 0;
> +			}

Kind of a nit, but this looks a little weird to me. The logic would be a
bit more clear with something like:

	if (len + irec.br_blockcount > mp->dirblkfsbs)
		bm_len = mp->m_dirblkfsbs - len;
	else
		bm_len = irec.br_blockcount;
	len += bm_len;

	...

	if (len == mp->dirblkfsbs) {
		len = 0;
		pf_queue_io(...)
	}

... which then raises the question of what happens if the directory
we're reading doesn't end with len == mp->dirblkfsbs? If so, perhaps not
a performance regression, but it looks like we wouldn't queue the last
I/O. Some of the directory code suggests that we fail if we don't alloc
the dirblkfsbs block count, so maybe this doesn't happen.

> +
> +			irec.br_blockcount -= bm_len;
> +			irec.br_startblock += bm_len;
> +
> +			/*
> +			 * Handle very fragmented dir2 blocks with dynamically
> +			 * allocated buffer maps.
> +			 */
> +			if (nmaps >= max_extents) {
> +				struct xfs_buf_map *old_map = NULL;
> +
> +				if (map == map_array) {
> +					old_map = map;
> +					map = NULL;
> +				}
> +				max_extents *= 2;
> +				map = realloc(map, max_extents * sizeof(*map));
> +				if (map == NULL) {
> +					do_error(
> +			_("couldn't malloc dir2 buffer list\n"));
> +					exit(1);
> +				}
> +				if (old_map)
> +					memcpy(map, old_map, sizeof(map_array));
> +			}
> +
>  		}
>  	}
> -	return 1;
> +	ret = 1;
> +out_free:
> +	if (map != map_array)
> +		free(map);
> +	return ret;
>  }
>  
>  /*
> @@ -395,9 +445,28 @@ pf_read_inode_dirs(
>  }
>  
>  /*
> - * pf_batch_read must be called with the lock locked.
> + * Discontiguous buffers require multiple IOs to fill, so we can't use any
> + * linearising, hole filling algorithms on them to avoid seeks. Just remove them
> + * for the prefetch queue and read them straight into the cache and release
> + * them.
>   */
> +static void
> +pf_read_discontig(
> +	struct prefetch_args	*args,
> +	struct xfs_buf		*bp)
> +{
> +	if (!btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp, bp->b_bn)))
> +		do_error(_("prefetch corruption\n"));
> +
> +	pthread_mutex_unlock(&args->lock);
> +	libxfs_readbufr_map(mp->m_ddev_targp, bp, 0);
> +	libxfs_putbuf(bp);
> +	pthread_mutex_lock(&args->lock);
> +}
>  
> +/*
> + * pf_batch_read must be called with the lock locked.
> + */
>  static void
>  pf_batch_read(
>  	prefetch_args_t		*args,
> @@ -426,8 +495,15 @@ pf_batch_read(
>  			max_fsbno = fsbno + pf_max_fsbs;
>  		}
>  		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
> -			if (which != PF_META_ONLY ||
> -			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
> +			/*
> +			 * Handle discontiguous buffers outside the seek
> +			 * optimised IO loop below.
> +			 */
> +			if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
> +				pf_read_discontig(args, bplist[num]);
> +				bplist[num] = NULL;

So we pull these out from the processing below (which appears to want to
issue largish reads comprised of multiple buffers, via bplist). Thanks
for the comment above pf_read_discontig().

> +			} else if (which != PF_META_ONLY ||
> +				   !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
>  				num++;
>  			if (num == MAX_BUFS)
>  				break;
> @@ -664,10 +740,13 @@ pf_queuing_worker(
>  		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
>  
>  		do {
> -			pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno),
> -					blks_per_cluster,
> -					(cur_irec->ino_isa_dir != 0) ?
> -						B_DIR_INODE : B_INODE);
> +			struct xfs_buf_map	map;
> +
> +			map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
> +			map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
> +			pf_queue_io(args, &map, 1,
> +				    (cur_irec->ino_isa_dir != 0) ?  B_DIR_INODE
> +		

Straightforward change based on the new pf_queue_io().

Brian

						 : B_INODE);
>  			bno += blks_per_cluster;
>  			num_inos += inodes_per_cluster;
>  		} while (num_inos < XFS_IALLOC_INODES(mp));
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs