All of lore.kernel.org
 help / color / mirror / Atom feed
From: Brian Foster <bfoster@redhat.com>
To: Dave Chinner <david@fromorbit.com>
Cc: xfs@oss.sgi.com
Subject: Re: [PATCH 4/8] xfs: add DAX file operations support
Date: Tue, 2 Jun 2015 12:02:19 -0400	[thread overview]
Message-ID: <20150602160217.GA14182@bfoster.bfoster> (raw)
In-Reply-To: <1432856755-7859-5-git-send-email-david@fromorbit.com>

On Fri, May 29, 2015 at 09:45:51AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Add the initial support for DAX file operations to XFS. This
> includes the necessary block allocation and mmap page fault hooks
> for DAX to function.
> 
> Note that there are changes to the splice interfaces to ensure that
> for DAX splice avoids direct page cache manipulations and instead
> takes the DAX IO paths for read/write operations.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_aops.c | 116 ++++++++++++++++++++++++++++++++++++++---------------
>  fs/xfs/xfs_aops.h |   7 +++-
>  fs/xfs/xfs_file.c | 118 +++++++++++++++++++++++++++++++-----------------------
>  3 files changed, 158 insertions(+), 83 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index a56960d..1d195e8 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
...
> @@ -1607,6 +1588,75 @@ out_end_io:
>  	return;
>  }
>  
> +/*
> + * Complete a direct I/O write request.
> + *
> + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
> + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
> + * wholly within the EOF and so there is nothing for us to do. Note that in this
> + * case the completion can be called in interrupt context, whereas if we have an
> + * ioend we will always be called in task context (i.e. from a workqueue).
> + */
> +STATIC void
> +xfs_end_io_direct_write(
> +	struct kiocb		*iocb,
> +	loff_t			offset,
> +	ssize_t			size,
> +	void			*private)
> +{
> +	struct inode		*inode = file_inode(iocb->ki_filp);
> +	struct xfs_ioend	*ioend = private;
> +
> +	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
> +				     ioend ? ioend->io_type : 0, NULL);
> +
> +	if (!ioend) {
> +		ASSERT(offset + size <= i_size_read(inode));
> +		return;
> +	}
> +
> +	__xfs_end_io_direct_write(inode, ioend, offset, size);
> +}
> +
> +/*
> + * For DAX we need a mapping buffer callback for unwritten extent conversion
> + * when page faults allocate blocks and then zero them. Note that in this
> + * case the mapping indicated by the ioend may extend beyond EOF. We most
> + * definitely do not want to extend EOF here, so we trim back the ioend size to
> + * EOF.
> + */
> +#ifdef CONFIG_FS_DAX
> +void
> +xfs_end_io_dax_write(
> +	struct buffer_head	*bh,
> +	int			uptodate)
> +{
> +	struct xfs_ioend	*ioend = bh->b_private;
> +	struct inode		*inode = ioend->io_inode;
> +	ssize_t			size = ioend->io_size;
> +
> +	ASSERT(IS_DAX(ioend->io_inode));
> +

A trace_xfs_gbmap_dax_endio() might be nice here. Otherwise this looks
good to me:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> +	/* if there was an error zeroing, then don't convert it */
> +	if (!uptodate)
> +		ioend->io_error = -EIO;
> +
> +	/*
> +	 * Trim update to EOF, so we don't extend EOF during unwritten extent
> +	 * conversion of partial EOF blocks.
> +	 */
> +	spin_lock(&XFS_I(inode)->i_flags_lock);
> +	if (ioend->io_offset + size > i_size_read(inode))
> +		size = i_size_read(inode) - ioend->io_offset;
> +	spin_unlock(&XFS_I(inode)->i_flags_lock);
> +
> +	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
> +
> +}
> +#else
> +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
> +#endif
> +
>  STATIC ssize_t
>  xfs_vm_direct_IO(
>  	struct kiocb		*iocb,
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index ac644e0..86afd1a 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -53,7 +53,12 @@ typedef struct xfs_ioend {
>  } xfs_ioend_t;
>  
>  extern const struct address_space_operations xfs_address_space_operations;
> -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
> +
> +int	xfs_get_blocks(struct inode *inode, sector_t offset,
> +		       struct buffer_head *map_bh, int create);
> +int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
> +			      struct buffer_head *map_bh, int create);
> +void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
>  
>  extern void xfs_count_page_state(struct page *, int *, int *);
>  
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 4a0af89..c2af282 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -284,7 +284,7 @@ xfs_file_read_iter(
>  	if (file->f_mode & FMODE_NOCMTIME)
>  		ioflags |= XFS_IO_INVIS;
>  
> -	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
> +	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
>  		xfs_buftarg_t	*target =
>  			XFS_IS_REALTIME_INODE(ip) ?
>  				mp->m_rtdev_targp : mp->m_ddev_targp;
> @@ -378,7 +378,11 @@ xfs_file_splice_read(
>  
>  	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
>  
> -	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
> +	/* for dax, we need to avoid the page cache */
> +	if (IS_DAX(VFS_I(ip)))
> +		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
> +	else
> +		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
>  	if (ret > 0)
>  		XFS_STATS_ADD(xs_read_bytes, ret);
>  
> @@ -672,7 +676,7 @@ xfs_file_dio_aio_write(
>  					mp->m_rtdev_targp : mp->m_ddev_targp;
>  
>  	/* DIO must be aligned to device logical sector size */
> -	if ((pos | count) & target->bt_logical_sectormask)
> +	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
>  		return -EINVAL;
>  
>  	/* "unaligned" here means not aligned to a filesystem block */
> @@ -758,8 +762,11 @@ xfs_file_dio_aio_write(
>  out:
>  	xfs_rw_iunlock(ip, iolock);
>  
> -	/* No fallback to buffered IO on errors for XFS. */
> -	ASSERT(ret < 0 || ret == count);
> +	/*
> +	 * No fallback to buffered IO on errors for XFS. DAX can result in
> +	 * partial writes, but direct IO will either complete fully or fail.
> +	 */
> +	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
>  	return ret;
>  }
>  
> @@ -842,7 +849,7 @@ xfs_file_write_iter(
>  	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
>  		return -EIO;
>  
> -	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
> +	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
>  		ret = xfs_file_dio_aio_write(iocb, from);
>  	else
>  		ret = xfs_file_buffered_aio_write(iocb, from);
> @@ -1063,17 +1070,6 @@ xfs_file_readdir(
>  	return xfs_readdir(ip, ctx, bufsize);
>  }
>  
> -STATIC int
> -xfs_file_mmap(
> -	struct file	*filp,
> -	struct vm_area_struct *vma)
> -{
> -	vma->vm_ops = &xfs_file_vm_ops;
> -
> -	file_accessed(filp);
> -	return 0;
> -}
> -
>  /*
>   * This type is designed to indicate the type of offset we would like
>   * to search from page cache for xfs_seek_hole_data().
> @@ -1454,26 +1450,11 @@ xfs_file_llseek(
>   * ordering of:
>   *
>   * mmap_sem (MM)
> - *   i_mmap_lock (XFS - truncate serialisation)
> - *     page_lock (MM)
> - *       i_lock (XFS - extent map serialisation)
> + *   sb_start_pagefault(vfs, freeze)
> + *     i_mmap_lock (XFS - truncate serialisation)
> + *       page_lock (MM)
> + *         i_lock (XFS - extent map serialisation)
>   */
> -STATIC int
> -xfs_filemap_fault(
> -	struct vm_area_struct	*vma,
> -	struct vm_fault		*vmf)
> -{
> -	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
> -	int			error;
> -
> -	trace_xfs_filemap_fault(ip);
> -
> -	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
> -	error = filemap_fault(vma, vmf);
> -	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
> -
> -	return error;
> -}
>  
>  /*
>   * mmap()d file has taken write protection fault and is being made writable. We
> @@ -1486,21 +1467,66 @@ xfs_filemap_page_mkwrite(
>  	struct vm_area_struct	*vma,
>  	struct vm_fault		*vmf)
>  {
> -	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
> +	struct inode		*inode = file_inode(vma->vm_file);
>  	int			ret;
>  
> -	trace_xfs_filemap_page_mkwrite(ip);
> +	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
>  
> -	sb_start_pagefault(VFS_I(ip)->i_sb);
> +	sb_start_pagefault(inode->i_sb);
>  	file_update_time(vma->vm_file);
> -	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
> +	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +
> +	if (IS_DAX(inode)) {
> +		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
> +				    xfs_end_io_dax_write);
> +	} else {
> +		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
> +		ret = block_page_mkwrite_return(ret);
> +	}
> +
> +	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +	sb_end_pagefault(inode->i_sb);
> +
> +	return ret;
> +}
> +
> +STATIC int
> +xfs_filemap_fault(
> +	struct vm_area_struct	*vma,
> +	struct vm_fault		*vmf)
> +{
> +	struct xfs_inode	*ip = XFS_I(file_inode(vma->vm_file));
> +	int			ret;
>  
> -	ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
> +	trace_xfs_filemap_fault(ip);
> +
> +	/* DAX can shortcut the normal fault path on write faults! */
> +	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
> +		return xfs_filemap_page_mkwrite(vma, vmf);
>  
> +	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
> +	ret = filemap_fault(vma, vmf);
>  	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
> -	sb_end_pagefault(VFS_I(ip)->i_sb);
>  
> -	return block_page_mkwrite_return(ret);
> +	return ret;
> +}
> +
> +static const struct vm_operations_struct xfs_file_vm_ops = {
> +	.fault		= xfs_filemap_fault,
> +	.map_pages	= filemap_map_pages,
> +	.page_mkwrite	= xfs_filemap_page_mkwrite,
> +};
> +
> +STATIC int
> +xfs_file_mmap(
> +	struct file	*filp,
> +	struct vm_area_struct *vma)
> +{
> +	file_accessed(filp);
> +	vma->vm_ops = &xfs_file_vm_ops;
> +	if (IS_DAX(file_inode(filp)))
> +		vma->vm_flags |= VM_MIXEDMAP;
> +	return 0;
>  }
>  
>  const struct file_operations xfs_file_operations = {
> @@ -1531,9 +1557,3 @@ const struct file_operations xfs_dir_file_operations = {
>  #endif
>  	.fsync		= xfs_dir_fsync,
>  };
> -
> -static const struct vm_operations_struct xfs_file_vm_ops = {
> -	.fault		= xfs_filemap_fault,
> -	.map_pages	= filemap_map_pages,
> -	.page_mkwrite	= xfs_filemap_page_mkwrite,
> -};
> -- 
> 2.0.0
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

  reply	other threads:[~2015-06-02 16:02 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-05-28 23:45 [PATCH 0/8 v3] xfs: DAX support Dave Chinner
2015-05-28 23:45 ` [PATCH 1/8] xfs: mmap lock needs to be inside freeze protection Dave Chinner
2015-05-28 23:45 ` [PATCH 2/8] dax: don't abuse get_block mapping for endio callbacks Dave Chinner
2015-05-28 23:45 ` [PATCH 3/8] dax: expose __dax_fault for filesystems with locking constraints Dave Chinner
2015-05-28 23:45 ` [PATCH 4/8] xfs: add DAX file operations support Dave Chinner
2015-06-02 16:02   ` Brian Foster [this message]
2015-05-28 23:45 ` [PATCH 5/8] xfs: add DAX block zeroing support Dave Chinner
2015-06-02 16:02   ` Brian Foster
2015-06-02 20:12     ` [PATCH 5/8 v2] " Dave Chinner
2015-06-02 20:47       ` Brian Foster
2015-05-28 23:45 ` [PATCH 6/8] xfs: add DAX truncate support Dave Chinner
2015-05-28 23:45 ` [PATCH 7/8] xfs: add DAX IO path support Dave Chinner
2015-05-28 23:45 ` [PATCH 8/8] xfs: add initial DAX support Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150602160217.GA14182@bfoster.bfoster \
    --to=bfoster@redhat.com \
    --cc=david@fromorbit.com \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.