All of lore.kernel.org
 help / color / mirror / Atom feed
From: Brian Foster <bfoster@redhat.com>
To: Christoph Hellwig <hch@lst.de>
Cc: xfs@oss.sgi.com
Subject: Re: [PATCH] xfs: don't allocate an ioend for direct I/O completions
Date: Fri, 30 Jan 2015 09:42:23 -0500	[thread overview]
Message-ID: <20150130144223.GA27441@laptop.bfoster> (raw)
In-Reply-To: <1422485661-520-1-git-send-email-hch@lst.de>

On Wed, Jan 28, 2015 at 11:54:21PM +0100, Christoph Hellwig wrote:
> Back in the days when the direct I/O ->end_io callback could be called
> from interrupt context for AIO we needed a structure to hand off to the
> workqueue, and reused the ioend structure for this purpose.  These days
> ->end_io is always called from user or workqueue context, which allows us
> to avoid this memory allocation and simplify the code significantly.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Looks mostly Ok to me. In fact, with xfs_finish_ioend_sync() calling
xfs_end_io() directly, I don't see how we currently get into the wq at
all. Anyways, a few notes...

>  fs/xfs/xfs_aops.c | 133 ++++++++++++++++++++++++------------------------------
>  fs/xfs/xfs_aops.h |   3 --
>  2 files changed, 59 insertions(+), 77 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 18e2f3b..31d3d7d 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
>   */
>  STATIC int
>  xfs_setfilesize(
> -	struct xfs_ioend	*ioend)
> +	struct xfs_inode	*ip,
> +	struct xfs_trans	*tp,
> +	xfs_off_t		offset,
> +	size_t			size)
>  {
> -	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> -	struct xfs_trans	*tp = ioend->io_append_trans;
>  	xfs_fsize_t		isize;
>  
> -	/*
> -	 * The transaction may have been allocated in the I/O submission thread,
> -	 * thus we need to mark ourselves as beeing in a transaction manually.
> -	 * Similarly for freeze protection.
> -	 */
> -	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> -	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
> -			   0, 1, _THIS_IP_);
> -
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
> -	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
> +	isize = xfs_new_eof(ip, offset + size);
>  	if (!isize) {
>  		xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  		xfs_trans_cancel(tp, 0);
>  		return 0;
>  	}
>  
> -	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
> +	trace_xfs_setfilesize(ip, offset, size);
>  
>  	ip->i_d.di_size = isize;
>  	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> @@ -167,6 +159,25 @@ xfs_setfilesize(
>  	return xfs_trans_commit(tp, 0);
>  }
>  
> +STATIC int
> +xfs_setfilesize_ioend(
> +	struct xfs_ioend	*ioend)
> +{
> +	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> +	struct xfs_trans	*tp = ioend->io_append_trans;
> +
> +	/*
> +	 * The transaction may have been allocated in the I/O submission thread,
> +	 * thus we need to mark ourselves as beeing in a transaction manually.

Copied from the the original, but since we're here: "being."

> +	 * Similarly for freeze protection.
> +	 */
> +	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> +	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
> +			   0, 1, _THIS_IP_);
> +
> +	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
> +}
> +
>  /*
>   * Schedule IO completion handling on the final put of an ioend.
>   *
> @@ -182,8 +193,7 @@ xfs_finish_ioend(
>  
>  		if (ioend->io_type == XFS_IO_UNWRITTEN)
>  			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
> -		else if (ioend->io_append_trans ||
> -			 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
> +		else if (ioend->io_append_trans)
>  			queue_work(mp->m_data_workqueue, &ioend->io_work);
>  		else
>  			xfs_destroy_ioend(ioend);
> @@ -215,22 +225,8 @@ xfs_end_io(
>  	if (ioend->io_type == XFS_IO_UNWRITTEN) {
>  		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
>  						  ioend->io_size);
> -	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
> -		/*
> -		 * For direct I/O we do not know if we need to allocate blocks
> -		 * or not so we can't preallocate an append transaction as that
> -		 * results in nested reservations and log space deadlocks. Hence
> -		 * allocate the transaction here. While this is sub-optimal and
> -		 * can block IO completion for some time, we're stuck with doing
> -		 * it this way until we can pass the ioend to the direct IO
> -		 * allocation callbacks and avoid nesting that way.
> -		 */
> -		error = xfs_setfilesize_trans_alloc(ioend);
> -		if (error)
> -			goto done;
> -		error = xfs_setfilesize(ioend);
>  	} else if (ioend->io_append_trans) {
> -		error = xfs_setfilesize(ioend);
> +		error = xfs_setfilesize_ioend(ioend);
>  	} else {
>  		ASSERT(!xfs_ioend_is_append(ioend));
>  	}
> @@ -273,7 +269,6 @@ xfs_alloc_ioend(
>  	 * all the I/O from calling the completion routine too early.
>  	 */
>  	atomic_set(&ioend->io_remaining, 1);
> -	ioend->io_isdirect = 0;
>  	ioend->io_error = 0;
>  	ioend->io_list = NULL;
>  	ioend->io_type = type;
> @@ -1459,11 +1454,7 @@ xfs_get_blocks_direct(
>   *
>   * If the private argument is non-NULL __xfs_get_blocks signals us that we
>   * need to issue a transaction to convert the range from unwritten to written
> - * extents.  In case this is regular synchronous I/O we just call xfs_end_io
> - * to do this and we are done.  But in case this was a successful AIO
> - * request this handler is called from interrupt context, from which we
> - * can't start transactions.  In that case offload the I/O completion to
> - * the workqueues we also use for buffered I/O completion.
> + * extents.
>   */
>  STATIC void
>  xfs_end_io_direct_write(
> @@ -1472,7 +1463,12 @@ xfs_end_io_direct_write(
>  	ssize_t			size,
>  	void			*private)
>  {
> -	struct xfs_ioend	*ioend = iocb->private;
> +	struct inode		*inode = file_inode(iocb->ki_filp);
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return;
>  
>  	/*
>  	 * While the generic direct I/O code updates the inode size, it does
> @@ -1480,22 +1476,33 @@ xfs_end_io_direct_write(
>  	 * end_io handler thinks the on-disk size is outside the in-core
>  	 * size.  To prevent this just update it a little bit earlier here.
>  	 */
> -	if (offset + size > i_size_read(ioend->io_inode))
> -		i_size_write(ioend->io_inode, offset + size);
> +	if (offset + size > i_size_read(inode))
> +		i_size_write(inode, offset + size);
>  
>  	/*
> -	 * blockdev_direct_IO can return an error even after the I/O
> -	 * completion handler was called.  Thus we need to protect
> -	 * against double-freeing.
> +	 * For direct I/O we do not know if we need to allocate blocks or not,
> +	 * so we can't preallocate an append transaction, as that results in
> +	 * nested reservations and log space deadlocks. Hence allocate the
> +	 * transaction here. While this is sub-optimal and can block IO
> +	 * completion for some time, we're stuck with doing it this way until
> +	 * we can pass the ioend to the direct IO allocation callbacks and
> +	 * avoid nesting that way.
>  	 */
> -	iocb->private = NULL;
> -
> -	ioend->io_offset = offset;
> -	ioend->io_size = size;
> -	if (private && size > 0)
> -		ioend->io_type = XFS_IO_UNWRITTEN;
> +	if (private && size > 0) {
> +		xfs_iomap_write_unwritten(ip, offset, size);
> +	} else if (offset + size > ip->i_d.di_size) {
> +		struct xfs_trans	*tp;
> +		int			error;
> +
> +		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
> +		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
> +		if (error) {
> +			xfs_trans_cancel(tp, 0);
> +			return;
> +		}
>  
> -	xfs_finish_ioend_sync(ioend);

I get an unused function warning for xfs_finish_ioend_sync() now, so I
guess we should kill that.

> +		xfs_setfilesize(ip, tp, offset, size);
> +	}
>  }
>  
>  STATIC ssize_t
> @@ -1507,39 +1514,17 @@ xfs_vm_direct_IO(
>  {
>  	struct inode		*inode = iocb->ki_filp->f_mapping->host;
>  	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
> -	struct xfs_ioend	*ioend = NULL;
> -	ssize_t			ret;
>  
>  	if (rw & WRITE) {

A nit, but I guess you could kill the braces here now too.

Brian

> -		size_t size = iov_iter_count(iter);
> -
> -		/*
> -		 * We cannot preallocate a size update transaction here as we
> -		 * don't know whether allocation is necessary or not. Hence we
> -		 * can only tell IO completion that one is necessary if we are
> -		 * not doing unwritten extent conversion.
> -		 */
> -		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
> -		if (offset + size > XFS_I(inode)->i_d.di_size)
> -			ioend->io_isdirect = 1;
> -
> -		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
> +		return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
>  					    offset, xfs_get_blocks_direct,
>  					    xfs_end_io_direct_write, NULL,
>  					    DIO_ASYNC_EXTEND);
> -		if (ret != -EIOCBQUEUED && iocb->private)
> -			goto out_destroy_ioend;
>  	} else {
> -		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
> +		return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
>  					    offset, xfs_get_blocks_direct,
>  					    NULL, NULL, 0);
>  	}
> -
> -	return ret;
> -
> -out_destroy_ioend:
> -	xfs_destroy_ioend(ioend);
> -	return ret;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index f94dd45..ac644e0 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
>   * Types of I/O for bmap clustering and I/O completion tracking.
>   */
>  enum {
> -	XFS_IO_DIRECT = 0,	/* special case for direct I/O ioends */
>  	XFS_IO_DELALLOC,	/* covers delalloc region */
>  	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
>  	XFS_IO_OVERWRITE,	/* covers already allocated extent */
>  };
>  
>  #define XFS_IO_TYPES \
> -	{ 0,			"" }, \
>  	{ XFS_IO_DELALLOC,		"delalloc" }, \
>  	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
>  	{ XFS_IO_OVERWRITE,		"overwrite" }
> @@ -45,7 +43,6 @@ typedef struct xfs_ioend {
>  	unsigned int		io_type;	/* delalloc / unwritten */
>  	int			io_error;	/* I/O error code */
>  	atomic_t		io_remaining;	/* hold count */
> -	unsigned int		io_isdirect : 1;/* direct I/O */
>  	struct inode		*io_inode;	/* file being written to */
>  	struct buffer_head	*io_buffer_head;/* buffer linked list head */
>  	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
> -- 
> 1.9.1
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

  reply	other threads:[~2015-01-30 14:42 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-28 22:54 [PATCH] xfs: don't allocate an ioend for direct I/O completions Christoph Hellwig
2015-01-30 14:42 ` Brian Foster [this message]
2015-02-01 23:04   ` Dave Chinner
2015-02-02  7:27     ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150130144223.GA27441@laptop.bfoster \
    --to=bfoster@redhat.com \
    --cc=hch@lst.de \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.