linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: Dave Chinner <david@fromorbit.com>
Cc: xfs@oss.sgi.com, linux-fsdevel@vger.kernel.org, jack@suse.cz,
	willy@linux.intel.com
Subject: Re: [PATCH 1/6] dax: don't abuse get_block mapping for endio callbacks
Date: Wed, 4 Mar 2015 16:54:08 +0100	[thread overview]
Message-ID: <20150304155408.GA2799@quack.suse.cz> (raw)
In-Reply-To: <1425425427-16283-2-git-send-email-david@fromorbit.com>

On Wed 04-03-15 10:30:22, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> dax_fault() currently relies on the get_block callback to attach an
> io completion callback to the mapping buffer head so that it can
> run unwritten extent conversion after zeroing allocated blocks.
> 
> Instead of this hack, pass the conversion callback directly into
> dax_fault() similar to the get_block callback. When the filesystem
> allocates unwritten extents, it will set the buffer_unwritten()
> flag, and hence the dax_fault code can call the completion function
> in the contexts where it is necessary without overloading the
> mapping buffer head.
> 
> Note: The changes to ext4 to use this interface are suspect at best.
> In fact, the way ext4 did this end_io assignment in the first place
> looks suspect because it only set a completion callback when there
> wasn't already some other write() call taking place on the same
> inode. The ext4 end_io code looks rather intricate and fragile with
> all its reference counting and passing to different contexts for
> modification via inode private pointers that aren't protected by
> locks...
  Yeah, ext4 is currently broken in that regard, so as long as we don't make
things worse, I'm OK.

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/dax.c           | 15 ++++++++-------
>  fs/ext2/file.c     |  4 ++--
>  fs/ext4/file.c     | 16 ++++++++++++++--
>  fs/ext4/inode.c    | 21 +++++++--------------
>  include/linux/fs.h |  6 ++++--
>  5 files changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index ed1619e..d7b4dba 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -269,7 +269,8 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
>  }
>  
>  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
> -			struct vm_area_struct *vma, struct vm_fault *vmf)
> +			struct vm_area_struct *vma, struct vm_fault *vmf,
> +			dax_iodone_t complete_unwritten)
>  {
>  	struct address_space *mapping = inode->i_mapping;
>  	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
> @@ -310,14 +311,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>   out:
>  	i_mmap_unlock_read(mapping);
>  
> -	if (bh->b_end_io)
> -		bh->b_end_io(bh, 1);
> +	if (buffer_unwritten(bh))
> +		complete_unwritten(bh, 1);
>  
>  	return error;
>  }
  So frankly I don't see much point in passing the completion callback into
dax_insert_mapping() only to call it at the end of that function. We could
just as well call the completion function from do_dax_fault(), where it would
seem more natural to me. But I don't feel too strongly about this.

Instead of the above, I was also thinking about some way to pass information
out of do_dax_fault() to the filesystem so that it could just call the
completion handler itself, but the completion callback is a more standard
interface, I guess.

								Honza

>  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -			get_block_t get_block)
> +			get_block_t get_block, dax_iodone_t complete_unwritten)
>  {
>  	struct file *file = vma->vm_file;
>  	struct address_space *mapping = file->f_mapping;
> @@ -418,7 +419,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		page_cache_release(page);
>  	}
>  
> -	error = dax_insert_mapping(inode, &bh, vma, vmf);
> +	error = dax_insert_mapping(inode, &bh, vma, vmf, complete_unwritten);
>  
>   out:
>  	if (error == -ENOMEM)
> @@ -446,7 +447,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>   * fault handler for DAX files.
>   */
>  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -			get_block_t get_block)
> +	      get_block_t get_block, dax_iodone_t complete_unwritten)
>  {
>  	int result;
>  	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> @@ -455,7 +456,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		sb_start_pagefault(sb);
>  		file_update_time(vma->vm_file);
>  	}
> -	result = do_dax_fault(vma, vmf, get_block);
> +	result = do_dax_fault(vma, vmf, get_block, complete_unwritten);
>  	if (vmf->flags & FAULT_FLAG_WRITE)
>  		sb_end_pagefault(sb);
>  
> diff --git a/fs/ext2/file.c b/fs/ext2/file.c
> index e317017..8da747a 100644
> --- a/fs/ext2/file.c
> +++ b/fs/ext2/file.c
> @@ -28,12 +28,12 @@
>  #ifdef CONFIG_FS_DAX
>  static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_fault(vma, vmf, ext2_get_block);
> +	return dax_fault(vma, vmf, ext2_get_block, NULL);
>  }
>  
>  static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_mkwrite(vma, vmf, ext2_get_block);
> +	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
>  }
>  
>  static const struct vm_operations_struct ext2_dax_vm_ops = {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 33a09da..f7dabb1 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -192,15 +192,27 @@ errout:
>  }
>  
>  #ifdef CONFIG_FS_DAX
> +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> +{
> +	struct inode *inode = bh->b_assoc_map->host;
> +	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> +	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> +	int err;
> +	if (!uptodate)
> +		return;
> +	WARN_ON(!buffer_unwritten(bh));
> +	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> +}
> +
>  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_fault(vma, vmf, ext4_get_block);
> +	return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
>  					/* Is this the right get_block? */
>  }
>  
>  static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_mkwrite(vma, vmf, ext4_get_block);
> +	return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
>  }
>  
>  static const struct vm_operations_struct ext4_dax_vm_ops = {
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5cb9a21..43433de 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -657,18 +657,6 @@ has_zeroout:
>  	return retval;
>  }
>  
> -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> -{
> -	struct inode *inode = bh->b_assoc_map->host;
> -	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> -	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> -	int err;
> -	if (!uptodate)
> -		return;
> -	WARN_ON(!buffer_unwritten(bh));
> -	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> -}
> -
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -706,10 +694,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> -		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> +		if (IS_DAX(inode) && buffer_unwritten(bh)) {
> +			/*
> +			 * dgc: I suspect unwritten conversion on ext4+DAX is
> +			 * fundamentally broken here when there are concurrent
> +			 * read/write in progress on this inode.
> +			 */
> +			WARN_ON_ONCE(io_end);
>  			bh->b_assoc_map = inode->i_mapping;
>  			bh->b_private = (void *)(unsigned long)iblock;
> -			bh->b_end_io = ext4_end_io_unwritten;
>  		}
>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 937e280..82100ae 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
>  			struct buffer_head *bh_result, int create);
>  typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>  			ssize_t bytes, void *private);
> +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
>  
>  #define MAY_EXEC		0x00000001
>  #define MAY_WRITE		0x00000002
> @@ -2603,8 +2604,9 @@ ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
>  int dax_clear_blocks(struct inode *, sector_t block, long size);
>  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
>  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
> -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
> -#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
> +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
> +		dax_iodone_t);
> +#define dax_mkwrite(vma, vmf, gb, iod)	dax_fault(vma, vmf, gb, iod)
>  
>  #ifdef CONFIG_BLOCK
>  typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
> -- 
> 2.0.0
> 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

  reply	other threads:[~2015-03-04 15:54 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-03-03 23:30 [RFC PATCH 0/6] xfs: DAX support Dave Chinner
2015-03-03 23:30 ` [PATCH 1/6] dax: don't abuse get_block mapping for endio callbacks Dave Chinner
2015-03-04 15:54   ` Jan Kara [this message]
2015-03-04 22:29     ` Dave Chinner
2015-03-03 23:30 ` [PATCH 2/6] xfs: add DAX block zeroing support Dave Chinner
2015-03-03 23:30 ` [PATCH 3/6] xfs: add DAX file operations support Dave Chinner
2015-03-04 10:09   ` Boaz Harrosh
2015-03-04 13:01     ` Dave Chinner
2015-03-04 14:54       ` Boaz Harrosh
2015-03-04 22:03         ` Dave Chinner
2015-03-24  4:27           ` Dave Chinner
2015-03-24  7:01             ` Christoph Hellwig
2015-03-24  8:13             ` Boaz Harrosh
2015-03-04 16:18   ` Jan Kara
2015-03-04 22:00     ` Dave Chinner
2015-03-05 11:05       ` Jan Kara
2015-03-22 23:02         ` Dave Chinner
2015-03-03 23:30 ` [PATCH 4/6] xfs: add DAX truncate support Dave Chinner
2015-03-03 23:30 ` [PATCH 5/6] xfs: add DAX IO path support Dave Chinner
2015-03-03 23:30 ` [PATCH 6/6] xfs: add initial DAX support Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150304155408.GA2799@quack.suse.cz \
    --to=jack@suse.cz \
    --cc=david@fromorbit.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=willy@linux.intel.com \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).