linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	willy@linux.intel.com
Subject: Re: [PATCH v7 20/22] ext4: Add DAX functionality
Date: Wed, 9 Apr 2014 14:17:17 +0200	[thread overview]
Message-ID: <20140409121717.GN32103@quack.suse.cz> (raw)
In-Reply-To: <490bf3041f0e0633964ca84bf4fb0bb3dd999694.1395591795.git.matthew.r.wilcox@intel.com>

On Sun 23-03-14 15:08:46, Matthew Wilcox wrote:
> From: Ross Zwisler <ross.zwisler@linux.intel.com>
> 
> This is a port of the DAX functionality found in the current version of
> ext2.
> 
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
> [heavily tweaked]
> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
> ---
  I have some comments below.

> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 1a50739..42a8ccd 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -190,7 +190,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
>  		}
>  	}
>  
> -	if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
> +	if (io_is_direct(iocb->ki_filp))
>  		ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
>  	else
>  		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> @@ -198,6 +198,27 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_FS_DAX
> +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_fault(vma, vmf, ext4_get_block);
> +					/* Is this the right get_block? */
  Yes, it is the right one.

> +}
> +
> +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_mkwrite(vma, vmf, ext4_get_block);
> +}
  Umm, I'm afraid it won't be this easy here. So you rely on
ext4_get_block() to start a transaction for you and do the block
allocation. However if the system crashes after ext4_get_block() has
allocated the block and finished the transaction but before dax_mkwrite()
had a chance to zero out the page, the filesystem will be referencing a block
with uninitialized data when the system boots again (this is a security
issue for multiuser systems). What you need to do is to start a transaction
here in ext4_dax_mkwrite(), call dax_mkwrite() (ext4_get_block() will
notice the transaction is already started and won't start it again, so you
don't have to care about that), and stop the transaction after
dax_mkwrite() returns. Except it's not so easy because
sb_start_pagefault() locking ranks above transaction start so ext4 will
really need to call into something like do_dax_fault() - I'd suggest we
create dax_mkwrite() and __dax_mkwrite(), similar to the way fs/buffer.c
provides block_page_mkwrite() and __block_page_mkwrite().

> +
> +static const struct vm_operations_struct ext4_dax_vm_ops = {
> +	.fault		= ext4_dax_fault,
> +	.page_mkwrite	= ext4_dax_mkwrite,
> +	.remap_pages	= generic_file_remap_pages,
> +};
> +#else
> +#define ext4_dax_vm_ops	ext4_file_vm_ops
> +#endif
> +
>  static const struct vm_operations_struct ext4_file_vm_ops = {
>  	.fault		= filemap_fault,
>  	.page_mkwrite   = ext4_page_mkwrite,
> @@ -206,12 +227,13 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
>  
>  static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> -	struct address_space *mapping = file->f_mapping;
> -
> -	if (!mapping->a_ops->readpage)
> -		return -ENOEXEC;
>  	file_accessed(file);
> -	vma->vm_ops = &ext4_file_vm_ops;
> +	if (IS_DAX(file_inode(file))) {
> +		vma->vm_ops = &ext4_dax_vm_ops;
> +		vma->vm_flags |= VM_MIXEDMAP;
> +	} else {
> +		vma->vm_ops = &ext4_file_vm_ops;
> +	}
>  	return 0;
>  }
>  
> @@ -609,6 +631,25 @@ const struct file_operations ext4_file_operations = {
>  	.fallocate	= ext4_fallocate,
>  };
>  
> +#ifdef CONFIG_FS_DAX
> +const struct file_operations ext4_dax_file_operations = {
> +	.llseek		= ext4_llseek,
> +	.read		= do_sync_read,
> +	.write		= do_sync_write,
> +	.aio_read	= generic_file_aio_read,
> +	.aio_write	= ext4_file_write,
> +	.unlocked_ioctl = ext4_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= ext4_compat_ioctl,
> +#endif
> +	.mmap		= ext4_file_mmap,
> +	.open		= ext4_file_open,
> +	.release	= ext4_release_file,
> +	.fsync		= ext4_sync_file,
> +	.fallocate	= ext4_fallocate,
> +};
> +#endif
> +
>  const struct inode_operations ext4_file_inode_operations = {
>  	.setattr	= ext4_setattr,
>  	.getattr	= ext4_getattr,
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 594009f..5fdb414 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -686,15 +686,22 @@ retry:
>  			inode_dio_done(inode);
>  			goto locked;
>  		}
> -		ret = __blockdev_direct_IO(rw, iocb, inode,
> -				 inode->i_sb->s_bdev, iov,
> -				 offset, nr_segs,
> -				 ext4_get_block, NULL, NULL, 0);
> +		if (IS_DAX(inode))
> +			ret = dax_do_io(rw, iocb, inode, iov, offset, nr_segs,
> +					ext4_get_block, NULL, 0);
> +		else
> +			ret = __blockdev_direct_IO(rw, iocb, inode,
> +					inode->i_sb->s_bdev, iov, offset,
> +					nr_segs, ext4_get_block, NULL, NULL, 0);
>  		inode_dio_done(inode);
>  	} else {
>  locked:
> -		ret = blockdev_direct_IO(rw, iocb, inode, iov,
> -				 offset, nr_segs, ext4_get_block);
> +		if (IS_DAX(inode))
> +			ret = dax_do_io(rw, iocb, inode, iov, offset, nr_segs,
> +					ext4_get_block, NULL, DIO_LOCKING);
> +		else
> +			ret = blockdev_direct_IO(rw, iocb, inode, iov,
> +					offset, nr_segs, ext4_get_block);
>  
>  		if (unlikely((rw & WRITE) && ret < 0)) {
>  			loff_t isize = i_size_read(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ce7341c..9462730 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3140,13 +3140,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  		get_block_func = ext4_get_block_write;
>  		dio_flags = DIO_LOCKING;
>  	}
> -	ret = __blockdev_direct_IO(rw, iocb, inode,
> -				   inode->i_sb->s_bdev, iov,
> -				   offset, nr_segs,
> -				   get_block_func,
> -				   ext4_end_io_dio,
> -				   NULL,
> -				   dio_flags);
> +	if (IS_DAX(inode))
> +		ret = dax_do_io(rw, iocb, inode, iov, offset, nr_segs,
> +				get_block_func, ext4_end_io_dio, dio_flags);
> +	else
> +		ret = __blockdev_direct_IO(rw, iocb, inode,
> +					   inode->i_sb->s_bdev, iov, offset,
> +					   nr_segs, get_block_func,
> +					   ext4_end_io_dio, NULL, dio_flags);
>  
  Since you don't do real AIO for DAX, you could handle async iocbs for DAX
inodes the same way as normal sync iocbs (i.e., you don't need to allocate
ioend and do completion from a workqueue but handle everything necessary in
ext4_ext_direct_IO()). That will be noticeably faster and use less CPU
as well. I'm not saying you have to do that now (although it shouldn't
be complicated) but at least note that in a comment please.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2014-04-09 12:17 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-23 19:08 [PATCH v7 00/22] Support ext4 on NV-DIMMs Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 01/22] Fix XIP fault vs truncate race Matthew Wilcox
2014-03-29 15:57   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 02/22] Allow page fault handlers to perform the COW Matthew Wilcox
2014-04-08 16:34   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 03/22] axonram: Fix bug in direct_access Matthew Wilcox
2014-03-29 16:22   ` Jan Kara
2014-04-02 19:24     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 04/22] Change direct_access calling convention Matthew Wilcox
2014-03-29 16:30   ` Jan Kara
2014-04-02 19:27     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 05/22] Introduce IS_DAX(inode) Matthew Wilcox
2014-04-08 15:32   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 06/22] Replace XIP read and write with DAX I/O Matthew Wilcox
2014-04-08 17:56   ` Jan Kara
2014-04-08 20:21     ` Matthew Wilcox
2014-04-09  9:14       ` Jan Kara
2014-04-09 15:19         ` Matthew Wilcox
2014-04-09 20:55           ` Jan Kara
2014-04-13 18:05             ` Matthew Wilcox
2014-04-09 12:04   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler Matthew Wilcox
2014-04-08 22:05   ` Jan Kara
2014-04-09 20:48     ` Matthew Wilcox
2014-04-09 21:12       ` Jan Kara
2014-04-13 11:21         ` Matthew Wilcox
2014-04-14 16:04           ` Jan Kara
2014-04-09 10:27   ` Jan Kara
2014-04-09 20:51     ` Matthew Wilcox
2014-04-09 21:43       ` Jan Kara
2014-04-13 18:03         ` Matthew Wilcox
2014-07-29 12:12         ` Matthew Wilcox
2014-07-29 21:04           ` Jan Kara
2014-07-29 21:23             ` Matthew Wilcox
2014-07-30  9:52               ` Jan Kara
2014-07-30 21:02                 ` Matthew Wilcox
2014-08-09 11:00                 ` Matthew Wilcox
2014-08-11  8:51                   ` Jan Kara
2014-08-11 14:13                     ` Matthew Wilcox
2014-08-11 14:35                       ` Jan Kara
2014-08-11 15:02                         ` Matthew Wilcox
2014-08-11 15:25                           ` Jan Kara
2014-05-21 20:35   ` Toshi Kani
2014-06-05 22:38     ` Toshi Kani
2014-03-23 19:08 ` [PATCH v7 08/22] Replace xip_truncate_page with dax_truncate_page Matthew Wilcox
2014-04-08 22:17   ` Jan Kara
2014-04-09  9:26     ` Jan Kara
2014-04-13 19:07       ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 09/22] Remove mm/filemap_xip.c Matthew Wilcox
2014-04-08 18:21   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 10/22] Remove get_xip_mem Matthew Wilcox
2014-04-08 18:20   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 11/22] Replace ext2_clear_xip_target with dax_clear_blocks Matthew Wilcox
2014-04-09  9:46   ` Jan Kara
2014-04-10 14:16     ` Matthew Wilcox
2014-04-10 18:31       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 12/22] ext2: Remove ext2_xip_verify_sb() Matthew Wilcox
2014-04-09  9:52   ` Jan Kara
2014-04-10 14:22     ` Matthew Wilcox
2014-04-10 18:35       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 13/22] ext2: Remove ext2_use_xip Matthew Wilcox
2014-04-09  9:55   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 14/22] ext2: Remove xip.c and xip.h Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 15/22] Remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-04-10 14:23     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 16/22] ext2: Remove ext2_aops_xip Matthew Wilcox
2014-04-09 10:02   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 17/22] Get rid of most mentions of XIP in ext2 Matthew Wilcox
2014-04-09 10:04   ` Jan Kara
2014-04-10 14:26     ` Matthew Wilcox
2014-04-10 18:40       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 18/22] xip: Add xip_zero_page_range Matthew Wilcox
2014-04-09 10:15   ` Jan Kara
2014-04-10 14:27     ` Matthew Wilcox
2014-04-10 18:43       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 19/22] ext4: Make ext4_block_zero_page_range static Matthew Wilcox
2014-03-24 19:11   ` tytso
2014-03-23 19:08 ` [PATCH v7 20/22] ext4: Add DAX functionality Matthew Wilcox
2014-04-09 12:17   ` Jan Kara [this message]
2014-03-23 19:08 ` [PATCH v7 21/22] ext4: Fix typos Matthew Wilcox
2014-03-24 19:16   ` tytso
2014-03-23 19:08 ` [PATCH v7 22/22] brd: Rename XIP to DAX Matthew Wilcox
2014-04-09 10:07   ` Jan Kara
2014-05-18 14:58 ` [PATCH v7 00/22] Support ext4 on NV-DIMMs Boaz Harrosh
2014-05-18 23:24   ` Matthew Wilcox
2014-06-17 18:11 ` Boaz Harrosh
2014-06-17 18:19   ` Matthew Wilcox
2014-06-17 18:39     ` Boaz Harrosh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140409121717.GN32103@quack.suse.cz \
    --to=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=matthew.r.wilcox@intel.com \
    --cc=ross.zwisler@linux.intel.com \
    --cc=willy@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).