[PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io

linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
       [not found] <1461878218-3844-1-git-send-email-vishal.l.verma@intel.com>
@ 2016-04-28 21:16 ` Vishal Verma
  2016-05-02 14:56   ` Christoph Hellwig
  2016-05-02 15:41   ` Boaz Harrosh
  0 siblings, 2 replies; 25+ messages in thread
From: Vishal Verma @ 2016-04-28 21:16 UTC (permalink / raw)
  To: linux-nvdimm
  Cc: Vishal Verma, linux-fsdevel, linux-block, xfs, linux-ext4,
	linux-mm, Matthew Wilcox, Ross Zwisler, Dan Williams,
	Dave Chinner, Jan Kara, Jens Axboe, Al Viro, Andrew Morton,
	linux-kernel, Christoph Hellwig, Jeff Moyer

All IO in a dax filesystem used to go through dax_do_io, which cannot
handle media errors, and thus cannot provide a recovery path that can
send a write through the driver to clear errors.

Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
path for DAX filesystems, use the same direct_IO path for both DAX and
direct_io iocbs, but use the flags to identify when we are in O_DIRECT
mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
direct_IO path instead of DAX.

This allows us a recovery path in the form of opening the file with
O_DIRECT and writing to it with the usual O_DIRECT semantics (sector
alignment restrictions).

Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@fb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 drivers/block/loop.c |  2 +-
 fs/block_dev.c       | 17 +++++++++++++----
 fs/ext2/inode.c      | 16 ++++++++++++----
 fs/ext4/file.c       |  2 +-
 fs/ext4/inode.c      | 19 +++++++++++++------
 fs/xfs/xfs_aops.c    | 20 +++++++++++++-------
 fs/xfs/xfs_file.c    |  4 ++--
 include/linux/fs.h   | 15 ++++++++++++---
 mm/filemap.c         |  4 ++--
 9 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 80cf8ad..c0a24c3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -568,7 +568,7 @@ struct switch_request {
 
 static inline void loop_update_dio(struct loop_device *lo)
 {
-	__loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
+	__loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) |
 			lo->use_dio);
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 79defba..97a1f5f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -167,12 +167,21 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
 
-	if (IS_DAX(inode))
+	if (iocb_is_direct(iocb))
+		return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
+					    offset, blkdev_get_block, NULL,
+					    NULL, DIO_SKIP_DIO_COUNT);
+	else if (iocb_is_dax(iocb))
 		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
 				NULL, DIO_SKIP_DIO_COUNT);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
-				    blkdev_get_block, NULL, NULL,
-				    DIO_SKIP_DIO_COUNT);
+	else {
+		/*
+		 * If we're in the direct_IO path, either the IOCB_DIRECT or
+		 * IOCB_DAX flags must be set.
+		 */
+		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
+		return -ENXIO;
+	}
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 35f2b0bf..45f2b51 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -861,12 +861,20 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
-				DIO_LOCKING);
-	else
+	if (iocb_is_direct(iocb))
 		ret = blockdev_direct_IO(iocb, inode, iter, offset,
 					 ext2_get_block);
+	else if (iocb_is_dax(iocb))
+		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
+				DIO_LOCKING);
+	else {
+		/*
+		 * If we're in the direct_IO path, either the IOCB_DIRECT or
+		 * IOCB_DAX flags must be set.
+		 */
+		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
+		return -ENXIO;
+	}
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e9aa49..165a0b8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -94,7 +94,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct blk_plug plug;
-	int o_direct = iocb->ki_flags & IOCB_DIRECT;
+	int o_direct = iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX);
 	int unaligned_aio = 0;
 	int overwrite = 0;
 	ssize_t ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6d5d5c1..0b6d77a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3410,15 +3410,22 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
-				ext4_end_io_dio, dio_flags);
-	} else
+	if (iocb_is_direct(iocb))
 		ret = __blockdev_direct_IO(iocb, inode,
 					   inode->i_sb->s_bdev, iter, offset,
 					   get_block_func,
 					   ext4_end_io_dio, NULL, dio_flags);
-
+	else if (iocb_is_dax(iocb))
+		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
+				ext4_end_io_dio, dio_flags);
+	else {
+		/*
+		 * If we're in the direct_IO path, either the IOCB_DIRECT or
+		 * IOCB_DAX flags must be set.
+		 */
+		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
+		return -ENXIO;
+	}
 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
@@ -3503,7 +3510,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
 		else
 			unlocked = 1;
 	}
-	if (IS_DAX(inode)) {
+	if (iocb_is_dax(iocb)) {
 		ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
 				NULL, unlocked ? 0 : DIO_LOCKING);
 	} else {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e49b240..8134e99 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1412,21 +1412,27 @@ xfs_vm_direct_IO(
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	dio_iodone_t		*endio = NULL;
 	int			flags = 0;
-	struct block_device	*bdev;
+	struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
 
 	if (iov_iter_rw(iter) == WRITE) {
 		endio = xfs_end_io_direct_write;
 		flags = DIO_ASYNC_EXTEND;
 	}
 
-	if (IS_DAX(inode)) {
+	if (iocb_is_direct(iocb))
+		return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				xfs_get_blocks_direct, endio, NULL, flags);
+	else if (iocb_is_dax(iocb))
 		return dax_do_io(iocb, inode, iter, offset,
-				 xfs_get_blocks_direct, endio, 0);
+				xfs_get_blocks_direct, endio, 0);
+	else {
+		/*
+		 * If we're in the direct_IO path, either the IOCB_DIRECT or
+		 * IOCB_DAX flags must be set.
+		 */
+		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
+		return -ENXIO;
 	}
-
-	bdev = xfs_find_bdev_for_inode(inode);
-	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-			xfs_get_blocks_direct, endio, NULL, flags);
 }
 
 /*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c2946f4..3d5d3c2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -300,7 +300,7 @@ xfs_file_read_iter(
 
 	XFS_STATS_INC(mp, xs_read_calls);
 
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+	if (unlikely(iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)))
 		ioflags |= XFS_IO_ISDIRECT;
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
@@ -898,7 +898,7 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+	if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)))
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9f28130..adca1d8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -322,6 +322,7 @@ struct writeback_control;
 #define IOCB_APPEND		(1 << 1)
 #define IOCB_DIRECT		(1 << 2)
 #define IOCB_HIPRI		(1 << 3)
+#define IOCB_DAX		(1 << 4)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -2930,9 +2931,15 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
 
-static inline bool io_is_direct(struct file *filp)
+static inline bool iocb_is_dax(struct kiocb *iocb)
 {
-	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
+	return IS_DAX(file_inode(iocb->ki_filp)) &&
+		(iocb->ki_flags & IOCB_DAX);
+}
+
+static inline bool iocb_is_direct(struct kiocb *iocb)
+{
+	return iocb->ki_flags & IOCB_DIRECT;
 }
 
 static inline int iocb_flags(struct file *file)
@@ -2940,8 +2947,10 @@ static inline int iocb_flags(struct file *file)
 	int res = 0;
 	if (file->f_flags & O_APPEND)
 		res |= IOCB_APPEND;
-	if (io_is_direct(file))
+	if (file->f_flags & O_DIRECT)
 		res |= IOCB_DIRECT;
+	if (IS_DAX(file_inode(file)))
+		res |= IOCB_DAX;
 	return res;
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 3effd5c..b959acf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1849,7 +1849,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	if (!count)
 		goto out; /* skip atime */
 
-	if (iocb->ki_flags & IOCB_DIRECT) {
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)) {
 		struct address_space *mapping = file->f_mapping;
 		struct inode *inode = mapping->host;
 		loff_t size;
@@ -2719,7 +2719,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err)
 		goto out;
 
-	if (iocb->ki_flags & IOCB_DIRECT) {
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)) {
 		loff_t pos, endbyte;
 
 		written = generic_file_direct_write(iocb, from, iocb->ki_pos);
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-04-28 21:16 ` [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io Vishal Verma
@ 2016-05-02 14:56   ` Christoph Hellwig
  2016-05-02 15:45     ` Vishal Verma
  2016-05-02 15:41   ` Boaz Harrosh
  1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-05-02 14:56 UTC (permalink / raw)
  To: Vishal Verma
  Cc: linux-nvdimm, linux-fsdevel, linux-block, xfs, linux-ext4,
	linux-mm, Matthew Wilcox, Ross Zwisler, Dan Williams,
	Dave Chinner, Jan Kara, Jens Axboe, Al Viro, Andrew Morton,
	linux-kernel, Christoph Hellwig, Jeff Moyer

> index 79defba..97a1f5f 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -167,12 +167,21 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = bdev_file_inode(file);
>  
> -	if (IS_DAX(inode))
> +	if (iocb_is_direct(iocb))
> +		return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
> +					    offset, blkdev_get_block, NULL,
> +					    NULL, DIO_SKIP_DIO_COUNT);
> +	else if (iocb_is_dax(iocb))
>  		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
>  				NULL, DIO_SKIP_DIO_COUNT);
> -	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
> -				    blkdev_get_block, NULL, NULL,
> -				    DIO_SKIP_DIO_COUNT);
> +	else {
> +		/*
> +		 * If we're in the direct_IO path, either the IOCB_DIRECT or
> +		 * IOCB_DAX flags must be set.
> +		 */
> +		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
> +		return -ENXIO;
> +	}

DAX should not even end up in ->direct_IO.

> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -300,7 +300,7 @@ xfs_file_read_iter(
>  
>  	XFS_STATS_INC(mp, xs_read_calls);
>  
> -	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
> +	if (unlikely(iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)))
>  		ioflags |= XFS_IO_ISDIRECT;

please also add a XFS_IO_ISDAX flag to propagate the information
properly and allow tracing to display the actual I/O type.

> +static inline bool iocb_is_dax(struct kiocb *iocb)
>  {
> +	return IS_DAX(file_inode(iocb->ki_filp)) &&
> +		(iocb->ki_flags & IOCB_DAX);
> +}
> +
> +static inline bool iocb_is_direct(struct kiocb *iocb)
> +{
> +	return iocb->ki_flags & IOCB_DIRECT;
>  }

No need for these helpers - especially as IOCB_DAX should never be set
if IS_DAX is false.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 14:56   ` Christoph Hellwig
@ 2016-05-02 15:45     ` Vishal Verma
  0 siblings, 0 replies; 25+ messages in thread
From: Vishal Verma @ 2016-05-02 15:45 UTC (permalink / raw)
  To: Christoph Hellwig, Vishal Verma
  Cc: linux-nvdimm, linux-fsdevel, linux-block, xfs, linux-ext4,
	linux-mm, Matthew Wilcox, Ross Zwisler, Dan Williams,
	Dave Chinner, Jan Kara, Jens Axboe, Al Viro, Andrew Morton,
	linux-kernel, Jeff Moyer

On Mon, 2016-05-02 at 07:56 -0700, Christoph Hellwig wrote:
> > 
> > index 79defba..97a1f5f 100644
> > --- a/fs/block_dev.c
> > +++ b/fs/block_dev.c
> > @@ -167,12 +167,21 @@ blkdev_direct_IO(struct kiocb *iocb, struct
> > iov_iter *iter, loff_t offset)
> >  	struct file *file = iocb->ki_filp;
> >  	struct inode *inode = bdev_file_inode(file);
> >  
> > -	if (IS_DAX(inode))
> > +	if (iocb_is_direct(iocb))
> > +		return __blockdev_direct_IO(iocb, inode,
> > I_BDEV(inode), iter,
> > +					    offset,
> > blkdev_get_block, NULL,
> > +					    NULL,
> > DIO_SKIP_DIO_COUNT);
> > +	else if (iocb_is_dax(iocb))
> >  		return dax_do_io(iocb, inode, iter, offset,
> > blkdev_get_block,
> >  				NULL, DIO_SKIP_DIO_COUNT);
> > -	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode),
> > iter, offset,
> > -				    blkdev_get_block, NULL, NULL,
> > -				    DIO_SKIP_DIO_COUNT);
> > +	else {
> > +		/*
> > +		 * If we're in the direct_IO path, either the
> > IOCB_DIRECT or
> > +		 * IOCB_DAX flags must be set.
> > +		 */
> > +		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
> > +		return -ENXIO;
> > +	}
> DAX should not even end up in ->direct_IO.

Do you mean to say remove the last 'else' clause entirely?
I agree that it should never be hit, which is why it is a WARN..
But I'm happy to remove it.

> 
> > 
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -300,7 +300,7 @@ xfs_file_read_iter(
> >  
> >  	XFS_STATS_INC(mp, xs_read_calls);
> >  
> > -	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
> > +	if (unlikely(iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)))
> >  		ioflags |= XFS_IO_ISDIRECT;
> please also add a XFS_IO_ISDAX flag to propagate the information
> properly and allow tracing to display the actual I/O type.

Will do.

> 
> > 
> > +static inline bool iocb_is_dax(struct kiocb *iocb)
> >  {
> > +	return IS_DAX(file_inode(iocb->ki_filp)) &&
> > +		(iocb->ki_flags & IOCB_DAX);
> > +}
> > +
> > +static inline bool iocb_is_direct(struct kiocb *iocb)
> > +{
> > +	return iocb->ki_flags & IOCB_DIRECT;
> >  }
> No need for these helpers - especially as IOCB_DAX should never be
> set
> if IS_DAX is false.

Ok. So check the flags directly where needed?

> --
> To unsubscribe from this list: send the line "unsubscribe linux-
> block" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-04-28 21:16 ` [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io Vishal Verma
  2016-05-02 14:56   ` Christoph Hellwig
@ 2016-05-02 15:41   ` Boaz Harrosh
  2016-05-02 15:51     ` Vishal Verma
                       ` (2 more replies)
  1 sibling, 3 replies; 25+ messages in thread
From: Boaz Harrosh @ 2016-05-02 15:41 UTC (permalink / raw)
  To: Vishal Verma, linux-nvdimm
  Cc: Jens Axboe, Jan Kara, Andrew Morton, Matthew Wilcox, Dave Chinner,
	linux-kernel, xfs, linux-block, linux-mm, Al Viro,
	Christoph Hellwig, linux-fsdevel, linux-ext4

On 04/29/2016 12:16 AM, Vishal Verma wrote:
> All IO in a dax filesystem used to go through dax_do_io, which cannot
> handle media errors, and thus cannot provide a recovery path that can
> send a write through the driver to clear errors.
> 
> Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
> path for DAX filesystems, use the same direct_IO path for both DAX and
> direct_io iocbs, but use the flags to identify when we are in O_DIRECT
> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
> direct_IO path instead of DAX.
> 

Really? What is your thinking here?

What about all the current users of O_DIRECT? You have just made them
4 times slower and "less concurrent*" than "buffered io" users, since the
direct_IO path will queue an IO request and all.
(And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])

I hate it that you overload the semantics of a known and expected
O_DIRECT flag, for special pmem quirks. This is an incompatible
and unrelated overload of the semantics of O_DIRECT.

> This allows us a recovery path in the form of opening the file with
> O_DIRECT and writing to it with the usual O_DIRECT semantics (sector
> alignment restrictions).
> 

I understand that you want sector-aligned IO, right? For the
clearing of errors. But I hate it that you forced all O_DIRECT IO
to be slow for this.
Can you not make dax_do_io handle media errors? At least for the
parts of the IO that are aligned.
(And your recovery path application above can use only aligned
 IO to make sure)

Please look for another solution. Even a special IOCTL_DAX_CLEAR_ERROR

[*"less concurrent" because of the queuing done in bdev. Note how
  pmem is not even multi-queue, and even if it was it will be much
  slower than DAX because of the code depth and all the locks and task
  switches done in the block layer. In DAX the final memcpy is done directly
  on the user-mode thread]

Thanks
Boaz

> Cc: Matthew Wilcox <matthew@wil.cx>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Jens Axboe <axboe@fb.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> ---
>  drivers/block/loop.c |  2 +-
>  fs/block_dev.c       | 17 +++++++++++++----
>  fs/ext2/inode.c      | 16 ++++++++++++----
>  fs/ext4/file.c       |  2 +-
>  fs/ext4/inode.c      | 19 +++++++++++++------
>  fs/xfs/xfs_aops.c    | 20 +++++++++++++-------
>  fs/xfs/xfs_file.c    |  4 ++--
>  include/linux/fs.h   | 15 ++++++++++++---
>  mm/filemap.c         |  4 ++--
>  9 files changed, 69 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 80cf8ad..c0a24c3 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -568,7 +568,7 @@ struct switch_request {
>  
>  static inline void loop_update_dio(struct loop_device *lo)
>  {
> -	__loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
> +	__loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) |
>  			lo->use_dio);
>  }
>  
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 79defba..97a1f5f 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -167,12 +167,21 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = bdev_file_inode(file);
>  
> -	if (IS_DAX(inode))
> +	if (iocb_is_direct(iocb))
> +		return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
> +					    offset, blkdev_get_block, NULL,
> +					    NULL, DIO_SKIP_DIO_COUNT);
> +	else if (iocb_is_dax(iocb))
>  		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
>  				NULL, DIO_SKIP_DIO_COUNT);
> -	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
> -				    blkdev_get_block, NULL, NULL,
> -				    DIO_SKIP_DIO_COUNT);
> +	else {
> +		/*
> +		 * If we're in the direct_IO path, either the IOCB_DIRECT or
> +		 * IOCB_DAX flags must be set.
> +		 */
> +		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
> +		return -ENXIO;
> +	}
>  }
>  
>  int __sync_blockdev(struct block_device *bdev, int wait)
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 35f2b0bf..45f2b51 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -861,12 +861,20 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
>  	size_t count = iov_iter_count(iter);
>  	ssize_t ret;
>  
> -	if (IS_DAX(inode))
> -		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
> -				DIO_LOCKING);
> -	else
> +	if (iocb_is_direct(iocb))
>  		ret = blockdev_direct_IO(iocb, inode, iter, offset,
>  					 ext2_get_block);
> +	else if (iocb_is_dax(iocb))
> +		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
> +				DIO_LOCKING);
> +	else {
> +		/*
> +		 * If we're in the direct_IO path, either the IOCB_DIRECT or
> +		 * IOCB_DAX flags must be set.
> +		 */
> +		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
> +		return -ENXIO;
> +	}
>  	if (ret < 0 && iov_iter_rw(iter) == WRITE)
>  		ext2_write_failed(mapping, offset + count);
>  	return ret;
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 2e9aa49..165a0b8 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -94,7 +94,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  	struct blk_plug plug;
> -	int o_direct = iocb->ki_flags & IOCB_DIRECT;
> +	int o_direct = iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX);
>  	int unaligned_aio = 0;
>  	int overwrite = 0;
>  	ssize_t ret;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 6d5d5c1..0b6d77a 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3410,15 +3410,22 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
>  #ifdef CONFIG_EXT4_FS_ENCRYPTION
>  	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
>  #endif
> -	if (IS_DAX(inode)) {
> -		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
> -				ext4_end_io_dio, dio_flags);
> -	} else
> +	if (iocb_is_direct(iocb))
>  		ret = __blockdev_direct_IO(iocb, inode,
>  					   inode->i_sb->s_bdev, iter, offset,
>  					   get_block_func,
>  					   ext4_end_io_dio, NULL, dio_flags);
> -
> +	else if (iocb_is_dax(iocb))
> +		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
> +				ext4_end_io_dio, dio_flags);
> +	else {
> +		/*
> +		 * If we're in the direct_IO path, either the IOCB_DIRECT or
> +		 * IOCB_DAX flags must be set.
> +		 */
> +		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
> +		return -ENXIO;
> +	}
>  	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
>  						EXT4_STATE_DIO_UNWRITTEN)) {
>  		int err;
> @@ -3503,7 +3510,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
>  		else
>  			unlocked = 1;
>  	}
> -	if (IS_DAX(inode)) {
> +	if (iocb_is_dax(iocb)) {
>  		ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
>  				NULL, unlocked ? 0 : DIO_LOCKING);
>  	} else {
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index e49b240..8134e99 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1412,21 +1412,27 @@ xfs_vm_direct_IO(
>  	struct inode		*inode = iocb->ki_filp->f_mapping->host;
>  	dio_iodone_t		*endio = NULL;
>  	int			flags = 0;
> -	struct block_device	*bdev;
> +	struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
>  
>  	if (iov_iter_rw(iter) == WRITE) {
>  		endio = xfs_end_io_direct_write;
>  		flags = DIO_ASYNC_EXTEND;
>  	}
>  
> -	if (IS_DAX(inode)) {
> +	if (iocb_is_direct(iocb))
> +		return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
> +				xfs_get_blocks_direct, endio, NULL, flags);
> +	else if (iocb_is_dax(iocb))
>  		return dax_do_io(iocb, inode, iter, offset,
> -				 xfs_get_blocks_direct, endio, 0);
> +				xfs_get_blocks_direct, endio, 0);
> +	else {
> +		/*
> +		 * If we're in the direct_IO path, either the IOCB_DIRECT or
> +		 * IOCB_DAX flags must be set.
> +		 */
> +		WARN_ONCE(1, "Kernel Bug with iocb flags\n");
> +		return -ENXIO;
>  	}
> -
> -	bdev = xfs_find_bdev_for_inode(inode);
> -	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
> -			xfs_get_blocks_direct, endio, NULL, flags);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index c2946f4..3d5d3c2 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -300,7 +300,7 @@ xfs_file_read_iter(
>  
>  	XFS_STATS_INC(mp, xs_read_calls);
>  
> -	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
> +	if (unlikely(iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)))
>  		ioflags |= XFS_IO_ISDIRECT;
>  	if (file->f_mode & FMODE_NOCMTIME)
>  		ioflags |= XFS_IO_INVIS;
> @@ -898,7 +898,7 @@ xfs_file_write_iter(
>  	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
>  		return -EIO;
>  
> -	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
> +	if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)))
>  		ret = xfs_file_dio_aio_write(iocb, from);
>  	else
>  		ret = xfs_file_buffered_aio_write(iocb, from);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9f28130..adca1d8 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -322,6 +322,7 @@ struct writeback_control;
>  #define IOCB_APPEND		(1 << 1)
>  #define IOCB_DIRECT		(1 << 2)
>  #define IOCB_HIPRI		(1 << 3)
> +#define IOCB_DAX		(1 << 4)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -2930,9 +2931,15 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
>  extern void save_mount_options(struct super_block *sb, char *options);
>  extern void replace_mount_options(struct super_block *sb, char *options);
>  
> -static inline bool io_is_direct(struct file *filp)
> +static inline bool iocb_is_dax(struct kiocb *iocb)
>  {
> -	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
> +	return IS_DAX(file_inode(iocb->ki_filp)) &&
> +		(iocb->ki_flags & IOCB_DAX);
> +}
> +
> +static inline bool iocb_is_direct(struct kiocb *iocb)
> +{
> +	return iocb->ki_flags & IOCB_DIRECT;
>  }
>  
>  static inline int iocb_flags(struct file *file)
> @@ -2940,8 +2947,10 @@ static inline int iocb_flags(struct file *file)
>  	int res = 0;
>  	if (file->f_flags & O_APPEND)
>  		res |= IOCB_APPEND;
> -	if (io_is_direct(file))
> +	if (file->f_flags & O_DIRECT)
>  		res |= IOCB_DIRECT;
> +	if (IS_DAX(file_inode(file)))
> +		res |= IOCB_DAX;
>  	return res;
>  }
>  
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 3effd5c..b959acf 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1849,7 +1849,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>  	if (!count)
>  		goto out; /* skip atime */
>  
> -	if (iocb->ki_flags & IOCB_DIRECT) {
> +	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)) {
>  		struct address_space *mapping = file->f_mapping;
>  		struct inode *inode = mapping->host;
>  		loff_t size;
> @@ -2719,7 +2719,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (err)
>  		goto out;
>  
> -	if (iocb->ki_flags & IOCB_DIRECT) {
> +	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_DAX)) {
>  		loff_t pos, endbyte;
>  
>  		written = generic_file_direct_write(iocb, from, iocb->ki_pos);
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 15:41   ` Boaz Harrosh
@ 2016-05-02 15:51     ` Vishal Verma
  2016-05-02 16:03       ` Boaz Harrosh
  2016-05-02 16:01     ` Dan Williams
  2016-05-05 14:24     ` Christoph Hellwig
  2 siblings, 1 reply; 25+ messages in thread
From: Vishal Verma @ 2016-05-02 15:51 UTC (permalink / raw)
  To: Boaz Harrosh, Vishal Verma, linux-nvdimm
  Cc: linux-block, Jan Kara, Matthew Wilcox, Dave Chinner, linux-kernel,
	xfs, Jens Axboe, linux-mm, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On Mon, 2016-05-02 at 18:41 +0300, Boaz Harrosh wrote:
> On 04/29/2016 12:16 AM, Vishal Verma wrote:
> > 
> > All IO in a dax filesystem used to go through dax_do_io, which
> > cannot
> > handle media errors, and thus cannot provide a recovery path that
> > can
> > send a write through the driver to clear errors.
> > 
> > Add a new iocb flag for DAX, and set it only for DAX mounts. In the
> > IO
> > path for DAX filesystems, use the same direct_IO path for both DAX
> > and
> > direct_io iocbs, but use the flags to identify when we are in
> > O_DIRECT
> > mode vs non O_DIRECT with DAX, and for O_DIRECT, use the
> > conventional
> > direct_IO path instead of DAX.
> > 
> Really? What are your thinking here?
> 
> What about all the current users of O_DIRECT, you have just made them
> 4 times slower and "less concurrent*" then "buffred io" users. Since
> direct_IO path will queue an IO request and all.
> (And if it is not so slow then why do we need dax_do_io at all?
> [Rhetorical])
> 
> I hate it that you overload the semantics of a known and expected
> O_DIRECT flag, for special pmem quirks. This is an incompatible
> and unrelated overload of the semantics of O_DIRECT.

We overloaded O_DIRECT a long time ago when we made DAX piggyback on
the same path:

static inline bool io_is_direct(struct file *filp)
{
	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
}

Yes O_DIRECT on a DAX mounted file system will now be slower, but -

> 
> > 
> > This allows us a recovery path in the form of opening the file with
> > O_DIRECT and writing to it with the usual O_DIRECT semantics
> > (sector
> > alignment restrictions).
> > 
> I understand that you want a sector aligned IO, right? for the
> clear of errors. But I hate it that you forced all O_DIRECT IO
> to be slow for this.
> Can you not make dax_do_io handle media errors? At least for the
> parts of the IO that are aligned.
> (And your recovery path application above can use only aligned
>  IO to make sure)
> 
> Please look for another solution. Even a special
> IOCTL_DAX_CLEAR_ERROR

 - see all the versions of this series prior to this one, where we try
to do a fallback...

> 
> [*"less concurrent" because of the queuing done in bdev. Note how
>   pmem is not even multi-queue, and even if it was it will be much
>   slower then DAX because of the code depth and all the locks and
> task
>   switches done in the block layer. In DAX the final memcpy is done
> directly
>   on the user-mode thread]
> 
> Thanks
> Boaz
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 15:51     ` Vishal Verma
@ 2016-05-02 16:03       ` Boaz Harrosh
  2016-05-02 18:52         ` Verma, Vishal L
  0 siblings, 1 reply; 25+ messages in thread
From: Boaz Harrosh @ 2016-05-02 16:03 UTC (permalink / raw)
  To: Vishal Verma, Vishal Verma, linux-nvdimm
  Cc: linux-block, Jan Kara, Matthew Wilcox, Dave Chinner, linux-kernel,
	xfs, Jens Axboe, linux-mm, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On 05/02/2016 06:51 PM, Vishal Verma wrote:
> On Mon, 2016-05-02 at 18:41 +0300, Boaz Harrosh wrote:
>> On 04/29/2016 12:16 AM, Vishal Verma wrote:
>>>
>>> All IO in a dax filesystem used to go through dax_do_io, which
>>> cannot
>>> handle media errors, and thus cannot provide a recovery path that
>>> can
>>> send a write through the driver to clear errors.
>>>
>>> Add a new iocb flag for DAX, and set it only for DAX mounts. In the
>>> IO
>>> path for DAX filesystems, use the same direct_IO path for both DAX
>>> and
>>> direct_io iocbs, but use the flags to identify when we are in
>>> O_DIRECT
>>> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the
>>> conventional
>>> direct_IO path instead of DAX.
>>>
>> Really? What are your thinking here?
>>
>> What about all the current users of O_DIRECT, you have just made them
>> 4 times slower and "less concurrent*" then "buffred io" users. Since
>> direct_IO path will queue an IO request and all.
>> (And if it is not so slow then why do we need dax_do_io at all?
>> [Rhetorical])
>>
>> I hate it that you overload the semantics of a known and expected
>> O_DIRECT flag, for special pmem quirks. This is an incompatible
>> and unrelated overload of the semantics of O_DIRECT.
> 
> We overloaded O_DIRECT a long time ago when we made DAX piggyback on
> the same path:
> 
> static inline bool io_is_direct(struct file *filp)
> {
> 	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
> }
> 

No, as far as the user is concerned we have not. The O_DIRECT user
is still getting all the semantics he wants, i.e. no syncs, no
memory cache usage, no copies ...

Only with DAX the buffered IO is the same since with pmem it is faster.
Then why not? The basic contract with the user did not break.

The above was just an implementation detail to easily navigate
through the Linux vfs IO stack and make the least amount of changes
in every FS that wanted to support DAX. (And since dax_do_io is much
more like direct_IO than like page-cache IO.)

> Yes O_DIRECT on a DAX mounted file system will now be slower, but -
> 
>>
>>>
>>> This allows us a recovery path in the form of opening the file with
>>> O_DIRECT and writing to it with the usual O_DIRECT semantics
>>> (sector
>>> alignment restrictions).
>>>
>> I understand that you want a sector aligned IO, right? for the
>> clear of errors. But I hate it that you forced all O_DIRECT IO
>> to be slow for this.
>> Can you not make dax_do_io handle media errors? At least for the
>> parts of the IO that are aligned.
>> (And your recovery path application above can use only aligned
>>  IO to make sure)
>>
>> Please look for another solution. Even a special
>> IOCTL_DAX_CLEAR_ERROR
> 
>  - see all the versions of this series prior to this one, where we try
> to do a fallback...
> 

And?

So now all O_DIRECT APPs go 4 times slower. I will have a look, but if
it is really so bad then please consider an IOCTL or syscall. Or a special
O_DAX_ERRORS flag ...

Please do not trash all the O_DIRECT users, they are the more important
clients, like DBs and VMs.

Thanks
Boaz

>>
>> [*"less concurrent" because of the queuing done in bdev. Note how
>>   pmem is not even multi-queue, and even if it was it will be much
>>   slower then DAX because of the code depth and all the locks and
>> task
>>   switches done in the block layer. In DAX the final memcpy is done
>> directly
>>   on the user-mode thread]
>>
>> Thanks
>> Boaz
>>
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 16:03       ` Boaz Harrosh
@ 2016-05-02 18:52         ` Verma, Vishal L
  0 siblings, 0 replies; 25+ messages in thread
From: Verma, Vishal L @ 2016-05-02 18:52 UTC (permalink / raw)
  To: linux-nvdimm@lists.01.org, boaz@plexistor.com
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	hch@infradead.org, xfs@oss.sgi.com, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, axboe@fb.com, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	david@fromorbit.com, jack@suse.cz, matthew@wil.cx

On Mon, 2016-05-02 at 19:03 +0300, Boaz Harrosh wrote:
> On 05/02/2016 06:51 PM, Vishal Verma wrote:
> >
> > On Mon, 2016-05-02 at 18:41 +0300, Boaz Harrosh wrote:
> > >
> > > On 04/29/2016 12:16 AM, Vishal Verma wrote:
> > > >
> > > >
> > > > All IO in a dax filesystem used to go through dax_do_io, which
> > > > cannot
> > > > handle media errors, and thus cannot provide a recovery path
> > > > that
> > > > can
> > > > send a write through the driver to clear errors.
> > > >
> > > > Add a new iocb flag for DAX, and set it only for DAX mounts. In
> > > > the
> > > > IO
> > > > path for DAX filesystems, use the same direct_IO path for both
> > > > DAX
> > > > and
> > > > direct_io iocbs, but use the flags to identify when we are in
> > > > O_DIRECT
> > > > mode vs non O_DIRECT with DAX, and for O_DIRECT, use the
> > > > conventional
> > > > direct_IO path instead of DAX.
> > > >
> > > Really? What are your thinking here?
> > >
> > > What about all the current users of O_DIRECT, you have just made
> > > them
> > > 4 times slower and "less concurrent*" then "buffred io" users.
> > > Since
> > > direct_IO path will queue an IO request and all.
> > > (And if it is not so slow then why do we need dax_do_io at all?
> > > [Rhetorical])
> > >
> > > I hate it that you overload the semantics of a known and expected
> > > O_DIRECT flag, for special pmem quirks. This is an incompatible
> > > and unrelated overload of the semantics of O_DIRECT.
> > We overloaded O_DIRECT a long time ago when we made DAX piggyback on
> > the same path:
> >
> > static inline bool io_is_direct(struct file *filp)
> > {
> > 	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping-
> > >host);
> > }
> >
> No as far as the user is concerned we have not. The O_DIRECT user
> is still getting all the semantics he wants, .i.e no syncs no
> memory cache usage, no copies ...
>
> Only with DAX the buffered IO is the same since with pmem it is
> faster.
> Then why not? The basic contract with the user did not break.
>
> The above was just an implementation detail to easily navigate
> through the Linux vfs IO stack and make the least amount of changes
> in every FS that wanted to support DAX.(And since dax_do_io is much
> more like direct_IO then like page-cache IO)
>
> >
> > Yes O_DIRECT on a DAX mounted file system will now be slower, but -
> >
> > >
> > >
> > > >
> > > >
> > > > This allows us a recovery path in the form of opening the file
> > > > with
> > > > O_DIRECT and writing to it with the usual O_DIRECT semantics
> > > > (sector
> > > > alignment restrictions).
> > > >
> > > I understand that you want a sector aligned IO, right? for the
> > > clear of errors. But I hate it that you forced all O_DIRECT IO
> > > to be slow for this.
> > > Can you not make dax_do_io handle media errors? At least for the
> > > parts of the IO that are aligned.
> > > (And your recovery path application above can use only aligned
> > >  IO to make sure)
> > >
> > > Please look for another solution. Even a special
> > > IOCTL_DAX_CLEAR_ERROR
> >  - see all the versions of this series prior to this one, where we
> > try
> > to do a fallback...
> >
> And?
>
> So now all O_DIRECT APPs go 4 times slower. I will have a look but if
> it is really so bad than please consider an IOCTL or syscall. Or a
> special
> O_DAX_ERRORS flag ...

I'm curious where the 4x slower comes from.. The O_DIRECT path is still
without page-cache copies, and nor does it go through request queues
(since pmem is a bio-based driver). The only overhead is that of
submitting a bio - and while I agree it is more overhead than dax_do_io,
4x seems a bit high.

>
> Please do not trash all the O_DIRECT users, they are the more
> important
> clients, like DBs and VMs.

Shouldn't they be using mmaps and dax faults? I was under the impression
that the dax_do_io path is a nice-to-have, but for anyone that will want
to use DAX, they will want the mmap/fault path, not the IO path. This is
just making the IO path 'more correct' by allowing it a way to deal with
errors.

>
> Thanks
> Boaz
>
> >
> > >
> > >
> > > [*"less concurrent" because of the queuing done in bdev. Note how
> > >   pmem is not even multi-queue, and even if it was it will be much
> > >   slower then DAX because of the code depth and all the locks and
> > > task
> > >   switches done in the block layer. In DAX the final memcpy is
> > > done
> > > directly
> > >   on the user-mode thread]
> > >
> > > Thanks
> > > Boaz
> > >

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 15:41   ` Boaz Harrosh
  2016-05-02 15:51     ` Vishal Verma
@ 2016-05-02 16:01     ` Dan Williams
  2016-05-02 16:22       ` Boaz Harrosh
  2016-05-05 14:24     ` Christoph Hellwig
  2 siblings, 1 reply; 25+ messages in thread
From: Dan Williams @ 2016-05-02 16:01 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On Mon, May 2, 2016 at 8:41 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 04/29/2016 12:16 AM, Vishal Verma wrote:
>> All IO in a dax filesystem used to go through dax_do_io, which cannot
>> handle media errors, and thus cannot provide a recovery path that can
>> send a write through the driver to clear errors.
>>
>> Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
>> path for DAX filesystems, use the same direct_IO path for both DAX and
>> direct_io iocbs, but use the flags to identify when we are in O_DIRECT
>> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
>> direct_IO path instead of DAX.
>>
>
> Really? What are your thinking here?
>
> What about all the current users of O_DIRECT, you have just made them
> 4 times slower and "less concurrent*" then "buffred io" users. Since
> direct_IO path will queue an IO request and all.
> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
>
> I hate it that you overload the semantics of a known and expected
> O_DIRECT flag, for special pmem quirks. This is an incompatible
> and unrelated overload of the semantics of O_DIRECT.

I think it is the opposite situation, it is undoing the premature
overloading of O_DIRECT that went in without performance numbers.
This implementation clarifies that dax_do_io() handles the lack of a
page cache for buffered I/O and O_DIRECT behaves as it nominally would
by sending an I/O to the driver.  It has the benefit of matching the
error semantics of a typical block device where a buffered write could
hit an error filling the page cache, but an O_DIRECT write potentially
triggers the drive to remap the block.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 16:01     ` Dan Williams
@ 2016-05-02 16:22       ` Boaz Harrosh
  2016-05-02 16:49         ` Dan Williams
  0 siblings, 1 reply; 25+ messages in thread
From: Boaz Harrosh @ 2016-05-02 16:22 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On 05/02/2016 07:01 PM, Dan Williams wrote:
> On Mon, May 2, 2016 at 8:41 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>> On 04/29/2016 12:16 AM, Vishal Verma wrote:
>>> All IO in a dax filesystem used to go through dax_do_io, which cannot
>>> handle media errors, and thus cannot provide a recovery path that can
>>> send a write through the driver to clear errors.
>>>
>>> Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
>>> path for DAX filesystems, use the same direct_IO path for both DAX and
>>> direct_io iocbs, but use the flags to identify when we are in O_DIRECT
>>> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
>>> direct_IO path instead of DAX.
>>>
>>
>> Really? What are your thinking here?
>>
>> What about all the current users of O_DIRECT, you have just made them
>> 4 times slower and "less concurrent*" then "buffred io" users. Since
>> direct_IO path will queue an IO request and all.
>> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
>>
>> I hate it that you overload the semantics of a known and expected
>> O_DIRECT flag, for special pmem quirks. This is an incompatible
>> and unrelated overload of the semantics of O_DIRECT.
> 
> I think it is the opposite situation, it us undoing the premature
> overloading of O_DIRECT that went in without performance numbers.

We have tons of measurements. It is not hard to imagine the results, though,
especially in the 1000-threads case.

> This implementation clarifies that dax_do_io() handles the lack of a
> page cache for buffered I/O and O_DIRECT behaves as it nominally would
> by sending an I/O to the driver.  

> It has the benefit of matching the
> error semantics of a typical block device where a buffered write could
> hit an error filling the page cache, but an O_DIRECT write potentially
> triggers the drive to remap the block.
> 

I fail to see how, for writes, the device error semantics regarding remapping of
blocks are any different between buffered and direct IO. As far as the block
device is concerned, it is the exact same code path. The big difference is
higher up, in the VFS.

And ... So you are willing to sacrifice the 99% hotpath for the sake of the
1% error path? and piggybacking on poor O_DIRECT.

Again there are tons of O_DIRECT apps out there, why are you forcing them to
change if they want true pmem performance?

I still believe dax_do_io() can be made more resilient to errors, and clear
errors on writes. Me going digging in old patches ...

Cheers
Boaz


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 16:22       ` Boaz Harrosh
@ 2016-05-02 16:49         ` Dan Williams
  2016-05-02 17:44           ` Boaz Harrosh
  0 siblings, 1 reply; 25+ messages in thread
From: Dan Williams @ 2016-05-02 16:49 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On Mon, May 2, 2016 at 9:22 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 05/02/2016 07:01 PM, Dan Williams wrote:
>> On Mon, May 2, 2016 at 8:41 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>>> On 04/29/2016 12:16 AM, Vishal Verma wrote:
>>>> All IO in a dax filesystem used to go through dax_do_io, which cannot
>>>> handle media errors, and thus cannot provide a recovery path that can
>>>> send a write through the driver to clear errors.
>>>>
>>>> Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
>>>> path for DAX filesystems, use the same direct_IO path for both DAX and
>>>> direct_io iocbs, but use the flags to identify when we are in O_DIRECT
>>>> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
>>>> direct_IO path instead of DAX.
>>>>
>>>
>>> Really? What are your thinking here?
>>>
>>> What about all the current users of O_DIRECT, you have just made them
>>> 4 times slower and "less concurrent*" then "buffred io" users. Since
>>> direct_IO path will queue an IO request and all.
>>> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
>>>
>>> I hate it that you overload the semantics of a known and expected
>>> O_DIRECT flag, for special pmem quirks. This is an incompatible
>>> and unrelated overload of the semantics of O_DIRECT.
>>
>> I think it is the opposite situation, it us undoing the premature
>> overloading of O_DIRECT that went in without performance numbers.
>
> We have tons of measurements. Is not hard to imagine the results though.
> Specially the 1000 threads case
>
>> This implementation clarifies that dax_do_io() handles the lack of a
>> page cache for buffered I/O and O_DIRECT behaves as it nominally would
>> by sending an I/O to the driver.
>
>> It has the benefit of matching the
>> error semantics of a typical block device where a buffered write could
>> hit an error filling the page cache, but an O_DIRECT write potentially
>> triggers the drive to remap the block.
>>
>
> I fail to see how in writes the device error semantics regarding remapping of
> blocks is any different between buffered and direct IO. As far as the block
> device it is the same exact code path. All The big difference is higher in the
> VFS.
>
> And ... So you are willing to sacrifice the 99% hotpath for the sake of the
> 1% error path? and piggybacking on poor O_DIRECT.
>
> Again there are tons of O_DIRECT apps out there, why are you forcing them to
> change if they want true pmem performance?

This isn't forcing them to change.  This is the path of least surprise
as error semantics are identical to a typical block device.  Yes, an
application can go faster by switching to the "buffered" / dax_do_io()
path; it can go even faster by switching to mmap() I/O and using DAX
directly.  If we can later optimize the O_DIRECT path to bring its
performance more in line with dax_do_io(), great, but the
implementation should be correct first and optimized later.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 16:49         ` Dan Williams
@ 2016-05-02 17:44           ` Boaz Harrosh
  2016-05-02 18:10             ` Dan Williams
  0 siblings, 1 reply; 25+ messages in thread
From: Boaz Harrosh @ 2016-05-02 17:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On 05/02/2016 07:49 PM, Dan Williams wrote:
> On Mon, May 2, 2016 at 9:22 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>> On 05/02/2016 07:01 PM, Dan Williams wrote:
>>> On Mon, May 2, 2016 at 8:41 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>>>> On 04/29/2016 12:16 AM, Vishal Verma wrote:
>>>>> All IO in a dax filesystem used to go through dax_do_io, which cannot
>>>>> handle media errors, and thus cannot provide a recovery path that can
>>>>> send a write through the driver to clear errors.
>>>>>
>>>>> Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
>>>>> path for DAX filesystems, use the same direct_IO path for both DAX and
>>>>> direct_io iocbs, but use the flags to identify when we are in O_DIRECT
>>>>> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
>>>>> direct_IO path instead of DAX.
>>>>>
>>>>
>>>> Really? What are your thinking here?
>>>>
>>>> What about all the current users of O_DIRECT, you have just made them
>>>> 4 times slower and "less concurrent*" then "buffred io" users. Since
>>>> direct_IO path will queue an IO request and all.
>>>> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
>>>>
>>>> I hate it that you overload the semantics of a known and expected
>>>> O_DIRECT flag, for special pmem quirks. This is an incompatible
>>>> and unrelated overload of the semantics of O_DIRECT.
>>>
>>> I think it is the opposite situation, it us undoing the premature
>>> overloading of O_DIRECT that went in without performance numbers.
>>
>> We have tons of measurements. Is not hard to imagine the results though.
>> Specially the 1000 threads case
>>
>>> This implementation clarifies that dax_do_io() handles the lack of a
>>> page cache for buffered I/O and O_DIRECT behaves as it nominally would
>>> by sending an I/O to the driver.
>>
>>> It has the benefit of matching the
>>> error semantics of a typical block device where a buffered write could
>>> hit an error filling the page cache, but an O_DIRECT write potentially
>>> triggers the drive to remap the block.
>>>
>>
>> I fail to see how in writes the device error semantics regarding remapping of
>> blocks is any different between buffered and direct IO. As far as the block
>> device it is the same exact code path. All The big difference is higher in the
>> VFS.
>>
>> And ... So you are willing to sacrifice the 99% hotpath for the sake of the
>> 1% error path? and piggybacking on poor O_DIRECT.
>>
>> Again there are tons of O_DIRECT apps out there, why are you forcing them to
>> change if they want true pmem performance?
> 
> This isn't forcing them to change.  This is the path of least surprise
> as error semantics are identical to a typical block device.  Yes, an
> application can go faster by switching to the "buffered" / dax_do_io()
> path it can go even faster to switch to mmap() I/O and use DAX
> directly.  If we can later optimize the O_DIRECT path to bring it's
> performance more in line with dax_do_io(), great, but the
> implementation should be correct first and optimized later.
> 

Why does it need to be either or. Why not both?
And also I disagree: if you are correct and dax_do_io is bad and needs fixing,
then you have broken applications. Because in the current model:

read => -EIO, write-bufferd, sync()
gives you the same error semantics as: read => -EIO, write-direct-io

In fact this is what the delete, restore-from-backup model does today.
Who said it uses / must use direct IO? Actually I think it does not.

Two things I can think of which are better:
[1]
Why not go deeper into the dax io loops, and for any WRITE
failed page call bdev_rw_page() to let the pmem.c clear / relocate
the error page.

So reads return -EIO - is what you wanted no?
writes get a memory error and retry with bdev_rw_page() to let the bdev
relocate / clear the error - is what you wanted no?

In the partial page WRITE case on bad sectors. we can carefully read-modify-write
sector-by-sector and zero-out the bad-sectors that could not be read, what else?
(Or enhance the bdev_rw_page() API)

[2]
Only switch to slow O_DIRECT, on presence of errors like you wanted. But I still
hate that you overload error semantics with O_DIRECT which does not exist today
see above

Thanks
Boaz


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 17:44           ` Boaz Harrosh
@ 2016-05-02 18:10             ` Dan Williams
  2016-05-02 18:32               ` Boaz Harrosh
  0 siblings, 1 reply; 25+ messages in thread
From: Dan Williams @ 2016-05-02 18:10 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On Mon, May 2, 2016 at 10:44 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 05/02/2016 07:49 PM, Dan Williams wrote:
>> On Mon, May 2, 2016 at 9:22 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>>> On 05/02/2016 07:01 PM, Dan Williams wrote:
>>>> On Mon, May 2, 2016 at 8:41 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>>>>> On 04/29/2016 12:16 AM, Vishal Verma wrote:
>>>>>> All IO in a dax filesystem used to go through dax_do_io, which cannot
>>>>>> handle media errors, and thus cannot provide a recovery path that can
>>>>>> send a write through the driver to clear errors.
>>>>>>
>>>>>> Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
>>>>>> path for DAX filesystems, use the same direct_IO path for both DAX and
>>>>>> direct_io iocbs, but use the flags to identify when we are in O_DIRECT
>>>>>> mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
>>>>>> direct_IO path instead of DAX.
>>>>>>
>>>>>
>>>>> Really? What are your thinking here?
>>>>>
>>>>> What about all the current users of O_DIRECT, you have just made them
>>>>> 4 times slower and "less concurrent*" then "buffred io" users. Since
>>>>> direct_IO path will queue an IO request and all.
>>>>> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
>>>>>
>>>>> I hate it that you overload the semantics of a known and expected
>>>>> O_DIRECT flag, for special pmem quirks. This is an incompatible
>>>>> and unrelated overload of the semantics of O_DIRECT.
>>>>
>>>> I think it is the opposite situation, it us undoing the premature
>>>> overloading of O_DIRECT that went in without performance numbers.
>>>
>>> We have tons of measurements. Is not hard to imagine the results though.
>>> Specially the 1000 threads case
>>>
>>>> This implementation clarifies that dax_do_io() handles the lack of a
>>>> page cache for buffered I/O and O_DIRECT behaves as it nominally would
>>>> by sending an I/O to the driver.
>>>
>>>> It has the benefit of matching the
>>>> error semantics of a typical block device where a buffered write could
>>>> hit an error filling the page cache, but an O_DIRECT write potentially
>>>> triggers the drive to remap the block.
>>>>
>>>
>>> I fail to see how in writes the device error semantics regarding remapping of
>>> blocks is any different between buffered and direct IO. As far as the block
>>> device it is the same exact code path. All The big difference is higher in the
>>> VFS.
>>>
>>> And ... So you are willing to sacrifice the 99% hotpath for the sake of the
>>> 1% error path? and piggybacking on poor O_DIRECT.
>>>
>>> Again there are tons of O_DIRECT apps out there, why are you forcing them to
>>> change if they want true pmem performance?
>>
>> This isn't forcing them to change.  This is the path of least surprise
>> as error semantics are identical to a typical block device.  Yes, an
>> application can go faster by switching to the "buffered" / dax_do_io()
>> path it can go even faster to switch to mmap() I/O and use DAX
>> directly.  If we can later optimize the O_DIRECT path to bring it's
>> performance more in line with dax_do_io(), great, but the
>> implementation should be correct first and optimized later.
>>
>
> Why does it need to be either or. Why not both?
> And also I disagree if you are correct and dax_do_io is bad and needs fixing
> than you have broken applications. Because in current model:
>
> read => -EIO, write-bufferd, sync()
> gives you the same error semantics as: read => -EIO, write-direct-io
> In fact this is what the delete, restore from backup model does today.
> Who said it uses / must direct IO. Actually I think it does not.

The semantic I am talking about preserving is:

buffered / unaligned write of a bad sector => -EIO on reading into the
page cache

...and that the only guaranteed way to clear an error (assuming the
block device supports it) is an O_DIRECT write.

>
> Two things I can think of which are better:
> [1]
> Why not go deeper into the dax io loops, and for any WRITE
> failed page call bdev_rw_page() to let the pmem.c clear / relocate
> the error page.

Where do you get the rest of the data to complete a full page write?

> So reads return -EIO - is what you wanted no?

That's well understood.  What we are debating is the method to clear
errors / ask the storage device to remap bad blocks.

> writes get a memory error and retry with bdev_rw_page() to let the bdev
> relocate / clear the error - is what you wanted no?
>
> In the partial page WRITE case on bad sectors. we can carefully read-modify-write
> sector-by-sector and zero-out the bad-sectors that could not be read, what else?
> (Or enhance the bdev_rw_page() API)

See all the previous discussions on why the fallback path is
problematic to implement.

>
> [2]
> Only switch to slow O_DIRECT, on presence of errors like you wanted. But I still
> hate that you overload error semantics with O_DIRECT which does not exist today
> see above

I still think we're talking past each other on this point.  This patch
set is not overloading error semantics, it's fixing the error handling
problem that was introduced in this commit:

   d475c6346a38 dax,ext2: replace XIP read and write with DAX I/O

...where we started overloading O_DIRECT and dax_do_io() semantics.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 18:10             ` Dan Williams
@ 2016-05-02 18:32               ` Boaz Harrosh
  2016-05-02 18:48                 ` Dan Williams
  0 siblings, 1 reply; 25+ messages in thread
From: Boaz Harrosh @ 2016-05-02 18:32 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On 05/02/2016 09:10 PM, Dan Williams wrote:
<>
> 
> The semantic I am talking about preserving is:
> 
> buffered / unaligned write of a bad sector => -EIO on reading into the
> page cache
> 

What about aligned buffered write? like write 0-to-eof
This still broken? (and is what restore apps do)

> ...and that the only guaranteed way to clear an error (assuming the
> block device supports it) is an O_DIRECT write.
> 

Sure, fixing dax_do_io will guarantee that.

<>
> I still think we're talking past each other on this point.  

Yes we are!

> This patch
> set is not overloading error semantics, it's fixing the error handling
> problem that was introduced in this commit:
> 
>    d475c6346a38 dax,ext2: replace XIP read and write with DAX I/O
> 
> ...where we started overloading O_DIRECT and dax_do_io() semantics.
> 

But above does not fix them does it? it just completely NULLs DAX for
O_DIRECT which is a great pity, why did we do all this work in the first
place.

And then it keeps broken the aligned buffered writes, which are still
broken after this set.

I have by now read the v2 patches. And I think you guys did not yet try
the proper fix for dax_do_io. I think you need to go deeper into the loops
and selectively call bdev_* when error on a specific page copy. No need to
go through direct_IO path at all.
Do you need that I send you a patch to demonstrate what I mean?

But yes I feel too that "we're talking past each other". I did want
to come to LSF and talk to you, but was not invited. Should I call you?

Thanks
Boaz

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 18:32               ` Boaz Harrosh
@ 2016-05-02 18:48                 ` Dan Williams
  2016-05-02 19:22                   ` Boaz Harrosh
  0 siblings, 1 reply; 25+ messages in thread
From: Dan Williams @ 2016-05-02 18:48 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On Mon, May 2, 2016 at 11:32 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 05/02/2016 09:10 PM, Dan Williams wrote:
> <>
>>
>> The semantic I am talking about preserving is:
>>
>> buffered / unaligned write of a bad sector => -EIO on reading into the
>> page cache
>>
>
> What about aligned buffered write? like write 0-to-eof
> This still broken? (and is what restore apps do)
>
>> ...and that the only guaranteed way to clear an error (assuming the
>> block device supports it) is an O_DIRECT write.
>>
>
> Sure fixing dax_do_io will guaranty that.
>
> <>
>> I still think we're talking past each other on this point.
>
> Yes we are!
>
>> This patch
>> set is not overloading error semantics, it's fixing the error handling
>> problem that was introduced in this commit:
>>
>>    d475c6346a38 dax,ext2: replace XIP read and write with DAX I/O
>>
>> ...where we started overloading O_DIRECT and dax_do_io() semantics.
>>
>
> But above does not fix them does it? it just completely NULLs DAX for
> O_DIRECT which is a great pity, why did we do all this work in the first
> place.

This is hyperbole.  We don't impact "all the work" we did for the mmap
I/O case and the acceleration of the non-direct-I/O case.

> And then it keeps broken the aligned buffered writes, which are still
> broken after this set.

...identical to the current situation with a traditional disk.

> I have by now read the v2 patches. And I think you guys did not yet try
> the proper fix for dax_do_io. I think you need to go deeper into the loops
> and selectively call bdev_* when error on a specific page copy. No need to
> go through direct_IO path at all.

We still reach a point where the minimum granularity of
bdev_direct_access() is larger than a sector, so you end up still
needing to have the application understand how to send a properly
aligned I/O.  The semantics of how to send a properly aligned
direct-I/O are already well understood, so we simply reuse that path.

> Do you need that I send you a patch to demonstrate what I mean?

I remain skeptical of what you are proposing, but yes, a patch has a
better chance to move the discussion forward.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 18:48                 ` Dan Williams
@ 2016-05-02 19:22                   ` Boaz Harrosh
  0 siblings, 0 replies; 25+ messages in thread
From: Boaz Harrosh @ 2016-05-02 19:22 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vishal Verma, linux-nvdimm@lists.01.org, linux-block, Jan Kara,
	Matthew Wilcox, Dave Chinner, linux-kernel@vger.kernel.org,
	XFS Developers, Jens Axboe, Linux MM, Al Viro, Christoph Hellwig,
	linux-fsdevel, Andrew Morton, linux-ext4

On 05/02/2016 09:48 PM, Dan Williams wrote:
<>
>> And then it keeps broken the aligned buffered writes, which are still
>> broken after this set.
> 
> ...identical to the current situation with a traditional disk.
> 

Not true!! please see what I wrote "aligned buffered writes"
If there are no reads involved then there are no errors returned
to application.

>> I have by now read the v2 patches. And I think you guys did not yet try
>> the proper fix for dax_do_io. I think you need to go deeper into the loops
>> and selectively call bdev_* when error on a specific page copy. No need to
>> go through direct_IO path at all.
> 
> We still reach a point where the minimum granularity of
> bdev_direct_access() is larger than a sector, so you end up still
> needing to have the application understand how to send a properly
> aligned I/O.  The semantics of how to send a properly aligned
> direct-I/O are already well understood, so we simply reuse that path.
> 

You are making a mountain out of a molehill. The simple copy of a file
from start (offset ZERO) to end-of-file, which is the most common usage
on earth, is perfectly aligned, does not need any O_DIRECT, and is what is
used everywhere.

>> Do you need that I send you a patch to demonstrate what I mean?
> 
> I remain skeptical of what you are proposing, but yes, a patch has a
> better chance to move the discussion forward.
> 

Sigh! OK
Boaz


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-02 15:41   ` Boaz Harrosh
  2016-05-02 15:51     ` Vishal Verma
  2016-05-02 16:01     ` Dan Williams
@ 2016-05-05 14:24     ` Christoph Hellwig
  2016-05-05 15:15       ` Dan Williams
  2016-05-05 21:39       ` Verma, Vishal L
  2 siblings, 2 replies; 25+ messages in thread
From: Christoph Hellwig @ 2016-05-05 14:24 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Vishal Verma, linux-nvdimm, Jens Axboe, Jan Kara, Andrew Morton,
	Matthew Wilcox, Dave Chinner, linux-kernel, xfs, linux-block,
	linux-mm, Al Viro, Christoph Hellwig, linux-fsdevel, linux-ext4

On Mon, May 02, 2016 at 06:41:51PM +0300, Boaz Harrosh wrote:
> > All IO in a dax filesystem used to go through dax_do_io, which cannot
> > handle media errors, and thus cannot provide a recovery path that can
> > send a write through the driver to clear errors.
> > 
> > Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
> > path for DAX filesystems, use the same direct_IO path for both DAX and
> > direct_io iocbs, but use the flags to identify when we are in O_DIRECT
> > mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
> > direct_IO path instead of DAX.
> > 
> 
> Really? What are your thinking here?
> 
> What about all the current users of O_DIRECT, you have just made them
> 4 times slower and "less concurrent*" then "buffred io" users. Since
> direct_IO path will queue an IO request and all.
> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
> 
> I hate it that you overload the semantics of a known and expected
> O_DIRECT flag, for special pmem quirks. This is an incompatible
> and unrelated overload of the semantics of O_DIRECT.

Agreed - making O_DIRECT less direct than not having it is plain stupid,
and I somehow missed this initially.

This whole DAX story turns into a major nightmare, and I fear all our
hodge podge tweaks to the semantics aren't helping it.

It seems like we simply need an explicit O_DAX for the read/write
bypass if we can't sort out the semantics (error, writer synchronization),
just as we need a special flag for MMAP.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 14:24     ` Christoph Hellwig
@ 2016-05-05 15:15       ` Dan Williams
  2016-05-05 15:22         ` Christoph Hellwig
  2016-05-05 21:42         ` Verma, Vishal L
  2016-05-05 21:39       ` Verma, Vishal L
  1 sibling, 2 replies; 25+ messages in thread
From: Dan Williams @ 2016-05-05 15:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Boaz Harrosh, linux-block, linux-ext4, Jan Kara, Matthew Wilcox,
	Dave Chinner, linux-kernel@vger.kernel.org, XFS Developers,
	Jens Axboe, Linux MM, Al Viro, linux-nvdimm, linux-fsdevel,
	Andrew Morton

On Thu, May 5, 2016 at 7:24 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Mon, May 02, 2016 at 06:41:51PM +0300, Boaz Harrosh wrote:
>> > All IO in a dax filesystem used to go through dax_do_io, which cannot
>> > handle media errors, and thus cannot provide a recovery path that can
>> > send a write through the driver to clear errors.
>> >
>> > Add a new iocb flag for DAX, and set it only for DAX mounts. In the IO
>> > path for DAX filesystems, use the same direct_IO path for both DAX and
>> > direct_io iocbs, but use the flags to identify when we are in O_DIRECT
>> > mode vs non O_DIRECT with DAX, and for O_DIRECT, use the conventional
>> > direct_IO path instead of DAX.
>> >
>>
>> Really? What are your thinking here?
>>
>> What about all the current users of O_DIRECT, you have just made them
>> 4 times slower and "less concurrent*" then "buffred io" users. Since
>> direct_IO path will queue an IO request and all.
>> (And if it is not so slow then why do we need dax_do_io at all? [Rhetorical])
>>
>> I hate it that you overload the semantics of a known and expected
>> O_DIRECT flag, for special pmem quirks. This is an incompatible
>> and unrelated overload of the semantics of O_DIRECT.
>
> Agreed - makig O_DIRECT less direct than not having it is plain stupid,
> and I somehow missed this initially.

Of course I disagree because like Dave argues in the msync case we
should do the correct thing first and make it fast later, but also
like Dave this arguing in circles is getting tiresome.

> This whole DAX story turns into a major nightmare, and I fear all our
> hodge podge tweaks to the semantics aren't helping it.
>
> It seems like we simply need an explicit O_DAX for the read/write
> bypass if can't sort out the semantics (error, writer synchronization)
> just as we need a special flag for MMAP.

I don't see how O_DAX makes this situation better if the goal is to
accelerate unmodified applications...

Vishal, at least the "delete a file with a badblock" model will still
work for implicitly clearing errors with your changes to stop doing
block clearing in fs/dax.c.  This combined with a new -EBADBLOCK (as
Dave suggests) and explicit logging of I/Os that fail for this reason
at least gives a chance to communicate errors in files to suitably
aware applications / environments.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 15:15       ` Dan Williams
@ 2016-05-05 15:22         ` Christoph Hellwig
  2016-05-05 16:24           ` Dan Williams
  2016-05-05 21:45           ` Verma, Vishal L
  2016-05-05 21:42         ` Verma, Vishal L
  1 sibling, 2 replies; 25+ messages in thread
From: Christoph Hellwig @ 2016-05-05 15:22 UTC (permalink / raw)
  To: Dan Williams
  Cc: Christoph Hellwig, Boaz Harrosh, linux-block, linux-ext4,
	Jan Kara, Matthew Wilcox, Dave Chinner,
	linux-kernel@vger.kernel.org, XFS Developers, Jens Axboe,
	Linux MM, Al Viro, linux-nvdimm, linux-fsdevel, Andrew Morton

On Thu, May 05, 2016 at 08:15:32AM -0700, Dan Williams wrote:
> > Agreed - makig O_DIRECT less direct than not having it is plain stupid,
> > and I somehow missed this initially.
> 
> Of course I disagree because like Dave argues in the msync case we
> should do the correct thing first and make it fast later, but also
> like Dave this arguing in circles is getting tiresome.

We should do the right thing first, and make it fast later.  But this
proposal is not getting it right - it still does not handle errors
for the fast path, but magically makes it work for direct I/O by
in general using a less optimal path for O_DIRECT.  It's getting the
worst of all choices.

As far as I can tell the only sensible option is to:

 - always try dax-like I/O first
 - have a custom get_user_pages + rw_bytes fallback that handles bad blocks
   when hitting EIO

And then we need to sort out the concurrent write synchronization.
Again there I think we absolutely have to obey Posix for the !O_DIRECT
case and can avoid it for O_DIRECT, similar to the existing non-DAX
semantics.  If we want any special additional semantics we _will_ need
a special O_DAX flag.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 15:22         ` Christoph Hellwig
@ 2016-05-05 16:24           ` Dan Williams
  2016-05-05 21:45           ` Verma, Vishal L
  1 sibling, 0 replies; 25+ messages in thread
From: Dan Williams @ 2016-05-05 16:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Boaz Harrosh, linux-block, linux-ext4, Jan Kara, Matthew Wilcox,
	Dave Chinner, linux-kernel@vger.kernel.org, XFS Developers,
	Jens Axboe, Linux MM, Al Viro, linux-nvdimm, linux-fsdevel,
	Andrew Morton

On Thu, May 5, 2016 at 8:22 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, May 05, 2016 at 08:15:32AM -0700, Dan Williams wrote:
>> > Agreed - makig O_DIRECT less direct than not having it is plain stupid,
>> > and I somehow missed this initially.
>>
>> Of course I disagree because like Dave argues in the msync case we
>> should do the correct thing first and make it fast later, but also
>> like Dave this arguing in circles is getting tiresome.
>
> We should do the right thing first, and make it fast later.  But this
> proposal is not getting it right - it still does not handle errors
> for the fast path, but magically makes it work for direct I/O by
> in general using a less optional path for O_DIRECT.  It's getting the
> worst of all choices.
>
> As far as I can tell the only sensible option is to:
>
>  - always try dax-like I/O first
>  - have a custom get_user_pages + rw_bytes fallback handles bad blocks
>    when hitting EIO

If you're on board with more special fallbacks for dax-capable block
devices that indeed opens up the thinking.  The O_DIRECT approach was
meant to keep the error clearing model close to the traditional block
device case, but yes that does constrain the implementation in
sub-optimal ways.

However, we still have the alignment problem in the rw_bytes case, how
do we communicate to the application that only writes with a certain
size/alignment will clear errors?  That forced alignment assumption
was the other appeal of O_DIRECT.  Perhaps we can at least start with
hole punching and block reallocation as the error clearing method
while we think more about the write-to-clear case?

> And then we need to sort out the concurrent write synchronization.
> Again there I think we absolutely have to obey Posix for the !O_DIRECT
> case and can avoid it for O_DIRECT, similar to the existing non-DAX
> semantics.  If we want any special additional semantics we _will_ need
> a special O_DAX flag.

Ok, makes sense.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 15:22         ` Christoph Hellwig
  2016-05-05 16:24           ` Dan Williams
@ 2016-05-05 21:45           ` Verma, Vishal L
  2016-05-08  9:01             ` hch
  1 sibling, 1 reply; 25+ messages in thread
From: Verma, Vishal L @ 2016-05-05 21:45 UTC (permalink / raw)
  To: Williams, Dan J, hch@infradead.org
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	xfs@oss.sgi.com, linux-nvdimm@ml01.01.org, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, axboe@fb.com, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	david@fromorbit.com, jack@suse.cz, matthew@wil.cx

On Thu, 2016-05-05 at 08:22 -0700, Christoph Hellwig wrote:
> On Thu, May 05, 2016 at 08:15:32AM -0700, Dan Williams wrote:
> >
> > >
> > > Agreed - makig O_DIRECT less direct than not having it is plain
> > > stupid,
> > > and I somehow missed this initially.
> > Of course I disagree because like Dave argues in the msync case we
> > should do the correct thing first and make it fast later, but also
> > like Dave this arguing in circles is getting tiresome.
> We should do the right thing first, and make it fast later.  But this
> proposal is not getting it right - it still does not handle errors
> for the fast path, but magically makes it work for direct I/O by
> in general using a less optional path for O_DIRECT.  It's getting the
> worst of all choices.
>
> As far as I can tell the only sensible option is to:
>
>  - always try dax-like I/O first
>  - have a custom get_user_pages + rw_bytes fallback handles bad blocks
>    when hitting EIO

I'm not sure I completely understand how this will work? Can you explain
a bit? Would we have to export rw_bytes up to layers above the pmem
driver? Where does get_user_pages come in?

>
> And then we need to sort out the concurrent write synchronization.
> Again there I think we absolutely have to obey Posix for the !O_DIRECT
> case and can avoid it for O_DIRECT, similar to the existing non-DAX
> semantics.  If we want any special additional semantics we _will_ need
> a special O_DAX flag.
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 21:45           ` Verma, Vishal L
@ 2016-05-08  9:01             ` hch
  2016-05-08 18:42               ` Verma, Vishal L
  0 siblings, 1 reply; 25+ messages in thread
From: hch @ 2016-05-08  9:01 UTC (permalink / raw)
  To: Verma, Vishal L
  Cc: Williams, Dan J, hch@infradead.org, linux-kernel@vger.kernel.org,
	linux-block@vger.kernel.org, xfs@oss.sgi.com,
	linux-nvdimm@ml01.01.org, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, axboe@fb.com, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	david@fromorbit.com, jack@suse.cz, matthew@wil.cx

On Thu, May 05, 2016 at 09:45:07PM +0000, Verma, Vishal L wrote:
> I'm not sure I completely understand how this will work? Can you explain
> a bit? Would we have to export rw_bytes up to layers above the pmem
> driver? Where does get_user_pages come in?

A DAX filesystem can directly use the nvdimm layer the same way btt
does; what's the problem?

Re get_user_pages my idea was to simply use that to lock down the user
pages so that we can call rw_bytes on it.  How else would you do it?  Do
a kmalloc, copy_from_user and then another memcpy?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-08  9:01             ` hch
@ 2016-05-08 18:42               ` Verma, Vishal L
  0 siblings, 0 replies; 25+ messages in thread
From: Verma, Vishal L @ 2016-05-08 18:42 UTC (permalink / raw)
  To: hch@infradead.org
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	xfs@oss.sgi.com, linux-nvdimm@ml01.01.org, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, Williams, Dan J, axboe@fb.com,
	akpm@linux-foundation.org, linux-fsdevel@vger.kernel.org,
	linux-ext4@vger.kernel.org, david@fromorbit.com, jack@suse.cz,
	matthew@wil.cx

On Sun, 2016-05-08 at 02:01 -0700, hch@infradead.org wrote:
> On Thu, May 05, 2016 at 09:45:07PM +0000, Verma, Vishal L wrote:
> >
> > I'm not sure I completely understand how this will work? Can you
> > explain
> > a bit? Would we have to export rw_bytes up to layers above the pmem
> > driver? Where does get_user_pages come in?
> A DAX filesystem can directly use the nvdimm layer the same way btt
> doe,s what's the problem?

The BTT does rw_bytes through an internal-to-libnvdimm mechanism, but
rw_bytes isn't exported to the filesystem, currently.. To do this we'd
have to either add an rw_bytes to block device operations...or
something.

Another thing is rw_bytes currently doesn't do error clearing either.
We store badblocks at sector granularity, and like Dan said earlier,
that hides the clear_error alignment requirements and upper layers
don't have to be aware of it. To make rw_bytes clear sub-sector errors,
we'd have to change the granularity of bad-blocks, and make upper
layers aware of the clearing alignment requirements.

Using a block-write semantic for clearing hides all this away.

>
> Re get_user_pages my idea was to simply use that to lock down the
> user
> pages so that we can call rw_bytes on it.  How else would you do
> it?  Do
> a kmalloc, copy_from_user and then another memcpy?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 15:15       ` Dan Williams
  2016-05-05 15:22         ` Christoph Hellwig
@ 2016-05-05 21:42         ` Verma, Vishal L
  1 sibling, 0 replies; 25+ messages in thread
From: Verma, Vishal L @ 2016-05-05 21:42 UTC (permalink / raw)
  To: Williams, Dan J, hch@infradead.org
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	xfs@oss.sgi.com, linux-nvdimm@ml01.01.org, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, axboe@fb.com, akpm@linux-foundation.org,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	david@fromorbit.com, jack@suse.cz, matthew@wil.cx

On Thu, 2016-05-05 at 08:15 -0700, Dan Williams wrote:
> On Thu, May 5, 2016 at 7:24 AM, Christoph Hellwig <hch@infradead.org>
> wrote:
> >
> > On Mon, May 02, 2016 at 06:41:51PM +0300, Boaz Harrosh wrote:
> > >
> > > >
> > > > All IO in a dax filesystem used to go through dax_do_io, which
> > > > cannot
> > > > handle media errors, and thus cannot provide a recovery path
> > > > that can
> > > > send a write through the driver to clear errors.
> > > >
> > > > Add a new iocb flag for DAX, and set it only for DAX mounts. In
> > > > the IO
> > > > path for DAX filesystems, use the same direct_IO path for both
> > > > DAX and
> > > > direct_io iocbs, but use the flags to identify when we are in
> > > > O_DIRECT
> > > > mode vs non O_DIRECT with DAX, and for O_DIRECT, use the
> > > > conventional
> > > > direct_IO path instead of DAX.
> > > >
> > > Really? What are your thinking here?
> > >
> > > What about all the current users of O_DIRECT, you have just made
> > > them
> > > 4 times slower and "less concurrent*" then "buffred io" users.
> > > Since
> > > direct_IO path will queue an IO request and all.
> > > (And if it is not so slow then why do we need dax_do_io at all?
> > > [Rhetorical])
> > >
> > > I hate it that you overload the semantics of a known and expected
> > > O_DIRECT flag, for special pmem quirks. This is an incompatible
> > > and unrelated overload of the semantics of O_DIRECT.
> > Agreed - makig O_DIRECT less direct than not having it is plain
> > stupid,
> > and I somehow missed this initially.
> Of course I disagree because like Dave argues in the msync case we
> should do the correct thing first and make it fast later, but also
> like Dave this arguing in circles is getting tiresome.
>
> >
> > This whole DAX story turns into a major nightmare, and I fear all
> > our
> > hodge podge tweaks to the semantics aren't helping it.
> >
> > It seems like we simply need an explicit O_DAX for the read/write
> > bypass if can't sort out the semantics (error, writer
> > synchronization)
> > just as we need a special flag for MMAP.
> I don't see how O_DAX makes this situation better if the goal is to
> accelerate unmodified applications...
>
> Vishal, at least the "delete a file with a badblock" model will still
> work for implicitly clearing errors with your changes to stop doing
> block clearing in fs/dax.c.  This combined with a new -EBADBLOCK (as
> Dave suggests) and explicit logging of I/Os that fail for this reason
> at least gives a chance to communicate errors in files to suitably
> aware applications / environments.

Agreed - I'll send out a series that has just the zeroing changes, and
drop the dax_io fallback/O_DIRECT tweak for now while we figure out the
right thing to do. That should get us to a place where we still have dax
in the presence of errors, and have _a_ path for recovery.

> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 14:24     ` Christoph Hellwig
  2016-05-05 15:15       ` Dan Williams
@ 2016-05-05 21:39       ` Verma, Vishal L
  2016-05-08  9:01         ` hch
  1 sibling, 1 reply; 25+ messages in thread
From: Verma, Vishal L @ 2016-05-05 21:39 UTC (permalink / raw)
  To: hch@infradead.org, boaz@plexistor.com
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	linux-nvdimm@ml01.01.org, xfs@oss.sgi.com, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, akpm@linux-foundation.org, axboe@fb.com,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	david@fromorbit.com, jack@suse.cz, matthew@wil.cx

On Thu, 2016-05-05 at 07:24 -0700, Christoph Hellwig wrote:
> On Mon, May 02, 2016 at 06:41:51PM +0300, Boaz Harrosh wrote:
> >
> > >
> > > All IO in a dax filesystem used to go through dax_do_io, which
> > > cannot
> > > handle media errors, and thus cannot provide a recovery path that
> > > can
> > > send a write through the driver to clear errors.
> > >
> > > Add a new iocb flag for DAX, and set it only for DAX mounts. In
> > > the IO
> > > path for DAX filesystems, use the same direct_IO path for both DAX
> > > and
> > > direct_io iocbs, but use the flags to identify when we are in
> > > O_DIRECT
> > > mode vs non O_DIRECT with DAX, and for O_DIRECT, use the
> > > conventional
> > > direct_IO path instead of DAX.
> > >
> > Really? What are your thinking here?
> >
> > What about all the current users of O_DIRECT, you have just made
> > them
> > 4 times slower and "less concurrent*" then "buffred io" users. Since
> > direct_IO path will queue an IO request and all.
> > (And if it is not so slow then why do we need dax_do_io at all?
> > [Rhetorical])
> >
> > I hate it that you overload the semantics of a known and expected
> > O_DIRECT flag, for special pmem quirks. This is an incompatible
> > and unrelated overload of the semantics of O_DIRECT.
> Agreed - makig O_DIRECT less direct than not having it is plain
> stupid,
> and I somehow missed this initially.

How is it any 'less direct'? All it does now is follow the blockdev
O_DIRECT path. There still isn't any page cache involved..

>
> This whole DAX story turns into a major nightmare, and I fear all our
> hodge podge tweaks to the semantics aren't helping it.
>
> It seems like we simply need an explicit O_DAX for the read/write
> bypass if can't sort out the semantics (error, writer synchronization)
> just as we need a special flag for MMAP..

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io
  2016-05-05 21:39       ` Verma, Vishal L
@ 2016-05-08  9:01         ` hch
  0 siblings, 0 replies; 25+ messages in thread
From: hch @ 2016-05-08  9:01 UTC (permalink / raw)
  To: Verma, Vishal L
  Cc: hch@infradead.org, boaz@plexistor.com,
	linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	linux-nvdimm@ml01.01.org, xfs@oss.sgi.com, linux-mm@kvack.org,
	viro@zeniv.linux.org.uk, akpm@linux-foundation.org, axboe@fb.com,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	david@fromorbit.com, jack@suse.cz, matthew@wil.cx

On Thu, May 05, 2016 at 09:39:14PM +0000, Verma, Vishal L wrote:
> How is it any 'less direct'? All it does now is follow the blockdev
> O_DIRECT path. There still isn't any page cache involved..

It's still more overhead than the plain DAX I/O path.

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-05-08 18:42 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1461878218-3844-1-git-send-email-vishal.l.verma@intel.com>
2016-04-28 21:16 ` [PATCH v4 5/7] fs: prioritize and separate direct_io from dax_io Vishal Verma
2016-05-02 14:56   ` Christoph Hellwig
2016-05-02 15:45     ` Vishal Verma
2016-05-02 15:41   ` Boaz Harrosh
2016-05-02 15:51     ` Vishal Verma
2016-05-02 16:03       ` Boaz Harrosh
2016-05-02 18:52         ` Verma, Vishal L
2016-05-02 16:01     ` Dan Williams
2016-05-02 16:22       ` Boaz Harrosh
2016-05-02 16:49         ` Dan Williams
2016-05-02 17:44           ` Boaz Harrosh
2016-05-02 18:10             ` Dan Williams
2016-05-02 18:32               ` Boaz Harrosh
2016-05-02 18:48                 ` Dan Williams
2016-05-02 19:22                   ` Boaz Harrosh
2016-05-05 14:24     ` Christoph Hellwig
2016-05-05 15:15       ` Dan Williams
2016-05-05 15:22         ` Christoph Hellwig
2016-05-05 16:24           ` Dan Williams
2016-05-05 21:45           ` Verma, Vishal L
2016-05-08  9:01             ` hch
2016-05-08 18:42               ` Verma, Vishal L
2016-05-05 21:42         ` Verma, Vishal L
2016-05-05 21:39       ` Verma, Vishal L
2016-05-08  9:01         ` hch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).