* [PATCH 1/8] xfs: don't pass ioflags around in the ioctl path
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-22 15:27 ` [PATCH 2/8] xfs: kill ioflags Christoph Hellwig
` (7 subsequent siblings)
8 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
Instead check the file pointer for the invisble I/O flag directly, and
use the chance to drop redundant arguments from the xfs_ioc_space
prototype.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_ioctl.c | 22 ++++++++--------------
fs/xfs/xfs_ioctl.h | 3 ---
fs/xfs/xfs_ioctl32.c | 6 +-----
3 files changed, 9 insertions(+), 22 deletions(-)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index dbca737..6ab5a24 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
int
xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
struct file *filp,
- int ioflags,
unsigned int cmd,
xfs_flock64_t *bf)
{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
if (filp->f_flags & O_DSYNC)
flags |= XFS_PREALLOC_SYNC;
- if (ioflags & XFS_IO_INVIS)
+ if (filp->f_mode & FMODE_NOCMTIME)
flags |= XFS_PREALLOC_INVISIBLE;
error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
STATIC int
xfs_ioc_getbmap(
- struct xfs_inode *ip,
- int ioflags,
+ struct file *file,
unsigned int cmd,
void __user *arg)
{
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
return -EINVAL;
bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (ioflags & XFS_IO_INVIS)
+ if (file->f_mode & FMODE_NOCMTIME)
bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
- error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+ error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
(__force struct getbmap *)arg+1);
if (error)
return error;
@@ -1619,12 +1617,8 @@ xfs_file_ioctl(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
- int ioflags = 0;
int error;
- if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
trace_xfs_file_ioctl(ip);
switch (cmd) {
@@ -1643,7 +1637,7 @@ xfs_file_ioctl(
if (copy_from_user(&bf, arg, sizeof(bf)))
return -EFAULT;
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ return xfs_ioc_space(filp, cmd, &bf);
}
case XFS_IOC_DIOINFO: {
struct dioattr da;
@@ -1702,7 +1696,7 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+ return xfs_ioc_getbmap(filp, cmd, arg);
case XFS_IOC_GETBMAPX:
return xfs_ioc_getbmapx(ip, arg);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 77c02c7..8b52881 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -20,10 +20,7 @@
extern int
xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
struct file *filp,
- int ioflags,
unsigned int cmd,
xfs_flock64_t *bf);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1a05d8a..321f577 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
- int ioflags = 0;
int error;
- if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
trace_xfs_file_compat_ioctl(ip);
switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
if (xfs_compat_flock64_copyin(&bf, arg))
return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ return xfs_ioc_space(filp, cmd, &bf);
}
case XFS_IOC_FSGEOMETRY_V1_32:
return xfs_compat_ioc_fsgeometry_v1(mp, arg);
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 2/8] xfs: kill ioflags
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2016-06-22 15:27 ` [PATCH 1/8] xfs: don't pass ioflags around in the ioctl path Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-22 15:27 ` [PATCH 3/8] xfs: remove s_maxbytes enforcement in xfs_file_read_iter Christoph Hellwig
` (6 subsequent siblings)
8 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
Now that we have the direct I/O kiocb flag there is no real need to sample
the value inside of XFS, and the invis flag was always just partially used
and isn't worth keeping this infrastructure around for. This also splits
the read tracepoint into buffered vs direct as we've done for writes a long
time ago.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_file.c | 26 +++++++++-----------------
fs/xfs/xfs_inode.h | 10 ----------
fs/xfs/xfs_trace.h | 19 ++++++++-----------
3 files changed, 17 insertions(+), 38 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 713991c..b32e6b0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -249,18 +249,12 @@ xfs_file_read_iter(
struct xfs_mount *mp = ip->i_mount;
size_t size = iov_iter_count(to);
ssize_t ret = 0;
- int ioflags = 0;
xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
XFS_STATS_INC(mp, xs_read_calls);
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
- ioflags |= XFS_IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
- if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
+ if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -293,7 +287,7 @@ xfs_file_read_iter(
* serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
@@ -327,7 +321,10 @@ xfs_file_read_iter(
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- trace_xfs_file_read(ip, size, pos, ioflags);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ trace_xfs_file_direct_read(ip, size, pos);
+ else
+ trace_xfs_file_buffered_read(ip, size, pos);
ret = generic_file_read_iter(iocb, to);
if (ret > 0)
@@ -346,18 +343,14 @@ xfs_file_splice_read(
unsigned int flags)
{
struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- int ioflags = 0;
ssize_t ret;
XFS_STATS_INC(ip->i_mount, xs_read_calls);
- if (infilp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+ trace_xfs_file_splice_read(ip, count, *ppos);
/*
* DAX inodes cannot ues the page cache for splice, so we have to push
@@ -620,7 +613,7 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
data = *from;
ret = mapping->a_ops->direct_IO(iocb, &data);
@@ -670,8 +663,7 @@ xfs_file_buffered_aio_write(
current->backing_dev_info = inode_to_bdi(inode);
write_retry:
- trace_xfs_file_buffered_write(ip, iov_iter_count(from),
- iocb->ki_pos, 0);
+ trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c19d3d..8eb78ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -473,14 +473,4 @@ do { \
extern struct kmem_zone *xfs_inode_zone;
-/*
- * Flags for read/write calls
- */
-#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
-#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
- { XFS_IO_ISDIRECT, "DIRECT" }, \
- { XFS_IO_INVIS, "INVIS"}
-
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6787a9f..2504f94 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1135,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
- TP_ARGS(ip, count, offset, flags),
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
+ TP_ARGS(ip, count, offset),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, flags)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1151,23 +1150,21 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
- "offset 0x%llx count 0x%zx ioflags %s",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
- __entry->count,
- __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+ __entry->count)
)
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
- TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
+ TP_ARGS(ip, count, offset))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 3/8] xfs: remove s_maxbytes enforcement in xfs_file_read_iter
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2016-06-22 15:27 ` [PATCH 1/8] xfs: don't pass ioflags around in the ioctl path Christoph Hellwig
2016-06-22 15:27 ` [PATCH 2/8] xfs: kill ioflags Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-22 15:27 ` [PATCH 4/8] xfs: split xfs_file_read_iter into buffered and direct I/O helpers Christoph Hellwig
` (5 subsequent siblings)
8 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
All the three low-level read implementations that we might call already
take care of not overflowing the maximum supported bytes, no need to
duplicate it here.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_file.c | 8 --------
1 file changed, 8 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b32e6b0..09a5a78 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -249,7 +249,6 @@ xfs_file_read_iter(
struct xfs_mount *mp = ip->i_mount;
size_t size = iov_iter_count(to);
ssize_t ret = 0;
- xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
XFS_STATS_INC(mp, xs_read_calls);
@@ -266,13 +265,6 @@ xfs_file_read_iter(
}
}
- n = mp->m_super->s_maxbytes - pos;
- if (n <= 0 || size == 0)
- return 0;
-
- if (n < size)
- size = n;
-
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 4/8] xfs: split xfs_file_read_iter into buffered and direct I/O helpers
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
` (2 preceding siblings ...)
2016-06-22 15:27 ` [PATCH 3/8] xfs: remove s_maxbytes enforcement in xfs_file_read_iter Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-22 15:27 ` [PATCH 5/8] xfs: stop using generic_file_read_iter for direct I/O Christoph Hellwig
` (4 subsequent siblings)
8 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
Similar to what we did on the write side a while ago.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_file.c | 83 ++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 57 insertions(+), 26 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 09a5a78..e584333 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -239,35 +239,33 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- size_t size = iov_iter_count(to);
+ size_t count = iov_iter_count(to);
+ struct xfs_buftarg *target;
ssize_t ret = 0;
- loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(mp, xs_read_calls);
+ trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
- if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ if (XFS_IS_REALTIME_INODE(ip))
+ target = ip->i_mount->m_rtdev_targp;
+ else
+ target = ip->i_mount->m_ddev_targp;
+
+ if (!IS_DAX(inode)) {
/* DIO must be aligned to device logical sector size */
- if ((pos | size) & target->bt_logical_sectormask) {
- if (pos == i_size_read(inode))
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+ if (iocb->ki_pos == i_size_read(inode))
return 0;
return -EINVAL;
}
}
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* Locking is a bit tricky here. If we take an exclusive lock for direct
* IO, we effectively serialise all new concurrent read IO to this file
@@ -279,7 +277,7 @@ xfs_file_read_iter(
* serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) {
+ if (mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
@@ -294,8 +292,8 @@ xfs_file_read_iter(
* flush and reduce the chances of repeated iolock cycles going
* forward.
*/
- if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait(mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
@@ -306,23 +304,56 @@ xfs_file_read_iter(
* we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2(mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
+ ret = generic_file_read_iter(iocb, to);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = generic_file_read_iter(iocb, to);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
+ ssize_t ret = 0;
+
+ XFS_STATS_INC(mp, xs_read_calls);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
if (iocb->ki_flags & IOCB_DIRECT)
- trace_xfs_file_direct_read(ip, size, pos);
+ ret = xfs_file_dio_aio_read(iocb, to);
else
- trace_xfs_file_buffered_read(ip, size, pos);
+ ret = xfs_file_buffered_aio_read(iocb, to);
- ret = generic_file_read_iter(iocb, to);
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -578,7 +609,7 @@ xfs_file_dio_aio_write(
end = iocb->ki_pos + count - 1;
/*
- * See xfs_file_read_iter() for why we do a full-file flush here.
+ * See xfs_file_dio_aio_read() for why we do a full-file flush here.
*/
if (mapping->nrpages) {
ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 5/8] xfs: stop using generic_file_read_iter for direct I/O
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
` (3 preceding siblings ...)
2016-06-22 15:27 ` [PATCH 4/8] xfs: split xfs_file_read_iter into buffered and direct I/O helpers Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-22 15:27 ` [PATCH 6/8] xfs: direct calls in the direct I/O path Christoph Hellwig
` (3 subsequent siblings)
8 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
XFS already implement it's own flushing of the pagecache because it
implements proper synchronization for direct I/O reads. This means
calling generic_file_read_iter for direct I/O is rather useless,
as it doesn't do much but updating the atime and iocb position for
us. This also gets rid of the buffered I/O fallback that isn't used
for XFS.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_file.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e584333..f761f49 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -246,12 +246,17 @@ xfs_file_dio_aio_read(
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
+ loff_t isize = i_size_read(inode);
size_t count = iov_iter_count(to);
+ struct iov_iter data;
struct xfs_buftarg *target;
ssize_t ret = 0;
trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ if (!count)
+ return 0; /* skip atime */
+
if (XFS_IS_REALTIME_INODE(ip))
target = ip->i_mount->m_rtdev_targp;
else
@@ -260,7 +265,7 @@ xfs_file_dio_aio_read(
if (!IS_DAX(inode)) {
/* DIO must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
- if (iocb->ki_pos == i_size_read(inode))
+ if (iocb->ki_pos == isize)
return 0;
return -EINVAL;
}
@@ -311,9 +316,15 @@ xfs_file_dio_aio_read(
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- ret = generic_file_read_iter(iocb, to);
+ data = *to;
+ ret = mapping->a_ops->direct_IO(iocb, &data);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ file_accessed(iocb->ki_filp);
return ret;
}
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 6/8] xfs: direct calls in the direct I/O path
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
` (4 preceding siblings ...)
2016-06-22 15:27 ` [PATCH 5/8] xfs: stop using generic_file_read_iter for direct I/O Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-22 15:27 ` [PATCH 7/8] xfs: split direct I/O and DAX path Christoph Hellwig
` (2 subsequent siblings)
8 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
We control both the callers and callees of ->direct_IO, so remove the
indirect calls.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_aops.c | 24 +++++-------------------
fs/xfs/xfs_aops.h | 3 +++
fs/xfs/xfs_file.c | 17 +++++++++++++++--
3 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 80714eb..b368277 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1303,7 +1303,7 @@ xfs_get_blocks_dax_fault(
* whereas if we have flags set we will always be called in task context
* (i.e. from a workqueue).
*/
-STATIC int
+int
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
@@ -1374,24 +1374,10 @@ xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- dio_iodone_t *endio = NULL;
- int flags = 0;
- struct block_device *bdev;
-
- if (iov_iter_rw(iter) == WRITE) {
- endio = xfs_end_io_direct_write;
- flags = DIO_ASYNC_EXTEND;
- }
-
- if (IS_DAX(inode)) {
- return dax_do_io(iocb, inode, iter,
- xfs_get_blocks_direct, endio, 0);
- }
-
- bdev = xfs_find_bdev_for_inode(inode);
- return __blockdev_direct_IO(iocb, inode, bdev, iter,
- xfs_get_blocks_direct, endio, NULL, flags);
+ /*
+ * We just need the method present so that open/fcntl allow direct I/O.
+ */
+ return -EINVAL;
}
STATIC sector_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 814aab7..bf2d9a1 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
+int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private);
+
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f761f49..24b267d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -317,7 +317,13 @@ xfs_file_dio_aio_read(
}
data = *to;
- ret = mapping->a_ops->direct_IO(iocb, &data);
+ if (IS_DAX(inode)) {
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+ NULL, 0);
+ } else {
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, NULL, NULL, 0);
+ }
if (ret > 0) {
iocb->ki_pos += ret;
iov_iter_advance(to, ret);
@@ -650,7 +656,14 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
data = *from;
- ret = mapping->a_ops->direct_IO(iocb, &data);
+ if (IS_DAX(inode)) {
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, 0);
+ } else {
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, xfs_end_io_direct_write,
+ NULL, DIO_ASYNC_EXTEND);
+ }
/* see generic_file_direct_write() for why this is necessary */
if (mapping->nrpages) {
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 7/8] xfs: split direct I/O and DAX path
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
` (5 preceding siblings ...)
2016-06-22 15:27 ` [PATCH 6/8] xfs: direct calls in the direct I/O path Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
[not found] ` <1466609236-23801-8-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2016-06-22 15:27 ` [PATCH 8/8] xfs: fix locking for DAX writes Christoph Hellwig
2016-06-23 23:24 ` xfs: untangle the direct I/O and DAX path, fix DAX locking Dave Chinner
8 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
So far the DAX code overloaded the direct I/O code path. There is very little
in common between the two, and untangling them allows to clean up both variants.
As a ѕide effect we also get separate trace points for both I/O types.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 139 ++++++++++++++++++++++++++++++++++++++++++-----------
fs/xfs/xfs_trace.h | 2 +
2 files changed, 112 insertions(+), 29 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 24b267d..0e74325 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -262,13 +262,11 @@ xfs_file_dio_aio_read(
else
target = ip->i_mount->m_ddev_targp;
- if (!IS_DAX(inode)) {
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
- if (iocb->ki_pos == isize)
- return 0;
- return -EINVAL;
- }
+ /* DIO must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+ if (iocb->ki_pos == isize)
+ return 0;
+ return -EINVAL;
}
/*
@@ -317,13 +315,37 @@ xfs_file_dio_aio_read(
}
data = *to;
- if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
- NULL, 0);
- } else {
- ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
- xfs_get_blocks_direct, NULL, NULL, 0);
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, NULL, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
}
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct iov_iter data = *to;
+ size_t count = iov_iter_count(to);
+ ssize_t ret = 0;
+
+ trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+ if (!count)
+ return 0; /* skip atime */
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
if (ret > 0) {
iocb->ki_pos += ret;
iov_iter_advance(to, ret);
@@ -356,7 +378,8 @@ xfs_file_read_iter(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
ssize_t ret = 0;
XFS_STATS_INC(mp, xs_read_calls);
@@ -364,7 +387,9 @@ xfs_file_read_iter(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_read(iocb, to);
+ else if (iocb->ki_flags & IOCB_DIRECT)
ret = xfs_file_dio_aio_read(iocb, to);
else
ret = xfs_file_buffered_aio_read(iocb, to);
@@ -586,8 +611,7 @@ xfs_file_dio_aio_write(
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if (!IS_DAX(inode) &&
- ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
@@ -656,14 +680,9 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
data = *from;
- if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
- xfs_end_io_direct_write, 0);
- } else {
- ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
- xfs_get_blocks_direct, xfs_end_io_direct_write,
- NULL, DIO_ASYNC_EXTEND);
- }
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, xfs_end_io_direct_write,
+ NULL, DIO_ASYNC_EXTEND);
/* see generic_file_direct_write() for why this is necessary */
if (mapping->nrpages) {
@@ -680,10 +699,70 @@ out:
xfs_rw_iunlock(ip, iolock);
/*
- * No fallback to buffered IO on errors for XFS. DAX can result in
- * partial writes, but direct IO will either complete fully or fail.
+ * No fallback to buffered IO on errors for XFS, direct IO will either
+ * complete fully or fail.
+ */
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ int unaligned_io = 0;
+ int iolock;
+ struct iov_iter data;
+
+ /* "unaligned" here means not aligned to a filesystem block */
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+ unaligned_io = 1;
+ iolock = XFS_IOLOCK_EXCL;
+ } else if (mapping->nrpages) {
+ iolock = XFS_IOLOCK_EXCL;
+ } else {
+ iolock = XFS_IOLOCK_SHARED;
+ }
+ xfs_rw_ilock(ip, iolock);
+
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out;
+
+ /*
+ * Yes, even DAX files can have page cache attached to them: A zeroed
+ * page is inserted into the pagecache when we have to serve a write
+ * fault on a hole. It should never be dirtied and can simply be
+ * dropped from the pagecache once we get real data for the page.
*/
- ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+ if (mapping->nrpages) {
+ ret = invalidate_inode_pages2(mapping);
+ WARN_ON_ONCE(ret);
+ }
+
+ if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+ data = *from;
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(from, ret);
+ }
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -765,7 +844,9 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_write(iocb, from);
+ else if (iocb->ki_flags & IOCB_DIRECT)
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2504f94..1451690 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name, \
TP_ARGS(ip, count, offset))
DEFINE_RW_EVENT(xfs_file_buffered_read);
DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
DECLARE_EVENT_CLASS(xfs_page_class,
--
2.1.4
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 8/8] xfs: fix locking for DAX writes
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
` (6 preceding siblings ...)
2016-06-22 15:27 ` [PATCH 7/8] xfs: split direct I/O and DAX path Christoph Hellwig
@ 2016-06-22 15:27 ` Christoph Hellwig
2016-06-23 14:22 ` Boaz Harrosh
2016-06-23 23:24 ` xfs: untangle the direct I/O and DAX path, fix DAX locking Dave Chinner
8 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-22 15:27 UTC (permalink / raw)
To: xfs-VZNHf3L845pBDgjK7y7TUQ
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w
So far DAX writes inherited the locking from direct I/O writes, but the direct
I/O model of using shared locks for writes is actually wrong for DAX. For
direct I/O we're out of any standards and don't have to provide the Posix
required exclusion between writers, but for DAX which gets transparently
enable on applications without any knowledge of it we can't simply drop the
requirement. Even worse this only happens for aligned writes and thus
doesn't show up for many typical use cases.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
fs/xfs/xfs_file.c | 20 +-------------------
1 file changed, 1 insertion(+), 19 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0e74325..413c9e0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -714,24 +714,11 @@ xfs_file_dax_write(
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
+ int iolock = XFS_IOLOCK_EXCL;
struct iov_iter data;
- /* "unaligned" here means not aligned to a filesystem block */
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
- unaligned_io = 1;
- iolock = XFS_IOLOCK_EXCL;
- } else if (mapping->nrpages) {
- iolock = XFS_IOLOCK_EXCL;
- } else {
- iolock = XFS_IOLOCK_SHARED;
- }
xfs_rw_ilock(ip, iolock);
-
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
@@ -747,11 +734,6 @@ xfs_file_dax_write(
WARN_ON_ONCE(ret);
}
- if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
- }
-
trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
data = *from;
--
2.1.4
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH 8/8] xfs: fix locking for DAX writes
2016-06-22 15:27 ` [PATCH 8/8] xfs: fix locking for DAX writes Christoph Hellwig
@ 2016-06-23 14:22 ` Boaz Harrosh
0 siblings, 0 replies; 23+ messages in thread
From: Boaz Harrosh @ 2016-06-23 14:22 UTC (permalink / raw)
To: Christoph Hellwig, xfs; +Cc: linux-fsdevel, linux-nvdimm
On 06/22/2016 06:27 PM, Christoph Hellwig wrote:
> So far DAX writes inherited the locking from direct I/O writes, but the direct
> I/O model of using shared locks for writes is actually wrong for DAX. For
> direct I/O we're out of any standards and don't have to provide the Posix
> required exclusion between writers, but for DAX which gets transparently
> enable on applications without any knowledge of it we can't simply drop the
> requirement. Even worse this only happens for aligned writes and thus
> doesn't show up for many typical use cases.
>
Hi Sir Christoph
You raise a very interesting point and I would please like to ask questions.
Is this a theoretical standards problem or a real applications problem that
you know of?
You say above: " Posix required exclusion between writers"
As I understand, what it means is that if two threads/processes A & B write to the
same offset-length, in parallel. then a consistent full version will hold
of either A or B, which ever comes last. But never a torn version of both.
Is this really POSIX. I mean I knew POSIX is silly but so much so?
What about NFS CEPH Luster and all these network shared stuff. Does POSIX
say "On a single Node?". (Trond been yelling about file locks for *any* kind
of synchronization for years.)
And even with the write-lock to serialize writers (Or i_mute in case of ext4)
I do not see how this serialization works, because in a cached environment a
write_back can start and crash while the second thread above starts his memcopy
and on disk we still get a torn version of the record that was half from
A half from B. (Or maybe I do not understand what your automicity means)
Is not a rant I would really like to know what application uses this
"single-writer" facility and how does it actually works for them? I honestly
don't see how it works. (And do they really check that they are only working
on a local file system?)
Sorry for my slowness please explain?
BTW: I think that all the patches except this one makes a lot of sense
because of all the hidden quirks of direct_IO code paths. Just for example the
difference between "aligned and none align writes" as you mentioned above.
My $0.017:
Who In the real world would actually break without this patch, which
is not already broken?
And why sacrifice the vast majority of good applications for the sake
of an already broken (theoretical?) applications.
Thank you
Boaz
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_file.c | 20 +-------------------
> 1 file changed, 1 insertion(+), 19 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 0e74325..413c9e0 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -714,24 +714,11 @@ xfs_file_dax_write(
> struct address_space *mapping = iocb->ki_filp->f_mapping;
> struct inode *inode = mapping->host;
> struct xfs_inode *ip = XFS_I(inode);
> - struct xfs_mount *mp = ip->i_mount;
> ssize_t ret = 0;
> - int unaligned_io = 0;
> - int iolock;
> + int iolock = XFS_IOLOCK_EXCL;
> struct iov_iter data;
>
> - /* "unaligned" here means not aligned to a filesystem block */
> - if ((iocb->ki_pos & mp->m_blockmask) ||
> - ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
> - unaligned_io = 1;
> - iolock = XFS_IOLOCK_EXCL;
> - } else if (mapping->nrpages) {
> - iolock = XFS_IOLOCK_EXCL;
> - } else {
> - iolock = XFS_IOLOCK_SHARED;
> - }
> xfs_rw_ilock(ip, iolock);
> -
> ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> if (ret)
> goto out;
> @@ -747,11 +734,6 @@ xfs_file_dax_write(
> WARN_ON_ONCE(ret);
> }
>
> - if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
> - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
> - iolock = XFS_IOLOCK_SHARED;
> - }
> -
> trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
>
> data = *from;
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
[not found] ` <1466609236-23801-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
` (7 preceding siblings ...)
2016-06-22 15:27 ` [PATCH 8/8] xfs: fix locking for DAX writes Christoph Hellwig
@ 2016-06-23 23:24 ` Dave Chinner
2016-06-24 1:14 ` Dan Williams
2016-06-24 7:26 ` Christoph Hellwig
8 siblings, 2 replies; 23+ messages in thread
From: Dave Chinner @ 2016-06-23 23:24 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w, xfs-VZNHf3L845pBDgjK7y7TUQ
On Wed, Jun 22, 2016 at 05:27:08PM +0200, Christoph Hellwig wrote:
> The last patch is what started the series: XFS currently uses the
> direct I/O locking strategy for DAX because DAX was overloaded onto
> the direct I/O path. For XFS this means that we only take a shared
> inode lock instead of the normal exclusive one for writes IFF they
> are properly aligned. While this is fine for O_DIRECT which requires
> explicit opt-in from the application it's not fine for DAX where we'll
> suddenly lose expected and required synchronization of the file system
> happens to use DAX undeneath.
Except we did that *intentionally* - by definition there is no
cache to bypass with DAX and so all IO is "direct". That, combined
with the fact that all Linux filesystems except XFS break the POSIX
exclusive writer rule you are quoting to begin with, it seemed
pointless to enforce it for DAX....
So, before taking any patches to change that behaviour in XFS, a
wider discussion about the policy needs to be had. I don't think
we should care about POSIX here - if you have an application that
needs this serialisation, turn off DAX. That's why I made it a
per-inode inheritable flag and why the mount option will go away
over time.
Cheers,
Dave.
--
Dave Chinner
david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
2016-06-23 23:24 ` xfs: untangle the direct I/O and DAX path, fix DAX locking Dave Chinner
@ 2016-06-24 1:14 ` Dan Williams
2016-06-24 7:13 ` Dave Chinner
2016-06-24 7:26 ` Christoph Hellwig
1 sibling, 1 reply; 23+ messages in thread
From: Dan Williams @ 2016-06-24 1:14 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, linux-fsdevel, linux-nvdimm, XFS Developers
On Thu, Jun 23, 2016 at 4:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Wed, Jun 22, 2016 at 05:27:08PM +0200, Christoph Hellwig wrote:
>> The last patch is what started the series: XFS currently uses the
>> direct I/O locking strategy for DAX because DAX was overloaded onto
>> the direct I/O path. For XFS this means that we only take a shared
>> inode lock instead of the normal exclusive one for writes IFF they
>> are properly aligned. While this is fine for O_DIRECT which requires
>> explicit opt-in from the application it's not fine for DAX where we'll
>> suddenly lose expected and required synchronization of the file system
>> happens to use DAX undeneath.
>
> Except we did that *intentionally* - by definition there is no
> cache to bypass with DAX and so all IO is "direct". That, combined
> with the fact that all Linux filesystems except XFS break the POSIX
> exclusive writer rule you are quoting to begin with, it seemed
> pointless to enforce it for DAX....
If we're going to be strict about POSIX fsync() semantics we should be
strict about this exclusive write semantic. In other words why is it
ok to loosen one and not the other, if application compatibility is
the concern?
>
> So, before taking any patches to change that behaviour in XFS, a
> wider discussion about the policy needs to be had. I don't think
> we should care about POSIX here - if you have an application that
> needs this serialisation, turn off DAX.
s/needs this serialisation/needs the kernel to flush cpu cache/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
2016-06-24 1:14 ` Dan Williams
@ 2016-06-24 7:13 ` Dave Chinner
2016-06-24 7:31 ` Christoph Hellwig
0 siblings, 1 reply; 23+ messages in thread
From: Dave Chinner @ 2016-06-24 7:13 UTC (permalink / raw)
To: Dan Williams
Cc: Christoph Hellwig, linux-fsdevel, linux-nvdimm, XFS Developers
On Thu, Jun 23, 2016 at 06:14:47PM -0700, Dan Williams wrote:
> On Thu, Jun 23, 2016 at 4:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Wed, Jun 22, 2016 at 05:27:08PM +0200, Christoph Hellwig wrote:
> >> The last patch is what started the series: XFS currently uses the
> >> direct I/O locking strategy for DAX because DAX was overloaded onto
> >> the direct I/O path. For XFS this means that we only take a shared
> >> inode lock instead of the normal exclusive one for writes IFF they
> >> are properly aligned. While this is fine for O_DIRECT which requires
> >> explicit opt-in from the application it's not fine for DAX where we'll
> >> suddenly lose expected and required synchronization of the file system
> >> happens to use DAX undeneath.
> >
> > Except we did that *intentionally* - by definition there is no
> > cache to bypass with DAX and so all IO is "direct". That, combined
> > with the fact that all Linux filesystems except XFS break the POSIX
> > exclusive writer rule you are quoting to begin with, it seemed
> > pointless to enforce it for DAX....
>
> If we're going to be strict about POSIX fsync() semantics we should be
> strict about this exclusive write semantic. In other words why is it
> ok to loosen one and not the other, if application compatibility is
> the concern?
This is a POSIX compliant fsync() implementation:
int fsync(int fd)
{
return 0;
}
That's not what we require from Linux filesystems and storage
subsystems. Our data integrity requirements are not actually
defined by POSIX - we go way beyond what POSIX actually requires us
to implement. If all we cared about is POSIX, then the above is how
we'd implement fsync() simply because it's fast. Everyone implements
fsync differently, so portable applications can't actually rely on
the POSIX standard fsync() implementation to keep their data safe...
IOWs, we don't give a shit about what POSIX says about fsync
because, in practice, it's useless. Instead, we implement something
that *works* and provides users with real data integrity guarantees.
If you like the POSIX specs for data integrity, go use
sync_file_range() - it doesn't guarantee data integrity, just like
posix compliant fsync(). And yes, applications that use
sync_file_range() are known to lose data when systems crash...
The POSIX exclusive write requirement is a different case. No linux
filesystem except XFS has ever met that requirement (in 20 something
years), yet I don't see applications falling over with corrupt data
from non-exclusive writes all the time, nor do I see application
developers shouting at us to provide it. i.e. reality tells us this
isn't a POSIX behaviour that applications rely on because everyone
implements it differently.
So, like fsync(), if everyone implements it differently,
applications don't rely on posix smeantics to serialise access to
overlapping ranges of a file. And if that's the case, then
why even bother exclusive write locking in the filesystem when there
is no need for serialisation of page cache contents?
We don't do it because POSIX says so, because we already ignore what
POSIX says about this topic for technical reasons. So why should we
make DAX conform to POSIX exclusive writer behaviour when DAX is
being specifically aimed at high performance, highly concurrent
applications where exclusive writer behaviour will cause major
performance issues?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
2016-06-24 7:13 ` Dave Chinner
@ 2016-06-24 7:31 ` Christoph Hellwig
0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-24 7:31 UTC (permalink / raw)
To: Dave Chinner
Cc: Dan Williams, Christoph Hellwig, linux-fsdevel, linux-nvdimm,
XFS Developers
On Fri, Jun 24, 2016 at 05:13:18PM +1000, Dave Chinner wrote:
> This is a POSIX compliant fsync() implementation:
>
> int fsync(int fd)
> {
> return 0;
> }
Depends on what you mean with "Posix". Modern Posix which includex
XPG has the _POSIX_SYNCHRONIZED_IO option, which Linux implements. For
that Posix says about fsync:
[SIO] [Option Start] If _POSIX_SYNCHRONIZED_IO is defined, the fsync()
function shall force all currently queued I/O operations associated with
the file indicated by file descriptor fildes to the synchronized I/O
completion state. All I/O operations shall be completed as defined for
synchronized I/O file integrity completion. [Option End]
Whereas synchronized I/O file integrity completion is defined as:
3.378 Synchronized I/O Data Integrity Completion
For read, when the operation has been completed or diagnosed if
unsuccessful. The read is complete only when an image of the data has been
successfully transferred to the requesting process. If there were any
pending write requests affecting the data to be read at the time that the
synchronized read operation was requested, these write requests are
successfully transferred prior to reading the data.
For write, when the operation has been completed or diagnosed if
unsuccessful. The write is complete only when the data specified in the
write request is successfully transferred and all file system information
required to retrieve the data is successfully transferred.
File attributes that are not necessary for data retrieval (access time,
modification time, status change time) need not be successfully
transferred prior to returning to the calling process.
3.379 Synchronized I/O File Integrity Completion
Identical to a synchronized I/O data integrity completion with the
addition that all file attributes relative to the I/O operation (including
access time, modification time, status change time) are successfully
transferred prior to returning to the calling process.
So in this case Posix very much requires data to be on a stable
medium.
> The POSIX exclusive write requirement is a different case. No linux
> filesystem except XFS has ever met that requirement (in 20 something
> years), yet I don't see applications falling over with corrupt data
> from non-exclusive writes all the time, nor do I see application
> developers shouting at us to provide it. i.e. reality tells us this
> isn't a POSIX behaviour that applications rely on because everyone
> implements it differently.
Every file system exludes writes from other writes.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
2016-06-23 23:24 ` xfs: untangle the direct I/O and DAX path, fix DAX locking Dave Chinner
2016-06-24 1:14 ` Dan Williams
@ 2016-06-24 7:26 ` Christoph Hellwig
2016-06-24 23:00 ` Dave Chinner
1 sibling, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-24 7:26 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w, Christoph Hellwig,
xfs-VZNHf3L845pBDgjK7y7TUQ
On Fri, Jun 24, 2016 at 09:24:46AM +1000, Dave Chinner wrote:
> Except we did that *intentionally* - by definition there is no
> cache to bypass with DAX and so all IO is "direct". That, combined
> with the fact that all Linux filesystems except XFS break the POSIX
> exclusive writer rule you are quoting to begin with, it seemed
> pointless to enforce it for DAX....
No file system breaks the exclusive writer rule - most filesystem
don't make writers atomic vs readers.
More importantly every other filesystem (well there only are ext2
and ext4..) exludes DAX writers against other DAX writers.
> So, before taking any patches to change that behaviour in XFS, a
> wider discussion about the policy needs to be had. I don't think
> we should care about POSIX here - if you have an application that
> needs this serialisation, turn off DAX. That's why I made it a
> per-inode inheritable flag and why the mount option will go away
> over time.
Sorry, but this is simply broken - allowing apps to opt-in behavior
(e.g. like we're using O_DIRECT) is always fine. Requriring
filesystem-specific tuning that has affect outside the app to get
existing documented behavior is not how to design APIs.
Maybe we'll need to opt-in to use DAX for mmap, but giving the same
existing behavior for read and write and avoiding a copy to the pagecache
is an obvious win.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
2016-06-24 7:26 ` Christoph Hellwig
@ 2016-06-24 23:00 ` Dave Chinner
2016-06-28 13:10 ` Christoph Hellwig
0 siblings, 1 reply; 23+ messages in thread
From: Dave Chinner @ 2016-06-24 23:00 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs, linux-fsdevel, linux-nvdimm
On Fri, Jun 24, 2016 at 09:26:12AM +0200, Christoph Hellwig wrote:
> On Fri, Jun 24, 2016 at 09:24:46AM +1000, Dave Chinner wrote:
> > Except we did that *intentionally* - by definition there is no
> > cache to bypass with DAX and so all IO is "direct". That, combined
> > with the fact that all Linux filesystems except XFS break the POSIX
> > exclusive writer rule you are quoting to begin with, it seemed
> > pointless to enforce it for DAX....
>
> No file system breaks the exclusive writer rule - most filesystem
> don't make writers atomic vs readers.
>
> More importantly every other filesystem (well there only are ext2
> and ext4..) exludes DAX writers against other DAX writers.
>
> > So, before taking any patches to change that behaviour in XFS, a
> > wider discussion about the policy needs to be had. I don't think
> > we should care about POSIX here - if you have an application that
> > needs this serialisation, turn off DAX. That's why I made it a
> > per-inode inheritable flag and why the mount option will go away
> > over time.
>
> Sorry, but this is simply broken - allowing apps to opt-in behavior
> (e.g. like we're using O_DIRECT) is always fine. Requriring
> filesystem-specific tuning that has affect outside the app to get
> existing documented behavior is not how to design APIs.
Using DAX is an *admin decision*, not an application decision.
Indeed, it's a mount option right now, and that's most definitely not
something the application can turn on or off! Inode flags allow the
admin to decide that two apps working on the same filesystem can use
(or not use) DAX independently, rather than needing to put them on
different filesystems.
> Maybe we'll need to opt-in to use DAX for mmap, but giving the same
> existing behavior for read and write and avoiding a copy to the pagecache
> is an obvious win.
You can't use DAX just for mmap. It's an inode scope behaviour -
once it's turned on, all accesses to that inode - regardless of user
interface - must use DAX. It's all or nothing, not a per file
descript/mmap context option.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: xfs: untangle the direct I/O and DAX path, fix DAX locking
2016-06-24 23:00 ` Dave Chinner
@ 2016-06-28 13:10 ` Christoph Hellwig
[not found] ` <20160628131059.GA30475-jcswGhMUV9g@public.gmane.org>
0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2016-06-28 13:10 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-nvdimm-y27Ovi1pjclAfugRpC6u6w, Christoph Hellwig,
xfs-VZNHf3L845pBDgjK7y7TUQ
On Sat, Jun 25, 2016 at 09:00:45AM +1000, Dave Chinner wrote:
> >
> > Sorry, but this is simply broken - allowing apps to opt-in behavior
> > (e.g. like we're using O_DIRECT) is always fine. Requriring
> > filesystem-specific tuning that has affect outside the app to get
> > existing documented behavior is not how to design APIs.
>
> Using DAX is an *admin decision*, not an application decision.
Of course - that's exactly my point.
> Indeed, it's a mount option right now, and that's most definitely not
> something the application can turn on or off! Inode flags allow the
> admin to decide that two apps working on the same filesystem can use
> (or not use) DAX independently, rather than needing to put them on
> different filesystems.
Right. And an existing application can get DAX turned on under its
back, and will now suddently get different synchronization behavior.
That is if it's writes happen to be aligned to the fs block size.
> > Maybe we'll need to opt-in to use DAX for mmap, but giving the same
> > existing behavior for read and write and avoiding a copy to the pagecache
> > is an obvious win.
>
> You can't use DAX just for mmap. It's an inode scope behaviour -
> once it's turned on, all accesses to that inode - regardless of user
> interface - must use DAX. It's all or nothing, not a per file
> descript/mmap context option.
Right now it is. But when discussing mmap behavior one option was to
require an opt-in to get DAX-specific mmap semantics. For plain
read/write we have no such option and thus absolutely need to behave as
all normal reads and writes behave. If you think the exclusive lock
for writes hurts we have two options:
a) implement range locks (although they might be more expensive for
typical loads)
b) add a new O_* or RWF_* option to not require the synchronization
for apps that don't want it.
Neither of those cases really is DAX-specific.
^ permalink raw reply [flat|nested] 23+ messages in thread