* [PATCH v1 1/9] vfs: add copy_file_range syscall and vfs helper
2015-09-04 20:16 ` Anna Schumaker
(?)
@ 2015-09-04 20:16 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
From: Zach Brown <zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Add a copy_file_range() system call for offloading copies between
regular files.
This gives an interface to underlying layers of the storage stack which
can copy without reading and writing all the data. There are a few
candidates that should support copy offloading in the nearer term:
- btrfs shares extent references with its clone ioctl
- NFS has patches to add a COPY command which copies on the server
- SCSI has a family of XCOPY commands which copy in the device
This system call avoids the complexity of also accelerating the creation
of the destination file by operating on an existing destination file
descriptor, not a path.
Currently the high level vfs entry point limits copy offloading to files
on the same mount and super (and not in the same file). This can be
relaxed if we get implementations which can copy between file systems
safely.
Signed-off-by: Zach Brown <zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
[Anna Schumaker: Change -EINVAL to -EBADF during file verification]
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 129 ++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
4 files changed, 136 insertions(+), 1 deletion(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3f..82c4933 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -1327,3 +1328,131 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows return partial success. When it does so is up to
+ * the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, int flags)
+{
+ struct inode *inode_in;
+ struct inode *inode_out;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op || !file_in->f_op->copy_file_range)
+ return -EBADF;
+
+ inode_in = file_inode(file_in);
+ inode_out = file_inode(file_out);
+
+ /* make sure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ /* forbid ranges in the same file */
+ if (inode_in == inode_out)
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
+ len, flags);
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret;
+
+ f_in = fdget(fd_in);
+ f_out = fdget(fd_out);
+ if (!f_in.file || !f_out.file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+out:
+ fdput(f_in);
+ fdput(f_out);
+ return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c3..c97aed8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1631,6 +1631,7 @@ struct file_operations {
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, int);
};
struct inode_operations {
@@ -1684,6 +1685,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
+ loff_t, size_t, int);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9..2b60f0c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_copy_file_range 282
+__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
#undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5..4e01cd9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -173,6 +173,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 1/9] vfs: add copy_file_range syscall and vfs helper
@ 2015-09-04 20:16 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
From: Zach Brown <zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Add a copy_file_range() system call for offloading copies between
regular files.
This gives an interface to underlying layers of the storage stack which
can copy without reading and writing all the data. There are a few
candidates that should support copy offloading in the nearer term:
- btrfs shares extent references with its clone ioctl
- NFS has patches to add a COPY command which copies on the server
- SCSI has a family of XCOPY commands which copy in the device
This system call avoids the complexity of also accelerating the creation
of the destination file by operating on an existing destination file
descriptor, not a path.
Currently the high level vfs entry point limits copy offloading to files
on the same mount and super (and not in the same file). This can be
relaxed if we get implementations which can copy between file systems
safely.
Signed-off-by: Zach Brown <zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
[Anna Schumaker: Change -EINVAL to -EBADF during file verification]
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 129 ++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
4 files changed, 136 insertions(+), 1 deletion(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3f..82c4933 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -1327,3 +1328,131 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows return partial success. When it does so is up to
+ * the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, int flags)
+{
+ struct inode *inode_in;
+ struct inode *inode_out;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op || !file_in->f_op->copy_file_range)
+ return -EBADF;
+
+ inode_in = file_inode(file_in);
+ inode_out = file_inode(file_out);
+
+ /* make sure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ /* forbid ranges in the same file */
+ if (inode_in == inode_out)
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
+ len, flags);
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret;
+
+ f_in = fdget(fd_in);
+ f_out = fdget(fd_out);
+ if (!f_in.file || !f_out.file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+out:
+ fdput(f_in);
+ fdput(f_out);
+ return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c3..c97aed8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1631,6 +1631,7 @@ struct file_operations {
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, int);
};
struct inode_operations {
@@ -1684,6 +1685,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
+ loff_t, size_t, int);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9..2b60f0c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_copy_file_range 282
+__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
#undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5..4e01cd9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -173,6 +173,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 1/9] vfs: add copy_file_range syscall and vfs helper
@ 2015-09-04 20:16 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
From: Zach Brown <zab@redhat.com>
Add a copy_file_range() system call for offloading copies between
regular files.
This gives an interface to underlying layers of the storage stack which
can copy without reading and writing all the data. There are a few
candidates that should support copy offloading in the nearer term:
- btrfs shares extent references with its clone ioctl
- NFS has patches to add a COPY command which copies on the server
- SCSI has a family of XCOPY commands which copy in the device
This system call avoids the complexity of also accelerating the creation
of the destination file by operating on an existing destination file
descriptor, not a path.
Currently the high level vfs entry point limits copy offloading to files
on the same mount and super (and not in the same file). This can be
relaxed if we get implementations which can copy between file systems
safely.
Signed-off-by: Zach Brown <zab@redhat.com>
[Anna Schumaker: Change -EINVAL to -EBADF during file verification]
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
fs/read_write.c | 129 ++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
4 files changed, 136 insertions(+), 1 deletion(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3f..82c4933 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -1327,3 +1328,131 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows return partial success. When it does so is up to
+ * the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, int flags)
+{
+ struct inode *inode_in;
+ struct inode *inode_out;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op || !file_in->f_op->copy_file_range)
+ return -EBADF;
+
+ inode_in = file_inode(file_in);
+ inode_out = file_inode(file_out);
+
+ /* make sure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ /* forbid ranges in the same file */
+ if (inode_in == inode_out)
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
+ len, flags);
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret;
+
+ f_in = fdget(fd_in);
+ f_out = fdget(fd_out);
+ if (!f_in.file || !f_out.file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+out:
+ fdput(f_in);
+ fdput(f_out);
+ return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c3..c97aed8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1631,6 +1631,7 @@ struct file_operations {
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, int);
};
struct inode_operations {
@@ -1684,6 +1685,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
+ loff_t, size_t, int);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9..2b60f0c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_copy_file_range 282
+__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
#undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5..4e01cd9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -173,6 +173,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread* Re: [PATCH v1 1/9] vfs: add copy_file_range syscall and vfs helper
2015-09-04 20:16 ` Anna Schumaker
(?)
(?)
@ 2015-09-04 21:50 ` Darrick J. Wong
-1 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-04 21:50 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
mtk.manpages, andros, hch
On Fri, Sep 04, 2015 at 04:16:55PM -0400, Anna Schumaker wrote:
> From: Zach Brown <zab@redhat.com>
>
> Add a copy_file_range() system call for offloading copies between
> regular files.
>
> This gives an interface to underlying layers of the storage stack which
> can copy without reading and writing all the data. There are a few
> candidates that should support copy offloading in the nearer term:
>
> - btrfs shares extent references with its clone ioctl
> - NFS has patches to add a COPY command which copies on the server
> - SCSI has a family of XCOPY commands which copy in the device
>
> This system call avoids the complexity of also accelerating the creation
> of the destination file by operating on an existing destination file
> descriptor, not a path.
>
> Currently the high level vfs entry point limits copy offloading to files
> on the same mount and super (and not in the same file). This can be
> relaxed if we get implementations which can copy between file systems
> safely.
>
> Signed-off-by: Zach Brown <zab@redhat.com>
> [Anna Schumaker: Change -EINVAL to -EBADF during file verification]
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> fs/read_write.c | 129 ++++++++++++++++++++++++++++++++++++++
> include/linux/fs.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/sys_ni.c | 1 +
> 4 files changed, 136 insertions(+), 1 deletion(-)
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 819ef3f..82c4933 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -16,6 +16,7 @@
> #include <linux/pagemap.h>
> #include <linux/splice.h>
> #include <linux/compat.h>
> +#include <linux/mount.h>
> #include "internal.h"
>
> #include <asm/uaccess.h>
> @@ -1327,3 +1328,131 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
> return do_sendfile(out_fd, in_fd, NULL, count, 0);
> }
> #endif
> +
> +/*
> + * copy_file_range() differs from regular file read and write in that it
> + * specifically allows return partial success. When it does so is up to
> + * the copy_file_range method.
> + */
> +ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + size_t len, int flags)
> +{
> + struct inode *inode_in;
> + struct inode *inode_out;
> + ssize_t ret;
> +
> + if (flags)
> + return -EINVAL;
> +
> + if (len == 0)
> + return 0;
> +
> + /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
> + ret = rw_verify_area(READ, file_in, &pos_in, len);
> + if (ret >= 0)
> + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> + if (ret < 0)
> + return ret;
> +
> + if (!(file_in->f_mode & FMODE_READ) ||
> + !(file_out->f_mode & FMODE_WRITE) ||
> + (file_out->f_flags & O_APPEND) ||
> + !file_in->f_op || !file_in->f_op->copy_file_range)
> + return -EBADF;
> +
> + inode_in = file_inode(file_in);
> + inode_out = file_inode(file_out);
> +
> + /* make sure offsets don't wrap and the input is inside i_size */
> + if (pos_in + len < pos_in || pos_out + len < pos_out ||
> + pos_in + len > i_size_read(inode_in))
> + return -EINVAL;
> +
> + /* this could be relaxed once a method supports cross-fs copies */
> + if (inode_in->i_sb != inode_out->i_sb ||
> + file_in->f_path.mnt != file_out->f_path.mnt)
> + return -EXDEV;
> +
> + /* forbid ranges in the same file */
> + if (inode_in == inode_out)
> + return -EINVAL;
btrfs does and XFS will support the case of a file sharing blocks with itself.
--D
> +
> + ret = mnt_want_write_file(file_out);
> + if (ret)
> + return ret;
> +
> + ret = file_in->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
> + len, flags);
> + if (ret > 0) {
> + fsnotify_access(file_in);
> + add_rchar(current, ret);
> + fsnotify_modify(file_out);
> + add_wchar(current, ret);
> + }
> + inc_syscr(current);
> + inc_syscw(current);
> +
> + mnt_drop_write_file(file_out);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range);
> +
> +SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
> + int, fd_out, loff_t __user *, off_out,
> + size_t, len, unsigned int, flags)
> +{
> + loff_t pos_in;
> + loff_t pos_out;
> + struct fd f_in;
> + struct fd f_out;
> + ssize_t ret;
> +
> + f_in = fdget(fd_in);
> + f_out = fdget(fd_out);
> + if (!f_in.file || !f_out.file) {
> + ret = -EBADF;
> + goto out;
> + }
> +
> + ret = -EFAULT;
> + if (off_in) {
> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_in = f_in.file->f_pos;
> + }
> +
> + if (off_out) {
> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_out = f_out.file->f_pos;
> + }
> +
> + ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
> + flags);
> + if (ret > 0) {
> + pos_in += ret;
> + pos_out += ret;
> +
> + if (off_in) {
> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_in.file->f_pos = pos_in;
> + }
> +
> + if (off_out) {
> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_out.file->f_pos = pos_out;
> + }
> + }
> +out:
> + fdput(f_in);
> + fdput(f_out);
> + return ret;
> +}
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index cc008c3..c97aed8 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1631,6 +1631,7 @@ struct file_operations {
> #ifndef CONFIG_MMU
> unsigned (*mmap_capabilities)(struct file *);
> #endif
> + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, int);
> };
>
> struct inode_operations {
> @@ -1684,6 +1685,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
> unsigned long, loff_t *);
> extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
> unsigned long, loff_t *);
> +extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> + loff_t, size_t, int);
>
> struct super_operations {
> struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index e016bd9..2b60f0c 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
> __SYSCALL(__NR_bpf, sys_bpf)
> #define __NR_execveat 281
> __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
> +#define __NR_copy_file_range 282
> +__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 282
> +#define __NR_syscalls 283
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 7995ef5..4e01cd9 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -173,6 +173,7 @@ cond_syscall(sys_setfsuid);
> cond_syscall(sys_setfsgid);
> cond_syscall(sys_capget);
> cond_syscall(sys_capset);
> +cond_syscall(sys_copy_file_range);
>
> /* arch-specific weak syscall entries */
> cond_syscall(sys_pciconfig_read);
> --
> 2.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range
2015-09-04 20:16 ` Anna Schumaker
(?)
@ 2015-09-04 20:16 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
We need to verify that both the source and the destination files are
part of the same filesystem, otherwise we can't create a reflink.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/btrfs/ioctl.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 62ae286..9c0c955 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3856,6 +3856,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
{
ssize_t ret;
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
if (ret == 0)
ret = len;
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range
@ 2015-09-04 20:16 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
We need to verify that both the source and the destination files are
part of the same filesystem, otherwise we can't create a reflink.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/btrfs/ioctl.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 62ae286..9c0c955 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3856,6 +3856,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
{
ssize_t ret;
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
if (ret == 0)
ret = len;
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range
@ 2015-09-04 20:16 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
We need to verify that both the source and the destination files are
part of the same filesystem, otherwise we can't create a reflink.
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
fs/btrfs/ioctl.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 62ae286..9c0c955 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3856,6 +3856,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
{
ssize_t ret;
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
if (ret == 0)
ret = len;
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread* Re: [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range
2015-09-04 20:16 ` Anna Schumaker
(?)
(?)
@ 2015-09-09 9:18 ` David Sterba
2015-09-09 15:56 ` Anna Schumaker
-1 siblings, 1 reply; 121+ messages in thread
From: David Sterba @ 2015-09-09 9:18 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
On Fri, Sep 04, 2015 at 04:16:58PM -0400, Anna Schumaker wrote:
> We need to verify that both the source and the destination files are
> part of the same filesystem, otherwise we can't create a reflink.
>
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> fs/btrfs/ioctl.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 62ae286..9c0c955 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -3856,6 +3856,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
> {
> ssize_t ret;
>
> + if (inode_in->i_sb != inode_out->i_sb ||
> + file_in->f_path.mnt != file_out->f_path.mnt)
> + return -EXDEV;
The same check exists in btrfs_clone_files (added in the previous
patch), what's the reason to add it here again?
> +
> ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range
2015-09-09 9:18 ` David Sterba
@ 2015-09-09 15:56 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-09 15:56 UTC (permalink / raw)
To: dsterba, linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab,
viro, clm, darrick.wong, mtk.manpages, andros, hch
On 09/09/2015 05:18 AM, David Sterba wrote:
> On Fri, Sep 04, 2015 at 04:16:58PM -0400, Anna Schumaker wrote:
>> We need to verify that both the source and the destination files are
>> part of the same filesystem, otherwise we can't create a reflink.
>>
>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>> ---
>> fs/btrfs/ioctl.c | 4 ++++
>> 1 file changed, 4 insertions(+)
>>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 62ae286..9c0c955 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -3856,6 +3856,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> {
>> ssize_t ret;
>>
>> + if (inode_in->i_sb != inode_out->i_sb ||
>> + file_in->f_path.mnt != file_out->f_path.mnt)
>> + return -EXDEV;
>
> The same check exists in btrfs_clone_files (added in the previous
> patch), what's the reason to add it here again?
To be really, really sure that it's not a cross-device copy? :)
In reality, it was an oversight and I'll drop the patch. Thanks for letting me know!
Anna
>
>> +
>> ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range
@ 2015-09-09 15:56 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-09 15:56 UTC (permalink / raw)
To: dsterba, linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab,
viro, clm, darrick.wong, mtk.manpages, andros, hch
On 09/09/2015 05:18 AM, David Sterba wrote:
> On Fri, Sep 04, 2015 at 04:16:58PM -0400, Anna Schumaker wrote:
>> We need to verify that both the source and the destination files are
>> part of the same filesystem, otherwise we can't create a reflink.
>>
>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>> ---
>> fs/btrfs/ioctl.c | 4 ++++
>> 1 file changed, 4 insertions(+)
>>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 62ae286..9c0c955 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -3856,6 +3856,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> {
>> ssize_t ret;
>>
>> + if (inode_in->i_sb != inode_out->i_sb ||
>> + file_in->f_path.mnt != file_out->f_path.mnt)
>> + return -EXDEV;
>
> The same check exists in btrfs_clone_files (added in the previous
> patch), what's the reason to add it here again?
To be really, really sure that it's not a cross-device copy? :)
In reality, it was an oversight and I'll drop the patch. Thanks for letting me know!
Anna
>
>> +
>> ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
^ permalink raw reply [flat|nested] 121+ messages in thread
* [PATCH v1 5/8] vfs: Remove copy_file_range mountpoint checks
2015-09-04 20:16 ` Anna Schumaker
(?)
@ 2015-09-04 20:16 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
I still want to do an in-kernel copy even if the files are on different
mountpoints, and NFS has a "server to server" copy that expects two
files on different mountpoints. Let's have individual filesystems
implement this check instead.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 82c4933..922ca58 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1369,11 +1369,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
pos_in + len > i_size_read(inode_in))
return -EINVAL;
- /* this could be relaxed once a method supports cross-fs copies */
- if (inode_in->i_sb != inode_out->i_sb ||
- file_in->f_path.mnt != file_out->f_path.mnt)
- return -EXDEV;
-
/* forbid ranges in the same file */
if (inode_in == inode_out)
return -EINVAL;
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread
* [PATCH v1 5/8] vfs: Remove copy_file_range mountpoint checks
@ 2015-09-04 20:16 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
I still want to do an in-kernel copy even if the files are on different
mountpoints, and NFS has a "server to server" copy that expects two
files on different mountpoints. Let's have individual filesystems
implement this check instead.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 82c4933..922ca58 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1369,11 +1369,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
pos_in + len > i_size_read(inode_in))
return -EINVAL;
- /* this could be relaxed once a method supports cross-fs copies */
- if (inode_in->i_sb != inode_out->i_sb ||
- file_in->f_path.mnt != file_out->f_path.mnt)
- return -EXDEV;
-
/* forbid ranges in the same file */
if (inode_in == inode_out)
return -EINVAL;
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread
* [PATCH v1 5/8] vfs: Remove copy_file_range mountpoint checks
@ 2015-09-04 20:16 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:16 UTC (permalink / raw)
To: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
I still want to do an in-kernel copy even if the files are on different
mountpoints, and NFS has a "server to server" copy that expects two
files on different mountpoints. Let's have individual filesystems
implement this check instead.
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
fs/read_write.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 82c4933..922ca58 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1369,11 +1369,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
pos_in + len > i_size_read(inode_in))
return -EINVAL;
- /* this could be relaxed once a method supports cross-fs copies */
- if (inode_in->i_sb != inode_out->i_sb ||
- file_in->f_path.mnt != file_out->f_path.mnt)
- return -EXDEV;
-
/* forbid ranges in the same file */
if (inode_in == inode_out)
return -EINVAL;
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread
* [PATCH v1 6/8] vfs: Copy should check len after file open mode
2015-09-04 20:16 ` Anna Schumaker
(?)
@ 2015-09-04 20:17 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
I don't think it makes sense to report that a copy succeeded if the
files aren't open properly.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 922ca58..59637ed 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1345,9 +1345,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (flags)
return -EINVAL;
- if (len == 0)
- return 0;
-
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
ret = rw_verify_area(READ, file_in, &pos_in, len);
if (ret >= 0)
@@ -1373,6 +1370,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (inode_in == inode_out)
return -EINVAL;
+ if (len == 0)
+ return 0;
+
ret = mnt_want_write_file(file_out);
if (ret)
return ret;
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread
* [PATCH v1 6/8] vfs: Copy should check len after file open mode
@ 2015-09-04 20:17 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
I don't think it makes sense to report that a copy succeeded if the
files aren't open properly.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 922ca58..59637ed 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1345,9 +1345,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (flags)
return -EINVAL;
- if (len == 0)
- return 0;
-
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
ret = rw_verify_area(READ, file_in, &pos_in, len);
if (ret >= 0)
@@ -1373,6 +1370,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (inode_in == inode_out)
return -EINVAL;
+ if (len == 0)
+ return 0;
+
ret = mnt_want_write_file(file_out);
if (ret)
return ret;
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread
* [PATCH v1 6/8] vfs: Copy should check len after file open mode
@ 2015-09-04 20:17 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
I don't think it makes sense to report that a copy succeeded if the
files aren't open properly.
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
fs/read_write.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 922ca58..59637ed 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1345,9 +1345,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (flags)
return -EINVAL;
- if (len == 0)
- return 0;
-
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
ret = rw_verify_area(READ, file_in, &pos_in, len);
if (ret >= 0)
@@ -1373,6 +1370,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (inode_in == inode_out)
return -EINVAL;
+ if (len == 0)
+ return 0;
+
ret = mnt_want_write_file(file_out);
if (ret)
return ret;
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread
* [PATCH v1 8/8] vfs: Fall back on splice if no copy function defined
2015-09-04 20:16 ` Anna Schumaker
(?)
@ 2015-09-04 20:17 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
The NFS server will need a fallback for filesystems that don't have any
kind of copy acceleration yet, and it should be generally useful to have
an in-kernel copy to avoid lots of switches between kernel and
user space. Users who only want a reflink can pass the COPY_REFLINK
flag to disable the fallback.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 16 ++++++++++++----
include/linux/copy.h | 6 ++++++
include/uapi/linux/Kbuild | 1 +
include/uapi/linux/copy.h | 6 ++++++
4 files changed, 25 insertions(+), 4 deletions(-)
create mode 100644 include/linux/copy.h
create mode 100644 include/uapi/linux/copy.h
diff --git a/fs/read_write.c b/fs/read_write.c
index 9dafb7f..bd7e7e2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
+#include <linux/copy.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
@@ -1342,7 +1343,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_out;
ssize_t ret;
- if (flags)
+ if (!((flags == 0) || (flags == COPY_REFLINK)))
return -EINVAL;
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
@@ -1355,7 +1356,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND) ||
- !file_out->f_op || !file_out->f_op->copy_file_range)
+ !file_in->f_op)
return -EBADF;
inode_in = file_inode(file_in);
@@ -1377,8 +1378,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
- ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
- len, flags);
+ ret = -EOPNOTSUPP;
+ if (file_out->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+ pos_out, len, flags);
+ if ((ret < 0) && !(flags & COPY_REFLINK)) {
+ file_start_write(file_out);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
+ file_end_write(file_out);
+ }
if (ret > 0) {
fsnotify_access(file_in);
add_rchar(current, ret);
diff --git a/include/linux/copy.h b/include/linux/copy.h
new file mode 100644
index 0000000..fd54543
--- /dev/null
+++ b/include/linux/copy.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_COPY_H
+#define _LINUX_COPY_H
+
+#include <uapi/linux/copy.h>
+
+#endif /* _LINUX_COPY_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1ff9942..b6109f3 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -90,6 +90,7 @@ header-y += coda_psdev.h
header-y += coff.h
header-y += connector.h
header-y += const.h
+header-y += copy.h
header-y += cramfs_fs.h
header-y += cuda.h
header-y += cyclades.h
diff --git a/include/uapi/linux/copy.h b/include/uapi/linux/copy.h
new file mode 100644
index 0000000..68f3d65
--- /dev/null
+++ b/include/uapi/linux/copy.h
@@ -0,0 +1,6 @@
+#ifndef _UAPI_LINUX_COPY_H
+#define _UAPI_LINUX_COPY_H
+
+#define COPY_REFLINK 1 /* only perform a reflink */
+
+#endif /* _UAPI_LINUX_COPY_H */
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 8/8] vfs: Fall back on splice if no copy function defined
@ 2015-09-04 20:17 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
The NFS server will need a fallback for filesystems that don't have any
kind of copy acceleration yet, and it should be generally useful to have
an in-kernel copy to avoid lots of switches between kernel and
user space. Users who only want a reflink can pass the COPY_REFLINK
flag to disable the fallback.
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
fs/read_write.c | 16 ++++++++++++----
include/linux/copy.h | 6 ++++++
include/uapi/linux/Kbuild | 1 +
include/uapi/linux/copy.h | 6 ++++++
4 files changed, 25 insertions(+), 4 deletions(-)
create mode 100644 include/linux/copy.h
create mode 100644 include/uapi/linux/copy.h
diff --git a/fs/read_write.c b/fs/read_write.c
index 9dafb7f..bd7e7e2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
+#include <linux/copy.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
@@ -1342,7 +1343,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_out;
ssize_t ret;
- if (flags)
+ if (!((flags == 0) || (flags == COPY_REFLINK)))
return -EINVAL;
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
@@ -1355,7 +1356,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND) ||
- !file_out->f_op || !file_out->f_op->copy_file_range)
+ !file_in->f_op)
return -EBADF;
inode_in = file_inode(file_in);
@@ -1377,8 +1378,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
- ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
- len, flags);
+ ret = -EOPNOTSUPP;
+ if (file_out->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+ pos_out, len, flags);
+ if ((ret < 0) && !(flags & COPY_REFLINK)) {
+ file_start_write(file_out);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
+ file_end_write(file_out);
+ }
if (ret > 0) {
fsnotify_access(file_in);
add_rchar(current, ret);
diff --git a/include/linux/copy.h b/include/linux/copy.h
new file mode 100644
index 0000000..fd54543
--- /dev/null
+++ b/include/linux/copy.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_COPY_H
+#define _LINUX_COPY_H
+
+#include <uapi/linux/copy.h>
+
+#endif /* _LINUX_COPY_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1ff9942..b6109f3 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -90,6 +90,7 @@ header-y += coda_psdev.h
header-y += coff.h
header-y += connector.h
header-y += const.h
+header-y += copy.h
header-y += cramfs_fs.h
header-y += cuda.h
header-y += cyclades.h
diff --git a/include/uapi/linux/copy.h b/include/uapi/linux/copy.h
new file mode 100644
index 0000000..68f3d65
--- /dev/null
+++ b/include/uapi/linux/copy.h
@@ -0,0 +1,6 @@
+#ifndef _UAPI_LINUX_COPY_H
+#define _UAPI_LINUX_COPY_H
+
+#define COPY_REFLINK 1 /* only perform a reflink */
+
+#endif /* _UAPI_LINUX_COPY_H */
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 8/8] vfs: Fall back on splice if no copy function defined
@ 2015-09-04 20:17 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
The NFS server will need a fallback for filesystems that don't have any
kind of copy acceleration yet, and it should be generally useful to have
an in-kernel copy to avoid lots of switches between kernel and
user space. Users who only want a reflink can pass the COPY_REFLINK
flag to disable the fallback.
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
fs/read_write.c | 16 ++++++++++++----
include/linux/copy.h | 6 ++++++
include/uapi/linux/Kbuild | 1 +
include/uapi/linux/copy.h | 6 ++++++
4 files changed, 25 insertions(+), 4 deletions(-)
create mode 100644 include/linux/copy.h
create mode 100644 include/uapi/linux/copy.h
diff --git a/fs/read_write.c b/fs/read_write.c
index 9dafb7f..bd7e7e2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
+#include <linux/copy.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
@@ -1342,7 +1343,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_out;
ssize_t ret;
- if (flags)
+ if (!((flags == 0) || (flags == COPY_REFLINK)))
return -EINVAL;
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
@@ -1355,7 +1356,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND) ||
- !file_out->f_op || !file_out->f_op->copy_file_range)
+ !file_in->f_op)
return -EBADF;
inode_in = file_inode(file_in);
@@ -1377,8 +1378,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
- ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
- len, flags);
+ ret = -EOPNOTSUPP;
+ if (file_out->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+ pos_out, len, flags);
+ if ((ret < 0) && !(flags & COPY_REFLINK)) {
+ file_start_write(file_out);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
+ file_end_write(file_out);
+ }
if (ret > 0) {
fsnotify_access(file_in);
add_rchar(current, ret);
diff --git a/include/linux/copy.h b/include/linux/copy.h
new file mode 100644
index 0000000..fd54543
--- /dev/null
+++ b/include/linux/copy.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_COPY_H
+#define _LINUX_COPY_H
+
+#include <uapi/linux/copy.h>
+
+#endif /* _LINUX_COPY_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1ff9942..b6109f3 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -90,6 +90,7 @@ header-y += coda_psdev.h
header-y += coff.h
header-y += connector.h
header-y += const.h
+header-y += copy.h
header-y += cramfs_fs.h
header-y += cuda.h
header-y += cyclades.h
diff --git a/include/uapi/linux/copy.h b/include/uapi/linux/copy.h
new file mode 100644
index 0000000..68f3d65
--- /dev/null
+++ b/include/uapi/linux/copy.h
@@ -0,0 +1,6 @@
+#ifndef _UAPI_LINUX_COPY_H
+#define _UAPI_LINUX_COPY_H
+
+#define COPY_REFLINK 1 /* only perform a reflink */
+
+#endif /* _UAPI_LINUX_COPY_H */
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread* Re: [PATCH v1 8/8] vfs: Fall back on splice if no copy function defined
2015-09-04 20:17 ` Anna Schumaker
(?)
(?)
@ 2015-09-04 21:08 ` Darrick J. Wong
[not found] ` <20150904210813.GA30681-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
-1 siblings, 1 reply; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-04 21:08 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
mtk.manpages, andros, hch
On Fri, Sep 04, 2015 at 04:17:02PM -0400, Anna Schumaker wrote:
> The NFS server will need a fallback for filesystems that don't have any
> kind of copy acceleration yet, and it should be generally useful to have
> an in-kernel copy to avoid lots of switches between kernel and
> user space. Users who only want a reflink can pass the COPY_REFLINK
> flag to disable the fallback.
>
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> fs/read_write.c | 16 ++++++++++++----
> include/linux/copy.h | 6 ++++++
> include/uapi/linux/Kbuild | 1 +
> include/uapi/linux/copy.h | 6 ++++++
> 4 files changed, 25 insertions(+), 4 deletions(-)
> create mode 100644 include/linux/copy.h
> create mode 100644 include/uapi/linux/copy.h
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 9dafb7f..bd7e7e2 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -7,6 +7,7 @@
> #include <linux/slab.h>
> #include <linux/stat.h>
> #include <linux/fcntl.h>
> +#include <linux/copy.h>
> #include <linux/file.h>
> #include <linux/uio.h>
> #include <linux/fsnotify.h>
> @@ -1342,7 +1343,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> struct inode *inode_out;
> ssize_t ret;
>
> - if (flags)
> + if (!((flags == 0) || (flags == COPY_REFLINK)))
I think it's a good idea for the copy_file_range flags to have the name of
the syscall in them at the beginning, both to try to avoid namespace collisions
and also to make it clear where the flag comes from. So, could we prefix the
values with "COPY_FILE_RANGE_" instead of just "COPY_"?
Also... I'm confused by what the structure of check implies about 'flags' --
are each of 'flags's values supposed to be mutually exclusive, or is it a
straight bitset like flags arguments tend to be?
Say we want to add dedupe and ponies as potential behavioral variants. Then
we'd end up with something like:
/* raw flags */
#define COPY_FILE_RANGE_SHARE_BLOCKS (1)
#define COPY_FILE_RANGE_CHECK_SAME (2)
#define COPY_FILE_RANGE_PONIES (4)
/* syntactic sugar */
#define COPY_FILE_RANGE_DEDUPE (COPY_FILE_RANGE_SHARE_BLOCKS | \
COPY_FILE_RANGE_CHECK_SAME)
#define COPY_FILE_RANGE_ALL (COPY_FILE_RANGE_SHARE_BLOCKS | \
COPY_FILE_RANGE_CHECK_SAME | \
COPY_FILE_RANGE_PONIES)
and in copy_file_range():
if (flags & ~COPY_FILE_RANGE_ALL)
return -EINVAL;
if ((flags & COPY_FILE_RANGE_CHECK_SAME) &&
!(flags & COPY_FILE_RANGE_SHARE_BLOCKS))
return -EINVAL; /* or is it return 0? */
if (flags & COPY_FILE_RANGE_PONIES)
panic();
if (flags & COPY_FILE_RANGE_CHECK_SAME)
check_same_contents(...);
err = vfs_copy_range(...);
if (err < 0 && !(flags & COPY_FILE_RANGE_SHARE_BLOCKS))
err = splice(...);
One hard part is figuring out just what actions we want userland to be able to
ask for. I can think of three: "share blocks via remapping" (i.e. reflink),
"share blocks via remapping but only if they're identical" (i.e. dedupe), and
"classic copy via pagecache". I lean towards giving each variant its own bit
and only allowing through the cases that make sense, rather than giving each
valid combination its own unique number, but maybe I misinterpreted the intent
behind the code.
(I guess there could also be 'do it with hardware assist' but that's digging
myself a pretty deep hole.)
--D
> return -EINVAL;
>
> /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
> @@ -1355,7 +1356,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> if (!(file_in->f_mode & FMODE_READ) ||
> !(file_out->f_mode & FMODE_WRITE) ||
> (file_out->f_flags & O_APPEND) ||
> - !file_out->f_op || !file_out->f_op->copy_file_range)
> + !file_in->f_op)
> return -EBADF;
>
> inode_in = file_inode(file_in);
> @@ -1377,8 +1378,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> if (ret)
> return ret;
>
> - ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
> - len, flags);
> + ret = -EOPNOTSUPP;
> + if (file_out->f_op->copy_file_range)
> + ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
> + pos_out, len, flags);
> + if ((ret < 0) && !(flags & COPY_REFLINK)) {
> + file_start_write(file_out);
> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
> + file_end_write(file_out);
> + }
> if (ret > 0) {
> fsnotify_access(file_in);
> add_rchar(current, ret);
> diff --git a/include/linux/copy.h b/include/linux/copy.h
> new file mode 100644
> index 0000000..fd54543
> --- /dev/null
> +++ b/include/linux/copy.h
> @@ -0,0 +1,6 @@
> +#ifndef _LINUX_COPY_H
> +#define _LINUX_COPY_H
> +
> +#include <uapi/linux/copy.h>
> +
> +#endif /* _LINUX_COPY_H */
> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
> index 1ff9942..b6109f3 100644
> --- a/include/uapi/linux/Kbuild
> +++ b/include/uapi/linux/Kbuild
> @@ -90,6 +90,7 @@ header-y += coda_psdev.h
> header-y += coff.h
> header-y += connector.h
> header-y += const.h
> +header-y += copy.h
> header-y += cramfs_fs.h
> header-y += cuda.h
> header-y += cyclades.h
> diff --git a/include/uapi/linux/copy.h b/include/uapi/linux/copy.h
> new file mode 100644
> index 0000000..68f3d65
> --- /dev/null
> +++ b/include/uapi/linux/copy.h
> @@ -0,0 +1,6 @@
> +#ifndef _UAPI_LINUX_COPY_H
> +#define _UAPI_LINUX_COPY_H
> +
> +#define COPY_REFLINK 1 /* only perform a reflink */
> +
> +#endif /* _UAPI_LINUX_COPY_H */
> --
> 2.5.1
>
^ permalink raw reply [flat|nested] 121+ messages in thread
* [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()
2015-09-04 20:16 ` Anna Schumaker
(?)
@ 2015-09-04 20:17 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
copy_file_range() is a new system call for copying ranges of data
completely in the kernel. This gives filesystems an opportunity to
implement some kind of "copy acceleration", such as reflinks or
server-side-copy (in the case of NFS).
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 168 insertions(+)
create mode 100644 man2/copy_file_range.2
diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
new file mode 100644
index 0000000..4a4cb73
--- /dev/null
+++ b/man2/copy_file_range.2
@@ -0,0 +1,168 @@
+.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
+.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
+.SH NAME
+copy_file_range \- Copy a range of data from one file to another
+.SH SYNOPSIS
+.nf
+.B #include <linux/copy.h>
+.B #include <sys/syscall.h>
+.B #include <unistd.h>
+
+.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
+.BI " int " fd_out ", loff_t * " off_out ", size_t " len ",
+.BI " unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR copy_file_range ()
+system call performs an in-kernel copy between two file descriptors
+without all that tedious mucking about in userspace.
+It copies up to
+.I len
+bytes of data from file descriptor
+.I fd_in
+to file descriptor
+.I fd_out
+at
+.IR off_out .
+The file descriptors must not refer to the same file.
+
+The following semantics apply for
+.IR fd_in ,
+and similar statements apply to
+.IR off_out :
+.IP * 3
+If
+.I off_in
+is NULL, then bytes are read from
+.I fd_in
+starting from the current file offset and the current
+file offset is adjusted appropriately.
+.IP *
+If
+.I off_in
+is not NULL, then
+.I off_in
+must point to a buffer that specifies the starting
+offset where bytes from
+.I fd_in
+will be read. The current file offset of
+.I fd_in
+is not changed, but
+.I off_in
+is adjusted appropriately.
+.PP
+The default behavior of
+.BR copy_file_range ()
+is filesystem specific, and might result in creating a
+copy-on-write reflink.
+In the event that a given filesystem does not implement
+any form of copy acceleration, the kernel will perform
+a deep copy of the requested range by reading bytes from
+.I fd_in
+and writing them to
+.IR fd_out .
+
+Currently, Linux only supports the following flag:
+.TP 1.9i
+.B COPY_REFLINK
+Only perform the copy if the filesystem can do it as a reflink.
+Do not fall back on performing a deep copy.
+.SH RETURN VALUE
+Upon successful completion,
+.BR copy_file_range ()
+will return the number of bytes copied between files.
+This could be less than the length originally requested.
+
+On error,
+.BR copy_file_range ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+One or more file descriptors are not valid,
+or do not have proper read-write mode.
+.TP
+.B EINVAL
+Requested range extends beyond the end of the file;
+.I flags
+argument is set to an invalid value.
+.TP
+.B EOPNOTSUPP
+.B COPY_REFLINK
+was specified in
+.IR flags ,
+but the target filesystem does not support reflinks.
+.TP
+.B EXDEV
+Target filesystem doesn't support cross-filesystem copies.
+.SH VERSIONS
+The
+.BR copy_file_range ()
+system call first appeared in Linux 4.3.
+.SH CONFORMING TO
+The
+.BR copy_file_range ()
+system call is a nonstandard Linux extension.
+.SH EXAMPLE
+.nf
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/copy.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+
+int main(int argc, char **argv)
+{
+ int fd_in, fd_out;
+ struct stat stat;
+ loff_t len, ret;
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ fd_in = open(argv[1], O_RDONLY);
+ if (fd_in == -1) {
+ perror("open (argv[1])");
+ exit(EXIT_FAILURE);
+ }
+
+ if (fstat(fd_in, &stat) == -1) {
+ perror("fstat");
+ exit(EXIT_FAILURE);
+ }
+ len = stat.st_size;
+
+ fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
+ if (fd_out == -1) {
+ perror("open (argv[2])");
+ exit(EXIT_FAILURE);
+ }
+
+ do {
+ ret = syscall(__NR_copy_file_range, fd_in, NULL,
+ fd_out, NULL, len, 0);
+ if (ret == -1) {
+ perror("copy_file_range");
+ exit(EXIT_FAILURE);
+ }
+
+ len -= ret;
+ } while (len > 0);
+
+ close(fd_in);
+ close(fd_out);
+ exit(EXIT_SUCCESS);
+}
+.fi
+.SH SEE ALSO
+.BR splice (2)
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()
@ 2015-09-04 20:17 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
copy_file_range() is a new system call for copying ranges of data
completely in the kernel. This gives filesystems an opportunity to
implement some kind of "copy acceleration", such as reflinks or
server-side-copy (in the case of NFS).
Signed-off-by: Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
---
man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 168 insertions(+)
create mode 100644 man2/copy_file_range.2
diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
new file mode 100644
index 0000000..4a4cb73
--- /dev/null
+++ b/man2/copy_file_range.2
@@ -0,0 +1,168 @@
+.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
+.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
+.SH NAME
+copy_file_range \- Copy a range of data from one file to another
+.SH SYNOPSIS
+.nf
+.B #include <linux/copy.h>
+.B #include <sys/syscall.h>
+.B #include <unistd.h>
+
+.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
+.BI " int " fd_out ", loff_t * " off_out ", size_t " len ",
+.BI " unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR copy_file_range ()
+system call performs an in-kernel copy between two file descriptors
+without all that tedious mucking about in userspace.
+It copies up to
+.I len
+bytes of data from file descriptor
+.I fd_in
+to file descriptor
+.I fd_out
+at
+.IR off_out .
+The file descriptors must not refer to the same file.
+
+The following semantics apply for
+.IR fd_in ,
+and similar statements apply to
+.IR off_out :
+.IP * 3
+If
+.I off_in
+is NULL, then bytes are read from
+.I fd_in
+starting from the current file offset and the current
+file offset is adjusted appropriately.
+.IP *
+If
+.I off_in
+is not NULL, then
+.I off_in
+must point to a buffer that specifies the starting
+offset where bytes from
+.I fd_in
+will be read. The current file offset of
+.I fd_in
+is not changed, but
+.I off_in
+is adjusted appropriately.
+.PP
+The default behavior of
+.BR copy_file_range ()
+is filesystem specific, and might result in creating a
+copy-on-write reflink.
+In the event that a given filesystem does not implement
+any form of copy acceleration, the kernel will perform
+a deep copy of the requested range by reading bytes from
+.I fd_in
+and writing them to
+.IR fd_out .
+
+Currently, Linux only supports the following flag:
+.TP 1.9i
+.B COPY_REFLINK
+Only perform the copy if the filesystem can do it as a reflink.
+Do not fall back on performing a deep copy.
+.SH RETURN VALUE
+Upon successful completion,
+.BR copy_file_range ()
+will return the number of bytes copied between files.
+This could be less than the length originally requested.
+
+On error,
+.BR copy_file_range ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+One or more file descriptors are not valid,
+or do not have proper read-write mode.
+.TP
+.B EINVAL
+Requested range extends beyond the end of the file;
+.I flags
+argument is set to an invalid value.
+.TP
+.B EOPNOTSUPP
+.B COPY_REFLINK
+was specified in
+.IR flags ,
+but the target filesystem does not support reflinks.
+.TP
+.B EXDEV
+Target filesystem doesn't support cross-filesystem copies.
+.SH VERSIONS
+The
+.BR copy_file_range ()
+system call first appeared in Linux 4.3.
+.SH CONFORMING TO
+The
+.BR copy_file_range ()
+system call is a nonstandard Linux extension.
+.SH EXAMPLE
+.nf
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/copy.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+
+int main(int argc, char **argv)
+{
+ int fd_in, fd_out;
+ struct stat stat;
+ loff_t len, ret;
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ fd_in = open(argv[1], O_RDONLY);
+ if (fd_in == -1) {
+ perror("open (argv[1])");
+ exit(EXIT_FAILURE);
+ }
+
+ if (fstat(fd_in, &stat) == -1) {
+ perror("fstat");
+ exit(EXIT_FAILURE);
+ }
+ len = stat.st_size;
+
+ fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
+ if (fd_out == -1) {
+ perror("open (argv[2])");
+ exit(EXIT_FAILURE);
+ }
+
+ do {
+ ret = syscall(__NR_copy_file_range, fd_in, NULL,
+ fd_out, NULL, len, 0);
+ if (ret == -1) {
+ perror("copy_file_range");
+ exit(EXIT_FAILURE);
+ }
+
+ len -= ret;
+ } while (len > 0);
+
+ close(fd_in);
+ close(fd_out);
+ exit(EXIT_SUCCESS);
+}
+.fi
+.SH SEE ALSO
+.BR splice (2)
--
2.5.1
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 121+ messages in thread* [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()
@ 2015-09-04 20:17 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-04 20:17 UTC (permalink / raw)
To: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
copy_file_range() is a new system call for copying ranges of data
completely in the kernel. This gives filesystems an opportunity to
implement some kind of "copy acceleration", such as reflinks or
server-side-copy (in the case of NFS).
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 168 insertions(+)
create mode 100644 man2/copy_file_range.2
diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
new file mode 100644
index 0000000..4a4cb73
--- /dev/null
+++ b/man2/copy_file_range.2
@@ -0,0 +1,168 @@
+.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
+.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
+.SH NAME
+copy_file_range \- Copy a range of data from one file to another
+.SH SYNOPSIS
+.nf
+.B #include <linux/copy.h>
+.B #include <sys/syscall.h>
+.B #include <unistd.h>
+
+.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
+.BI " int " fd_out ", loff_t * " off_out ", size_t " len ",
+.BI " unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR copy_file_range ()
+system call performs an in-kernel copy between two file descriptors
+without all that tedious mucking about in userspace.
+It copies up to
+.I len
+bytes of data from file descriptor
+.I fd_in
+to file descriptor
+.I fd_out
+at
+.IR off_out .
+The file descriptors must not refer to the same file.
+
+The following semantics apply for
+.IR fd_in ,
+and similar statements apply to
+.IR off_out :
+.IP * 3
+If
+.I off_in
+is NULL, then bytes are read from
+.I fd_in
+starting from the current file offset and the current
+file offset is adjusted appropriately.
+.IP *
+If
+.I off_in
+is not NULL, then
+.I off_in
+must point to a buffer that specifies the starting
+offset where bytes from
+.I fd_in
+will be read. The current file offset of
+.I fd_in
+is not changed, but
+.I off_in
+is adjusted appropriately.
+.PP
+The default behavior of
+.BR copy_file_range ()
+is filesystem specific, and might result in creating a
+copy-on-write reflink.
+In the event that a given filesystem does not implement
+any form of copy acceleration, the kernel will perform
+a deep copy of the requested range by reading bytes from
+.I fd_in
+and writing them to
+.IR fd_out .
+
+Currently, Linux only supports the following flag:
+.TP 1.9i
+.B COPY_REFLINK
+Only perform the copy if the filesystem can do it as a reflink.
+Do not fall back on performing a deep copy.
+.SH RETURN VALUE
+Upon successful completion,
+.BR copy_file_range ()
+will return the number of bytes copied between files.
+This could be less than the length originally requested.
+
+On error,
+.BR copy_file_range ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+One or more file descriptors are not valid,
+or do not have proper read-write mode.
+.TP
+.B EINVAL
+Requested range extends beyond the end of the file;
+.I flags
+argument is set to an invalid value.
+.TP
+.B EOPNOTSUPP
+.B COPY_REFLINK
+was specified in
+.IR flags ,
+but the target filesystem does not support reflinks.
+.TP
+.B EXDEV
+Target filesystem doesn't support cross-filesystem copies.
+.SH VERSIONS
+The
+.BR copy_file_range ()
+system call first appeared in Linux 4.3.
+.SH CONFORMING TO
+The
+.BR copy_file_range ()
+system call is a nonstandard Linux extension.
+.SH EXAMPLE
+.nf
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/copy.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+
+int main(int argc, char **argv)
+{
+ int fd_in, fd_out;
+ struct stat stat;
+ loff_t len, ret;
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ fd_in = open(argv[1], O_RDONLY);
+ if (fd_in == -1) {
+ perror("open (argv[1])");
+ exit(EXIT_FAILURE);
+ }
+
+ if (fstat(fd_in, &stat) == -1) {
+ perror("fstat");
+ exit(EXIT_FAILURE);
+ }
+ len = stat.st_size;
+
+ fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
+ if (fd_out == -1) {
+ perror("open (argv[2])");
+ exit(EXIT_FAILURE);
+ }
+
+ do {
+ ret = syscall(__NR_copy_file_range, fd_in, NULL,
+ fd_out, NULL, len, 0);
+ if (ret == -1) {
+ perror("copy_file_range");
+ exit(EXIT_FAILURE);
+ }
+
+ len -= ret;
+ } while (len > 0);
+
+ close(fd_in);
+ close(fd_out);
+ exit(EXIT_SUCCESS);
+}
+.fi
+.SH SEE ALSO
+.BR splice (2)
--
2.5.1
^ permalink raw reply related [flat|nested] 121+ messages in thread* Re: [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()
2015-09-04 20:17 ` Anna Schumaker
(?)
(?)
@ 2015-09-04 21:38 ` Darrick J. Wong
[not found] ` <20150904213856.GC10391-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
-1 siblings, 1 reply; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-04 21:38 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
mtk.manpages, andros, hch
On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
> copy_file_range() is a new system call for copying ranges of data
> completely in the kernel. This gives filesystems an opportunity to
> implement some kind of "copy acceleration", such as reflinks or
> server-side-copy (in the case of NFS).
>
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> man2/copy_file_range.2 | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 168 insertions(+)
> create mode 100644 man2/copy_file_range.2
>
> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> new file mode 100644
> index 0000000..4a4cb73
> --- /dev/null
> +++ b/man2/copy_file_range.2
> @@ -0,0 +1,168 @@
> +.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +copy_file_range \- Copy a range of data from one file to another
> +.SH SYNOPSIS
> +.nf
> +.B #include <linux/copy.h>
> +.B #include <sys/syscall.h>
> +.B #include <unistd.h>
> +
> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in ",
> +.BI " int " fd_out ", loff_t * " off_out ", size_t " len ",
> +.BI " unsigned int " flags );
> +.fi
> +.SH DESCRIPTION
> +The
> +.BR copy_file_range ()
> +system call performs an in-kernel copy between two file descriptors
> +without all that tedious mucking about in userspace.
;)
> +It copies up to
> +.I len
> +bytes of data from file descriptor
> +.I fd_in
> +to file descriptor
> +.I fd_out
> +at
> +.IR off_out .
> +The file descriptors must not refer to the same file.
Why? btrfs (and XFS) reflink can handle the case of a file sharing blocks
with itself.
> +
> +The following semantics apply for
> +.IR fd_in ,
> +and similar statements apply to
> +.IR off_out :
> +.IP * 3
> +If
> +.I off_in
> +is NULL, then bytes are read from
> +.I fd_in
> +starting from the current file offset and the current
> +file offset is adjusted appropriately.
> +.IP *
> +If
> +.I off_in
> +is not NULL, then
> +.I off_in
> +must point to a buffer that specifies the starting
> +offset where bytes from
> +.I fd_in
> +will be read. The current file offset of
> +.I fd_in
> +is not changed, but
> +.I off_in
> +is adjusted appropriately.
> +.PP
> +The default behavior of
> +.BR copy_file_range ()
> +is filesystem specific, and might result in creating a
> +copy-on-write reflink.
> +In the event that a given filesystem does not implement
> +any form of copy acceleration, the kernel will perform
> +a deep copy of the requested range by reading bytes from
I wonder if it's wise to allow deep copies -- what happens if len == 1T?
Will this syscall just block for a really long time?
> +.I fd_in
> +and writing them to
> +.IR fd_out .
"...if COPY_REFLINK is not set in flags."
> +
> +Currently, Linux only supports the following flag:
> +.TP 1.9i
> +.B COPY_REFLINK
> +Only perform the copy if the filesystem can do it as a reflink.
> +Do not fall back on performing a deep copy.
> +.SH RETURN VALUE
> +Upon successful completion,
> +.BR copy_file_range ()
> +will return the number of bytes copied between files.
> +This could be less than the length originally requested.
> +
> +On error,
> +.BR copy_file_range ()
> +returns \-1 and
> +.I errno
> +is set to indicate the error.
> +.SH ERRORS
> +.TP
> +.B EBADF
> +One or more file descriptors are not valid,
> +or do not have proper read-write mode.
"or fd_out is not opened for writing"?
> +.TP
> +.B EINVAL
> +Requested range extends beyond the end of the file;
> +.I flags
> +argument is set to an invalid value.
> +.TP
> +.B EOPNOTSUPP
> +.B COPY_REFLINK
> +was specified in
> +.IR flags ,
> +but the target filesystem does not support reflinks.
> +.TP
> +.B EXDEV
> +Target filesystem doesn't support cross-filesystem copies.
> +.SH VERSIONS
Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
that can be returned? (I was looking at the fallocate manpage.)
--D
> +The
> +.BR copy_file_range ()
> +system call first appeared in Linux 4.3.
> +.SH CONFORMING TO
> +The
> +.BR copy_file_range ()
> +system call is a nonstandard Linux extension.
> +.SH EXAMPLE
> +.nf
> +
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <linux/copy.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/stat.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +
> +int main(int argc, char **argv)
> +{
> + int fd_in, fd_out;
> + struct stat stat;
> + loff_t len, ret;
> +
> + if (argc != 3) {
> + fprintf(stderr, "Usage: %s <pathname> <pathname>\n", argv[0]);
> + exit(EXIT_FAILURE);
> + }
> +
> + fd_in = open(argv[1], O_RDONLY);
> + if (fd_in == -1) {
> + perror("open (argv[1])");
> + exit(EXIT_FAILURE);
> + }
> +
> + if (fstat(fd_in, &stat) == -1) {
> + perror("fstat");
> + exit(EXIT_FAILURE);
> + }
> + len = stat.st_size;
> +
> + fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
> + if (fd_out == -1) {
> + perror("open (argv[2])");
> + exit(EXIT_FAILURE);
> + }
> +
> + do {
> + ret = syscall(__NR_copy_file_range, fd_in, NULL,
> + fd_out, NULL, len, 0);
> + if (ret == -1) {
> + perror("copy_file_range");
> + exit(EXIT_FAILURE);
> + }
> +
> + len -= ret;
> + } while (len > 0);
> +
> + close(fd_in);
> + close(fd_out);
> + exit(EXIT_SUCCESS);
> +}
> +.fi
> +.SH SEE ALSO
> +.BR splice (2)
> --
> 2.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-04 20:16 ` Anna Schumaker
@ 2015-09-04 22:25 ` Andreas Dilger
-1 siblings, 0 replies; 121+ messages in thread
From: Andreas Dilger @ 2015-09-04 22:25 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
On Sep 4, 2015, at 2:16 PM, Anna Schumaker <Anna.Schumaker-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org> wrote:
>
> Copy system calls came up during Plumbers a couple of weeks ago,
> because several filesystems (including NFS and XFS) are currently
> working on copy acceleration implementations. We haven't heard from
> Zach Brown in a while, so I volunteered to push his patches upstream
> so individual filesystems don't need to keep writing their own ioctls.
>
> The first three patches are a simple reposting of Zach's patches
> from several months ago, with one minor error code fix. The remaining
> patches add in a fallback mechanism when filesystems don't provide a
> copy function. This is especially useful when doing a server-side
> copy on NFS (using the new COPY operation in NFS v4.2). This fallback
> can be disabled by passing the flag COPY_REFLINK to the system call.
>
> The last patch is a man page patch documenting this new system call,
> including an example program.
>
> I tested the fallback option by using /dev/urandom to generate files
> of varying sizes and copying them. I compared the time to copy
> against that of `cp` just to see if there is a noticable difference.
> I found that runtimes are roughly the same, but in-kernel copy tends
> to use less of the cpu. Values in the tables below are averages
> across multiple trials.
>
>
> /usr/bin/cp | 512 MB | 1024 MB | 1536 MB | 2048 MB
> -------------|-----------|-----------|-----------|-----------
> user | 0.00s | 0.00s | 0.00s | 0.00s
> system | 0.32s | 0.52s | 1.04s | 1.04s
> cpu | 73% | 69% | 62% | 62%
> total | 0.446 | 0.757 | 1.197 | 1.667
>
>
> VFS copy | 512 MB | 1024 MB | 1536 MB | 2048 MB
> -------------|-----------|-----------|-----------|-----------
> user | 0.00s | 0.00s | 0.00s | 0.00s
> system | 0.33s | 0.49s | 0.76s | 0.99s
> cpu | 77% | 62% | 60% | 59%
> total | 0.422 | 0.777 | 1.267 | 1.655
>
>
> Questions? Comments? Thoughts?
This is a bit of a surprising result, since in my testing in the
past, copy_{to/from}_user() is a major consumer of CPU time (50%
of a CPU core at 1GB/s). What backing filesystem did you test on?
In theory, the VFS copy routines should save at least 50% of the
CPU usage since it only needs to make one copy (src->dest) instead
of two (kernel->user, user->kernel). Ideally it wouldn't make any
data copies at all and just pass page references from the source
to the target.
Cheers, Andreas
>
> Anna
>
>
> Anna Schumaker (5):
> btrfs: Add mountpoint checking during btrfs_copy_file_range
> vfs: Remove copy_file_range mountpoint checks
> vfs: Copy should check len after file open mode
> vfs: Copy should use file_out rather than file_in
> vfs: Fall back on splice if no copy function defined
>
> Zach Brown (3):
> vfs: add copy_file_range syscall and vfs helper
> x86: add sys_copy_file_range to syscall tables
> btrfs: add .copy_file_range file operation
>
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/btrfs/ctree.h | 3 +
> fs/btrfs/file.c | 1 +
> fs/btrfs/ioctl.c | 95 ++++++++++++++----------
> fs/read_write.c | 132 +++++++++++++++++++++++++++++++++
> include/linux/copy.h | 6 ++
> include/linux/fs.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> include/uapi/linux/Kbuild | 1 +
> include/uapi/linux/copy.h | 6 ++
> kernel/sys_ni.c | 1 +
> 12 files changed, 214 insertions(+), 40 deletions(-)
> create mode 100644 include/linux/copy.h
> create mode 100644 include/uapi/linux/copy.h
>
> --
> 2.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Cheers, Andreas
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-04 22:25 ` Andreas Dilger
0 siblings, 0 replies; 121+ messages in thread
From: Andreas Dilger @ 2015-09-04 22:25 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
On Sep 4, 2015, at 2:16 PM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:
>
> Copy system calls came up during Plumbers a couple of weeks ago,
> because several filesystems (including NFS and XFS) are currently
> working on copy acceleration implementations. We haven't heard from
> Zach Brown in a while, so I volunteered to push his patches upstream
> so individual filesystems don't need to keep writing their own ioctls.
>
> The first three patches are a simple reposting of Zach's patches
> from several months ago, with one minor error code fix. The remaining
> patches add in a fallback mechanism when filesystems don't provide a
> copy function. This is especially useful when doing a server-side
> copy on NFS (using the new COPY operation in NFS v4.2). This fallback
> can be disabled by passing the flag COPY_REFLINK to the system call.
>
> The last patch is a man page patch documenting this new system call,
> including an example program.
>
> I tested the fallback option by using /dev/urandom to generate files
> of varying sizes and copying them. I compared the time to copy
> against that of `cp` just to see if there is a noticable difference.
> I found that runtimes are roughly the same, but in-kernel copy tends
> to use less of the cpu. Values in the tables below are averages
> across multiple trials.
>
>
> /usr/bin/cp | 512 MB | 1024 MB | 1536 MB | 2048 MB
> -------------|-----------|-----------|-----------|-----------
> user | 0.00s | 0.00s | 0.00s | 0.00s
> system | 0.32s | 0.52s | 1.04s | 1.04s
> cpu | 73% | 69% | 62% | 62%
> total | 0.446 | 0.757 | 1.197 | 1.667
>
>
> VFS copy | 512 MB | 1024 MB | 1536 MB | 2048 MB
> -------------|-----------|-----------|-----------|-----------
> user | 0.00s | 0.00s | 0.00s | 0.00s
> system | 0.33s | 0.49s | 0.76s | 0.99s
> cpu | 77% | 62% | 60% | 59%
> total | 0.422 | 0.777 | 1.267 | 1.655
>
>
> Questions? Comments? Thoughts?
This is a bit of a surprising result, since in my testing in the
past, copy_{to/from}_user() is a major consumer of CPU time (50%
of a CPU core at 1GB/s). What backing filesystem did you test on?
In theory, the VFS copy routines should save at least 50% of the
CPU usage since it only needs to make one copy (src->dest) instead
of two (kernel->user, user->kernel). Ideally it wouldn't make any
data copies at all and just pass page references from the source
to the target.
Cheers, Andreas
>
> Anna
>
>
> Anna Schumaker (5):
> btrfs: Add mountpoint checking during btrfs_copy_file_range
> vfs: Remove copy_file_range mountpoint checks
> vfs: Copy should check len after file open mode
> vfs: Copy should use file_out rather than file_in
> vfs: Fall back on splice if no copy function defined
>
> Zach Brown (3):
> vfs: add copy_file_range syscall and vfs helper
> x86: add sys_copy_file_range to syscall tables
> btrfs: add .copy_file_range file operation
>
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/btrfs/ctree.h | 3 +
> fs/btrfs/file.c | 1 +
> fs/btrfs/ioctl.c | 95 ++++++++++++++----------
> fs/read_write.c | 132 +++++++++++++++++++++++++++++++++
> include/linux/copy.h | 6 ++
> include/linux/fs.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> include/uapi/linux/Kbuild | 1 +
> include/uapi/linux/copy.h | 6 ++
> kernel/sys_ni.c | 1 +
> 12 files changed, 214 insertions(+), 40 deletions(-)
> create mode 100644 include/linux/copy.h
> create mode 100644 include/uapi/linux/copy.h
>
> --
> 2.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Cheers, Andreas
^ permalink raw reply [flat|nested] 121+ messages in thread[parent not found: <4B41043F-5D85-42D6-8F20-2DCC45930EF4-m1MBpc4rdrD3fQ9qLvQP4Q@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-04 22:25 ` Andreas Dilger
@ 2015-09-05 8:33 ` Al Viro
-1 siblings, 0 replies; 121+ messages in thread
From: Al Viro @ 2015-09-05 8:33 UTC (permalink / raw)
To: Andreas Dilger
Cc: Anna Schumaker, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
clm-b10kYP2dOMg, darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
> This is a bit of a surprising result, since in my testing in the
> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> of a CPU core at 1GB/s). What backing filesystem did you test on?
While we are at it, was cp(1) using read(2)/write(2) loop or was it using
something else (sendfile(2), for example)?
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-05 8:33 ` Al Viro
0 siblings, 0 replies; 121+ messages in thread
From: Al Viro @ 2015-09-05 8:33 UTC (permalink / raw)
To: Andreas Dilger
Cc: Anna Schumaker, linux-nfs, linux-btrfs, linux-fsdevel, linux-api,
zab, clm, darrick.wong, mtk.manpages, andros, hch
On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
> This is a bit of a surprising result, since in my testing in the
> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> of a CPU core at 1GB/s). What backing filesystem did you test on?
While we are at it, was cp(1) using read(2)/write(2) loop or was it using
something else (sendfile(2), for example)?
^ permalink raw reply [flat|nested] 121+ messages in thread[parent not found: <20150905083342.GG22011-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-05 8:33 ` Al Viro
(?)
@ 2015-09-08 15:08 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 15:08 UTC (permalink / raw)
To: Al Viro, Andreas Dilger
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
clm-b10kYP2dOMg, darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
On 09/05/2015 04:33 AM, Al Viro wrote:
> On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
>
>> This is a bit of a surprising result, since in my testing in the
>> past, copy_{to/from}_user() is a major consumer of CPU time (50%
>> of a CPU core at 1GB/s). What backing filesystem did you test on?
>
> While we are at it, was cp(1) using read(2)/write(2) loop or was it using
> something else (sendfile(2), for example)?
cp uses a read / write loop, and has some heuristics for guessing an optimum buffer size.
Anna
>
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 15:08 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 15:08 UTC (permalink / raw)
To: Al Viro, Andreas Dilger
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
clm-b10kYP2dOMg, darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
On 09/05/2015 04:33 AM, Al Viro wrote:
> On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
>
>> This is a bit of a surprising result, since in my testing in the
>> past, copy_{to/from}_user() is a major consumer of CPU time (50%
>> of a CPU core at 1GB/s). What backing filesystem did you test on?
>
> While we are at it, was cp(1) using read(2)/write(2) loop or was it using
> something else (sendfile(2), for example)?
cp uses a read / write loop, and has some heuristics for guessing an optimum buffer size.
Anna
>
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 15:08 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 15:08 UTC (permalink / raw)
To: Al Viro, Andreas Dilger
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, clm,
darrick.wong, mtk.manpages, andros, hch
On 09/05/2015 04:33 AM, Al Viro wrote:
> On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
>
>> This is a bit of a surprising result, since in my testing in the
>> past, copy_{to/from}_user() is a major consumer of CPU time (50%
>> of a CPU core at 1GB/s). What backing filesystem did you test on?
>
> While we are at it, was cp(1) using read(2)/write(2) loop or was it using
> something else (sendfile(2), for example)?
cp uses a read / write loop, and has some heuristics for guessing an optimum buffer size.
Anna
>
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 15:08 ` Anna Schumaker
(?)
(?)
@ 2015-09-08 20:45 ` Darrick J. Wong
[not found] ` <20150908204517.GC30681-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
-1 siblings, 1 reply; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-08 20:45 UTC (permalink / raw)
To: Anna Schumaker
Cc: Al Viro, Andreas Dilger, linux-nfs, linux-btrfs, linux-fsdevel,
linux-api, zab, clm, mtk.manpages, andros, hch
On Tue, Sep 08, 2015 at 11:08:03AM -0400, Anna Schumaker wrote:
> On 09/05/2015 04:33 AM, Al Viro wrote:
> > On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
> >
> >> This is a bit of a surprising result, since in my testing in the
> >> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> >> of a CPU core at 1GB/s). What backing filesystem did you test on?
> >
> > While we are at it, was cp(1) using read(2)/write(2) loop or was it using
> > something else (sendfile(2), for example)?
>
> cp uses a read / write loop, and has some heuristics for guessing an optimum buffer size.
..but afaict cp doesn't fsync at the end, which means it's possible that
the destination file's blocks are still delalloc and nothing's been flushed
to disk yet. What happens if you time (cp /tmp/a /tmp/b ; sync) ?
2048M / 1.667s = ~1200MB/s.
--D
>
> Anna
>
> >
>
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-04 22:25 ` Andreas Dilger
@ 2015-09-08 15:07 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 15:07 UTC (permalink / raw)
To: Andreas Dilger
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
On 09/04/2015 06:25 PM, Andreas Dilger wrote:
> On Sep 4, 2015, at 2:16 PM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:
>>
>> Copy system calls came up during Plumbers a couple of weeks ago,
>> because several filesystems (including NFS and XFS) are currently
>> working on copy acceleration implementations. We haven't heard from
>> Zach Brown in a while, so I volunteered to push his patches upstream
>> so individual filesystems don't need to keep writing their own ioctls.
>>
>> The first three patches are a simple reposting of Zach's patches
>> from several months ago, with one minor error code fix. The remaining
>> patches add in a fallback mechanism when filesystems don't provide a
>> copy function. This is especially useful when doing a server-side
>> copy on NFS (using the new COPY operation in NFS v4.2). This fallback
>> can be disabled by passing the flag COPY_REFLINK to the system call.
>>
>> The last patch is a man page patch documenting this new system call,
>> including an example program.
>>
>> I tested the fallback option by using /dev/urandom to generate files
>> of varying sizes and copying them. I compared the time to copy
>> against that of `cp` just to see if there is a noticable difference.
>> I found that runtimes are roughly the same, but in-kernel copy tends
>> to use less of the cpu. Values in the tables below are averages
>> across multiple trials.
>>
>>
>> /usr/bin/cp | 512 MB | 1024 MB | 1536 MB | 2048 MB
>> -------------|-----------|-----------|-----------|-----------
>> user | 0.00s | 0.00s | 0.00s | 0.00s
>> system | 0.32s | 0.52s | 1.04s | 1.04s
>> cpu | 73% | 69% | 62% | 62%
>> total | 0.446 | 0.757 | 1.197 | 1.667
>>
>>
>> VFS copy | 512 MB | 1024 MB | 1536 MB | 2048 MB
>> -------------|-----------|-----------|-----------|-----------
>> user | 0.00s | 0.00s | 0.00s | 0.00s
>> system | 0.33s | 0.49s | 0.76s | 0.99s
>> cpu | 77% | 62% | 60% | 59%
>> total | 0.422 | 0.777 | 1.267 | 1.655
>>
>>
>> Questions? Comments? Thoughts?
>
> This is a bit of a surprising result, since in my testing in the
> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> of a CPU core at 1GB/s). What backing filesystem did you test on?
I tested using XFS against two KVM guests. Maybe something there is adding the extra cpu cycles?
Anna
>
> In theory, the VFS copy routines should save at least 50% of the
> CPU usage since it only needs to make one copy (src->dest) instead
> of two (kernel->user, user->kernel). Ideally it wouldn't make any
> data copies at all and just pass page references from the source
> to the target.
>
> Cheers, Andreas
>>
>> Anna
>>
>>
>> Anna Schumaker (5):
>> btrfs: Add mountpoint checking during btrfs_copy_file_range
>> vfs: Remove copy_file_range mountpoint checks
>> vfs: Copy should check len after file open mode
>> vfs: Copy should use file_out rather than file_in
>> vfs: Fall back on splice if no copy function defined
>>
>> Zach Brown (3):
>> vfs: add copy_file_range syscall and vfs helper
>> x86: add sys_copy_file_range to syscall tables
>> btrfs: add .copy_file_range file operation
>>
>> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/btrfs/ctree.h | 3 +
>> fs/btrfs/file.c | 1 +
>> fs/btrfs/ioctl.c | 95 ++++++++++++++----------
>> fs/read_write.c | 132 +++++++++++++++++++++++++++++++++
>> include/linux/copy.h | 6 ++
>> include/linux/fs.h | 3 +
>> include/uapi/asm-generic/unistd.h | 4 +-
>> include/uapi/linux/Kbuild | 1 +
>> include/uapi/linux/copy.h | 6 ++
>> kernel/sys_ni.c | 1 +
>> 12 files changed, 214 insertions(+), 40 deletions(-)
>> create mode 100644 include/linux/copy.h
>> create mode 100644 include/uapi/linux/copy.h
>>
>> --
>> 2.5.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
> Cheers, Andreas
>
>
>
>
>
^ permalink raw reply [flat|nested] 121+ messages in thread* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 15:07 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 15:07 UTC (permalink / raw)
To: Andreas Dilger
Cc: linux-nfs, linux-btrfs, linux-fsdevel, linux-api, zab, viro, clm,
darrick.wong, mtk.manpages, andros, hch
On 09/04/2015 06:25 PM, Andreas Dilger wrote:
> On Sep 4, 2015, at 2:16 PM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:
>>
>> Copy system calls came up during Plumbers a couple of weeks ago,
>> because several filesystems (including NFS and XFS) are currently
>> working on copy acceleration implementations. We haven't heard from
>> Zach Brown in a while, so I volunteered to push his patches upstream
>> so individual filesystems don't need to keep writing their own ioctls.
>>
>> The first three patches are a simple reposting of Zach's patches
>> from several months ago, with one minor error code fix. The remaining
>> patches add in a fallback mechanism when filesystems don't provide a
>> copy function. This is especially useful when doing a server-side
>> copy on NFS (using the new COPY operation in NFS v4.2). This fallback
>> can be disabled by passing the flag COPY_REFLINK to the system call.
>>
>> The last patch is a man page patch documenting this new system call,
>> including an example program.
>>
>> I tested the fallback option by using /dev/urandom to generate files
>> of varying sizes and copying them. I compared the time to copy
>> against that of `cp` just to see if there is a noticable difference.
>> I found that runtimes are roughly the same, but in-kernel copy tends
>> to use less of the cpu. Values in the tables below are averages
>> across multiple trials.
>>
>>
>> /usr/bin/cp | 512 MB | 1024 MB | 1536 MB | 2048 MB
>> -------------|-----------|-----------|-----------|-----------
>> user | 0.00s | 0.00s | 0.00s | 0.00s
>> system | 0.32s | 0.52s | 1.04s | 1.04s
>> cpu | 73% | 69% | 62% | 62%
>> total | 0.446 | 0.757 | 1.197 | 1.667
>>
>>
>> VFS copy | 512 MB | 1024 MB | 1536 MB | 2048 MB
>> -------------|-----------|-----------|-----------|-----------
>> user | 0.00s | 0.00s | 0.00s | 0.00s
>> system | 0.33s | 0.49s | 0.76s | 0.99s
>> cpu | 77% | 62% | 60% | 59%
>> total | 0.422 | 0.777 | 1.267 | 1.655
>>
>>
>> Questions? Comments? Thoughts?
>
> This is a bit of a surprising result, since in my testing in the
> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> of a CPU core at 1GB/s). What backing filesystem did you test on?
I tested using XFS against two KVM guests. Maybe something there is adding the extra cpu cycles?
Anna
>
> In theory, the VFS copy routines should save at least 50% of the
> CPU usage since it only needs to make one copy (src->dest) instead
> of two (kernel->user, user->kernel). Ideally it wouldn't make any
> data copies at all and just pass page references from the source
> to the target.
>
> Cheers, Andreas
>>
>> Anna
>>
>>
>> Anna Schumaker (5):
>> btrfs: Add mountpoint checking during btrfs_copy_file_range
>> vfs: Remove copy_file_range mountpoint checks
>> vfs: Copy should check len after file open mode
>> vfs: Copy should use file_out rather than file_in
>> vfs: Fall back on splice if no copy function defined
>>
>> Zach Brown (3):
>> vfs: add copy_file_range syscall and vfs helper
>> x86: add sys_copy_file_range to syscall tables
>> btrfs: add .copy_file_range file operation
>>
>> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/btrfs/ctree.h | 3 +
>> fs/btrfs/file.c | 1 +
>> fs/btrfs/ioctl.c | 95 ++++++++++++++----------
>> fs/read_write.c | 132 +++++++++++++++++++++++++++++++++
>> include/linux/copy.h | 6 ++
>> include/linux/fs.h | 3 +
>> include/uapi/asm-generic/unistd.h | 4 +-
>> include/uapi/linux/Kbuild | 1 +
>> include/uapi/linux/copy.h | 6 ++
>> kernel/sys_ni.c | 1 +
>> 12 files changed, 214 insertions(+), 40 deletions(-)
>> create mode 100644 include/linux/copy.h
>> create mode 100644 include/uapi/linux/copy.h
>>
>> --
>> 2.5.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
> Cheers, Andreas
>
>
>
>
>
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-04 20:16 ` Anna Schumaker
@ 2015-09-08 15:21 ` Pádraig Brady
-1 siblings, 0 replies; 121+ messages in thread
From: Pádraig Brady @ 2015-09-08 15:21 UTC (permalink / raw)
To: Anna Schumaker, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
On 04/09/15 21:16, Anna Schumaker wrote:
> Copy system calls came up during Plumbers a couple of weeks ago, because
> several filesystems (including NFS and XFS) are currently working on copy
> acceleration implementations. We haven't heard from Zach Brown in a while,
> so I volunteered to push his patches upstream so individual filesystems
> don't need to keep writing their own ioctls.
Just mentioning that this is just pertaining to the data, not the metadata.
Providing metadata copying facilities would be _very_ useful, as
most file system specific details relate to the metadata, and having
VFS operations for that would avoid the plethora of details in each userspace tool,
and theoretically support translations between disparate metadata.
> The first three patches are a simple reposting of Zach's patches from several
> months ago, with one minor error code fix. The remaining patches add in a
> fallback mechanism when filesystems don't provide a copy function. This is
> especially useful when doing a server-side copy on NFS (using the new COPY
> operation in NFS v4.2). This fallback can be disabled by passing the flag
> COPY_REFLINK to the system call.
I see copy_file_range() is a reflink() on BTRFS?
That's a bit surprising, as it avoids the copy completely.
cp(1) for example considered doing a BTRFS clone by default,
but didn't due to expectations that users actually wanted
the data duplicated on disk for resilience reasons,
and for performance reasons so that write latencies were
restricted to the copy operation, rather than being
introduced at usage time as the dest file is CoW'd.
If reflink() is a possibility for copy_file_range()
then could it be done optionally with a flag?
thanks,
Pádraig
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 15:21 ` Pádraig Brady
0 siblings, 0 replies; 121+ messages in thread
From: Pádraig Brady @ 2015-09-08 15:21 UTC (permalink / raw)
To: Anna Schumaker, linux-nfs, linux-btrfs, linux-fsdevel, linux-api,
zab, viro, clm, darrick.wong, mtk.manpages, andros, hch
On 04/09/15 21:16, Anna Schumaker wrote:
> Copy system calls came up during Plumbers a couple of weeks ago, because
> several filesystems (including NFS and XFS) are currently working on copy
> acceleration implementations. We haven't heard from Zach Brown in a while,
> so I volunteered to push his patches upstream so individual filesystems
> don't need to keep writing their own ioctls.
Just mentioning that this is just pertaining to the data, not the metadata.
Providing metadata copying facilities would be _very_ useful, as
most file system specific details relate to the metadata, and having
VFS operations for that would avoid the plethora of details in each userspace tool,
and theoretically support translations between disparate metadata.
> The first three patches are a simple reposting of Zach's patches from several
> months ago, with one minor error code fix. The remaining patches add in a
> fallback mechanism when filesystems don't provide a copy function. This is
> especially useful when doing a server-side copy on NFS (using the new COPY
> operation in NFS v4.2). This fallback can be disabled by passing the flag
> COPY_REFLINK to the system call.
I see copy_file_range() is a reflink() on BTRFS?
That's a bit surprising, as it avoids the copy completely.
cp(1) for example considered doing a BTRFS clone by default,
but didn't due to expectations that users actually wanted
the data duplicated on disk for resilience reasons,
and for performance reasons so that write latencies were
restricted to the copy operation, rather than being
introduced at usage time as the dest file is CoW'd.
If reflink() is a possibility for copy_file_range()
then could it be done optionally with a flag?
thanks,
Pádraig
^ permalink raw reply [flat|nested] 121+ messages in thread
[parent not found: <55EEFCEE.5090000-V8g9lnOeT5ydJdNcDFJN0w@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 15:21 ` Pádraig Brady
@ 2015-09-08 18:23 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 18:23 UTC (permalink / raw)
To: Pádraig Brady, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
linux-btrfs-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, zab-ugsP4Wv/S6ZeoWH0uzbU5w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, clm-b10kYP2dOMg,
darrick.wong-QHcLZuEGTsvQT0dZR+AlfA,
mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
andros-HgOvQuBEEgTQT0dZR+AlfA, hch-wEGCiKHe2LqWVfeAwA7xHQ
On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> On 04/09/15 21:16, Anna Schumaker wrote:
>> Copy system calls came up during Plumbers a couple of weeks ago, because
>> several filesystems (including NFS and XFS) are currently working on copy
>> acceleration implementations. We haven't heard from Zach Brown in a while,
>> so I volunteered to push his patches upstream so individual filesystems
>> don't need to keep writing their own ioctls.
>
> Just mentioning that this is just pertaining to the data, not the metadata.
> Providing metadata copying facilities would be _very_ useful, as
> most file system specific details relate to the metadata, and having
> VFS operations for that would avoid the plethora of details in each userspace tool,
> and theoretically support translations between disparate metadata.
Metadata copying could be added later if somebody really wants it.
>
>> The first three patches are a simple reposting of Zach's patches from several
>> months ago, with one minor error code fix. The remaining patches add in a
>> fallback mechanism when filesystems don't provide a copy function. This is
>> especially useful when doing a server-side copy on NFS (using the new COPY
>> operation in NFS v4.2). This fallback can be disabled by passing the flag
>> COPY_REFLINK to the system call.
>
> I see copy_file_range() is a reflink() on BTRFS?
> That's a bit surprising, as it avoids the copy completely.
> cp(1) for example considered doing a BTRFS clone by default,
> but didn't due to expectations that users actually wanted
> the data duplicated on disk for resilience reasons,
> and for performance reasons so that write latencies were
> restricted to the copy operation, rather than being
> introduced at usage time as the dest file is CoW'd.
>
> If reflink() is a possibility for copy_file_range()
> then could it be done optionally with a flag?
The idea is that filesystems get to choose how to handle copies in the default case. BTRFS could do a reflink, but NFS could do a server side copy instead. I can change the default behavior to only do a data copy (unless the reflink flag is specified) instead, if that is desirable.
What does everybody think?
Anna
>
> thanks,
> Pádraig
>
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 18:23 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-08 18:23 UTC (permalink / raw)
To: Pádraig Brady, linux-nfs, linux-btrfs, linux-fsdevel,
linux-api, zab, viro, clm, darrick.wong, mtk.manpages, andros,
hch
On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> On 04/09/15 21:16, Anna Schumaker wrote:
>> Copy system calls came up during Plumbers a couple of weeks ago, because
>> several filesystems (including NFS and XFS) are currently working on copy
>> acceleration implementations. We haven't heard from Zach Brown in a while,
>> so I volunteered to push his patches upstream so individual filesystems
>> don't need to keep writing their own ioctls.
>
> Just mentioning that this is just pertaining to the data, not the metadata.
> Providing metadata copying facilities would be _very_ useful, as
> most file system specific details relate to the metadata, and having
> VFS operations for that would avoid the plethora of details in each userspace tool,
> and theoretically support translations between disparate metadata.
Metadata copying could be added later if somebody really wants it.
>
>> The first three patches are a simple reposting of Zach's patches from several
>> months ago, with one minor error code fix. The remaining patches add in a
>> fallback mechanism when filesystems don't provide a copy function. This is
>> especially useful when doing a server-side copy on NFS (using the new COPY
>> operation in NFS v4.2). This fallback can be disabled by passing the flag
>> COPY_REFLINK to the system call.
>
> I see copy_file_range() is a reflink() on BTRFS?
> That's a bit surprising, as it avoids the copy completely.
> cp(1) for example considered doing a BTRFS clone by default,
> but didn't due to expectations that users actually wanted
> the data duplicated on disk for resilience reasons,
> and for performance reasons so that write latencies were
> restricted to the copy operation, rather than being
> introduced at usage time as the dest file is CoW'd.
>
> If reflink() is a possibility for copy_file_range()
> then could it be done optionally with a flag?
The idea is that filesystems get to choose how to handle copies in the default case. BTRFS could do a reflink, but NFS could do a server side copy instead. I can change the default behavior to only do a data copy (unless the reflink flag is specified) instead, if that is desirable.
What does everybody think?
Anna
>
> thanks,
> Pádraig
>
^ permalink raw reply [flat|nested] 121+ messages in thread
[parent not found: <55EF279B.3020101-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 18:23 ` Anna Schumaker
@ 2015-09-08 19:10 ` Andy Lutomirski
-1 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-08 19:10 UTC (permalink / raw)
To: Anna Schumaker
Cc: Pádraig Brady, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Darrick J. Wong,
Michael Kerrisk-manpages, andros-HgOvQuBEEgTQT0dZR+AlfA,
Christoph Hellwig
On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
<Anna.Schumaker-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org> wrote:
> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> I see copy_file_range() is a reflink() on BTRFS?
>> That's a bit surprising, as it avoids the copy completely.
>> cp(1) for example considered doing a BTRFS clone by default,
>> but didn't due to expectations that users actually wanted
>> the data duplicated on disk for resilience reasons,
>> and for performance reasons so that write latencies were
>> restricted to the copy operation, rather than being
>> introduced at usage time as the dest file is CoW'd.
>>
>> If reflink() is a possibility for copy_file_range()
>> then could it be done optionally with a flag?
>
> The idea is that filesystems get to choose how to handle copies in the default case. BTRFS could do a reflink, but NFS could do a server side copy instead. I can change the default behavior to only do a data copy (unless the reflink flag is specified) instead, if that is desirable.
>
> What does everybody think?
I think the best you could do is to have a hint asking politely for
the data to be deep-copied. After all, some filesystems reserve the
right to transparently deduplicate.
Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
advantage to deep copying unless you actually want two copies for
locality reasons.
--Andy
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 19:10 ` Andy Lutomirski
0 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-08 19:10 UTC (permalink / raw)
To: Anna Schumaker
Cc: Pádraig Brady, linux-nfs, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Darrick J. Wong, Michael Kerrisk-manpages, andros,
Christoph Hellwig
On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
<Anna.Schumaker@netapp.com> wrote:
> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> I see copy_file_range() is a reflink() on BTRFS?
>> That's a bit surprising, as it avoids the copy completely.
>> cp(1) for example considered doing a BTRFS clone by default,
>> but didn't due to expectations that users actually wanted
>> the data duplicated on disk for resilience reasons,
>> and for performance reasons so that write latencies were
>> restricted to the copy operation, rather than being
>> introduced at usage time as the dest file is CoW'd.
>>
>> If reflink() is a possibility for copy_file_range()
>> then could it be done optionally with a flag?
>
> The idea is that filesystems get to choose how to handle copies in the default case. BTRFS could do a reflink, but NFS could do a server side copy instead. I can change the default behavior to only do a data copy (unless the reflink flag is specified) instead, if that is desirable.
>
> What does everybody think?
I think the best you could do is to have a hint asking politely for
the data to be deep-copied. After all, some filesystems reserve the
right to transparently deduplicate.
Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
advantage to deep copying unless you actually want two copies for
locality reasons.
--Andy
^ permalink raw reply [flat|nested] 121+ messages in thread
[parent not found: <CALCETrXxRB-LXVb+=nkwfj0zEjWuXXTctkSAc9Oec0fgyOQ5Yg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 19:10 ` Andy Lutomirski
@ 2015-09-08 20:03 ` Pádraig Brady
-1 siblings, 0 replies; 121+ messages in thread
From: Pádraig Brady @ 2015-09-08 20:03 UTC (permalink / raw)
To: Andy Lutomirski, Anna Schumaker
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Darrick J. Wong, Michael Kerrisk-manpages,
andros-HgOvQuBEEgTQT0dZR+AlfA, Christoph Hellwig, Coreutils
On 08/09/15 20:10, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> <Anna.Schumaker-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org> wrote:
>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>> I see copy_file_range() is a reflink() on BTRFS?
>>> That's a bit surprising, as it avoids the copy completely.
>>> cp(1) for example considered doing a BTRFS clone by default,
>>> but didn't due to expectations that users actually wanted
>>> the data duplicated on disk for resilience reasons,
>>> and for performance reasons so that write latencies were
>>> restricted to the copy operation, rather than being
>>> introduced at usage time as the dest file is CoW'd.
>>>
>>> If reflink() is a possibility for copy_file_range()
>>> then could it be done optionally with a flag?
>>
>> The idea is that filesystems get to choose how to handle copies in the default case. BTRFS could do a reflink, but NFS could do a server side copy instead. I can change the default behavior to only do a data copy (unless the reflink flag is specified) instead, if that is desirable.
>>
>> What does everybody think?
>
> I think the best you could do is to have a hint asking politely for
> the data to be deep-copied. After all, some filesystems reserve the
> right to transparently deduplicate.
>
> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> advantage to deep copying unless you actually want two copies for
> locality reasons.
Agreed. The relink and server side copy are separate things.
There's no advantage to not doing a server side copy,
but as mentioned there may be advantages to doing deep copies on BTRFS
(another reason not previous mentioned in this thread, would be
to avoid ENOSPC errors at some time in the future).
So having control over the deep copy seems useful.
It's debatable whether ALLOW_REFLINK should be on/off by default
for copy_file_range(). I'd be inclined to have such a setting off by default,
but cp(1) at least will work with whatever is chosen.
thanks,
Pádraig.
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 20:03 ` Pádraig Brady
0 siblings, 0 replies; 121+ messages in thread
From: Pádraig Brady @ 2015-09-08 20:03 UTC (permalink / raw)
To: Andy Lutomirski, Anna Schumaker
Cc: linux-nfs, Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Darrick J. Wong,
Michael Kerrisk-manpages, andros, Christoph Hellwig, Coreutils
On 08/09/15 20:10, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>> I see copy_file_range() is a reflink() on BTRFS?
>>> That's a bit surprising, as it avoids the copy completely.
>>> cp(1) for example considered doing a BTRFS clone by default,
>>> but didn't due to expectations that users actually wanted
>>> the data duplicated on disk for resilience reasons,
>>> and for performance reasons so that write latencies were
>>> restricted to the copy operation, rather than being
>>> introduced at usage time as the dest file is CoW'd.
>>>
>>> If reflink() is a possibility for copy_file_range()
>>> then could it be done optionally with a flag?
>>
>> The idea is that filesystems get to choose how to handle copies in the default case. BTRFS could do a reflink, but NFS could do a server side copy instead. I can change the default behavior to only do a data copy (unless the reflink flag is specified) instead, if that is desirable.
>>
>> What does everybody think?
>
> I think the best you could do is to have a hint asking politely for
> the data to be deep-copied. After all, some filesystems reserve the
> right to transparently deduplicate.
>
> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> advantage to deep copying unless you actually want two copies for
> locality reasons.
Agreed. The relink and server side copy are separate things.
There's no advantage to not doing a server side copy,
but as mentioned there may be advantages to doing deep copies on BTRFS
(another reason not previous mentioned in this thread, would be
to avoid ENOSPC errors at some time in the future).
So having control over the deep copy seems useful.
It's debatable whether ALLOW_REFLINK should be on/off by default
for copy_file_range(). I'd be inclined to have such a setting off by default,
but cp(1) at least will work with whatever is chosen.
thanks,
Pádraig.
^ permalink raw reply [flat|nested] 121+ messages in thread
[parent not found: <55EF3EFD.3080302-V8g9lnOeT5ydJdNcDFJN0w@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 20:03 ` Pádraig Brady
@ 2015-09-08 21:29 ` Darrick J. Wong
-1 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-08 21:29 UTC (permalink / raw)
To: Pádraig Brady
Cc: Andy Lutomirski, Anna Schumaker, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros-HgOvQuBEEgTQT0dZR+AlfA, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> On 08/09/15 20:10, Andy Lutomirski wrote:
> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> > <Anna.Schumaker-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org> wrote:
> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >>> I see copy_file_range() is a reflink() on BTRFS?
> >>> That's a bit surprising, as it avoids the copy completely.
> >>> cp(1) for example considered doing a BTRFS clone by default,
> >>> but didn't due to expectations that users actually wanted
> >>> the data duplicated on disk for resilience reasons,
> >>> and for performance reasons so that write latencies were
> >>> restricted to the copy operation, rather than being
> >>> introduced at usage time as the dest file is CoW'd.
> >>>
> >>> If reflink() is a possibility for copy_file_range()
> >>> then could it be done optionally with a flag?
> >>
> >> The idea is that filesystems get to choose how to handle copies in the
> >> default case. BTRFS could do a reflink, but NFS could do a server side
Eww, different default behaviors depending on the filesystem. :)
> >> copy instead. I can change the default behavior to only do a data copy
> >> (unless the reflink flag is specified) instead, if that is desirable.
> >>
> >> What does everybody think?
> >
> > I think the best you could do is to have a hint asking politely for
> > the data to be deep-copied. After all, some filesystems reserve the
> > right to transparently deduplicate.
> >
> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> > advantage to deep copying unless you actually want two copies for
> > locality reasons.
>
> Agreed. The relink and server side copy are separate things.
> There's no advantage to not doing a server side copy,
> but as mentioned there may be advantages to doing deep copies on BTRFS
> (another reason not previous mentioned in this thread, would be
> to avoid ENOSPC errors at some time in the future).
>
> So having control over the deep copy seems useful.
> It's debatable whether ALLOW_REFLINK should be on/off by default
> for copy_file_range(). I'd be inclined to have such a setting off by default,
> but cp(1) at least will work with whatever is chosen.
So far it looks like people are interested in at least these "make data appear
in this other place" filesystem operations:
1. reflink
2. reflink, but only if the contents are the same (dedupe)
3. regular copy
4. regular copy, but make the hardware do it for us
5. regular copy, but require a second copy on the media (no-dedupe)
6. regular copy, but don't CoW (eatmyothercopies) (joke)
(Please add whatever ops I missed.)
I think I can see a case for letting (4) fall back to (3) since (4) is an
optimization of (3).
However, I particularly don't like the idea of (1) falling back to (3-5).
Either the kernel can satisfy a request or it can't, but let's not just
assume that we should transmogrify one type of request into another. Userspace
should decide if a reflink failure should turn into one of the copy variants,
depending on whether the user wants to spread allocation costs over rewrites or
pay it all up front. Also, if we allow reflink to fall back to copy, how do
programs find out what actually took place? Or do we simply not allow them to
find out?
Also, programs that expect reflink either to finish or fail quickly might be
surprised if it's possible for reflink to take a longer time than usual and
with the side effect that a deep(er) copy was made.
I guess if someone asks for both (1) and (3) we can do the fallback in the
kernel, like how we handle it right now.
--D
>
> thanks,
> Pádraig.
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 21:29 ` Darrick J. Wong
0 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-08 21:29 UTC (permalink / raw)
To: Pádraig Brady
Cc: Andy Lutomirski, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> On 08/09/15 20:10, Andy Lutomirski wrote:
> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> > <Anna.Schumaker@netapp.com> wrote:
> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >>> I see copy_file_range() is a reflink() on BTRFS?
> >>> That's a bit surprising, as it avoids the copy completely.
> >>> cp(1) for example considered doing a BTRFS clone by default,
> >>> but didn't due to expectations that users actually wanted
> >>> the data duplicated on disk for resilience reasons,
> >>> and for performance reasons so that write latencies were
> >>> restricted to the copy operation, rather than being
> >>> introduced at usage time as the dest file is CoW'd.
> >>>
> >>> If reflink() is a possibility for copy_file_range()
> >>> then could it be done optionally with a flag?
> >>
> >> The idea is that filesystems get to choose how to handle copies in the
> >> default case. BTRFS could do a reflink, but NFS could do a server side
Eww, different default behaviors depending on the filesystem. :)
> >> copy instead. I can change the default behavior to only do a data copy
> >> (unless the reflink flag is specified) instead, if that is desirable.
> >>
> >> What does everybody think?
> >
> > I think the best you could do is to have a hint asking politely for
> > the data to be deep-copied. After all, some filesystems reserve the
> > right to transparently deduplicate.
> >
> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> > advantage to deep copying unless you actually want two copies for
> > locality reasons.
>
> Agreed. The relink and server side copy are separate things.
> There's no advantage to not doing a server side copy,
> but as mentioned there may be advantages to doing deep copies on BTRFS
> (another reason not previous mentioned in this thread, would be
> to avoid ENOSPC errors at some time in the future).
>
> So having control over the deep copy seems useful.
> It's debatable whether ALLOW_REFLINK should be on/off by default
> for copy_file_range(). I'd be inclined to have such a setting off by default,
> but cp(1) at least will work with whatever is chosen.
So far it looks like people are interested in at least these "make data appear
in this other place" filesystem operations:
1. reflink
2. reflink, but only if the contents are the same (dedupe)
3. regular copy
4. regular copy, but make the hardware do it for us
5. regular copy, but require a second copy on the media (no-dedupe)
6. regular copy, but don't CoW (eatmyothercopies) (joke)
(Please add whatever ops I missed.)
I think I can see a case for letting (4) fall back to (3) since (4) is an
optimization of (3).
However, I particularly don't like the idea of (1) falling back to (3-5).
Either the kernel can satisfy a request or it can't, but let's not just
assume that we should transmogrify one type of request into another. Userspace
should decide if a reflink failure should turn into one of the copy variants,
depending on whether the user wants to spread allocation costs over rewrites or
pay it all up front. Also, if we allow reflink to fall back to copy, how do
programs find out what actually took place? Or do we simply not allow them to
find out?
Also, programs that expect reflink either to finish or fail quickly might be
surprised if it's possible for reflink to take a longer time than usual and
with the side effect that a deep(er) copy was made.
I guess if someone asks for both (1) and (3) we can do the fallback in the
kernel, like how we handle it right now.
--D
>
> thanks,
> Pádraig.
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 21:29 ` Darrick J. Wong
@ 2015-09-08 21:45 ` Andy Lutomirski
-1 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-08 21:45 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>> On 08/09/15 20:10, Andy Lutomirski wrote:
>> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>> > <Anna.Schumaker@netapp.com> wrote:
>> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> >>> I see copy_file_range() is a reflink() on BTRFS?
>> >>> That's a bit surprising, as it avoids the copy completely.
>> >>> cp(1) for example considered doing a BTRFS clone by default,
>> >>> but didn't due to expectations that users actually wanted
>> >>> the data duplicated on disk for resilience reasons,
>> >>> and for performance reasons so that write latencies were
>> >>> restricted to the copy operation, rather than being
>> >>> introduced at usage time as the dest file is CoW'd.
>> >>>
>> >>> If reflink() is a possibility for copy_file_range()
>> >>> then could it be done optionally with a flag?
>> >>
>> >> The idea is that filesystems get to choose how to handle copies in the
>> >> default case. BTRFS could do a reflink, but NFS could do a server side
>
> Eww, different default behaviors depending on the filesystem. :)
>
>> >> copy instead. I can change the default behavior to only do a data copy
>> >> (unless the reflink flag is specified) instead, if that is desirable.
>> >>
>> >> What does everybody think?
>> >
>> > I think the best you could do is to have a hint asking politely for
>> > the data to be deep-copied. After all, some filesystems reserve the
>> > right to transparently deduplicate.
>> >
>> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>> > advantage to deep copying unless you actually want two copies for
>> > locality reasons.
>>
>> Agreed. The relink and server side copy are separate things.
>> There's no advantage to not doing a server side copy,
>> but as mentioned there may be advantages to doing deep copies on BTRFS
>> (another reason not previous mentioned in this thread, would be
>> to avoid ENOSPC errors at some time in the future).
>>
>> So having control over the deep copy seems useful.
>> It's debatable whether ALLOW_REFLINK should be on/off by default
>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>> but cp(1) at least will work with whatever is chosen.
>
> So far it looks like people are interested in at least these "make data appear
> in this other place" filesystem operations:
>
> 1. reflink
> 2. reflink, but only if the contents are the same (dedupe)
What I meant by this was: if you ask for "regular copy", you may end
up with a reflink anyway. Anyway, how can you reflink a range and
have the contents *not* be the same?
> 3. regular copy
> 4. regular copy, but make the hardware do it for us
> 5. regular copy, but require a second copy on the media (no-dedupe)
If this comes from me, I have no desire to ever use this as a flag.
If someone wants to use chattr or some new operation to say "make this
range of this file belong just to me for purpose of optimizing future
writes", then sure, go for it, with the understanding that there are
plenty of filesystems for which that doesn't even make sense.
> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>
> (Please add whatever ops I missed.)
>
> I think I can see a case for letting (4) fall back to (3) since (4) is an
> optimization of (3).
>
> However, I particularly don't like the idea of (1) falling back to (3-5).
> Either the kernel can satisfy a request or it can't, but let's not just
> assume that we should transmogrify one type of request into another. Userspace
> should decide if a reflink failure should turn into one of the copy variants,
> depending on whether the user wants to spread allocation costs over rewrites or
> pay it all up front. Also, if we allow reflink to fall back to copy, how do
> programs find out what actually took place? Or do we simply not allow them to
> find out?
>
> Also, programs that expect reflink either to finish or fail quickly might be
> surprised if it's possible for reflink to take a longer time than usual and
> with the side effect that a deep(er) copy was made.
>
> I guess if someone asks for both (1) and (3) we can do the fallback in the
> kernel, like how we handle it right now.
>
I think we should focus on what the actual legit use cases might be.
Certainly we want to support a mode that's "reflink or fail". We
could have these flags:
COPY_FILE_RANGE_ALLOW_REFLINK
COPY_FILE_RANGE_ALLOW_COPY
Setting neither gets -EINVAL. Setting both works as is. Setting just
ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
ALLOW_COPY will make a best-effort attempt not to reflink but
expressly permits reflinking in cases where either (a) plain old
write(2) might also result in a reflink or (b) there is no advantage
to not reflinking.
An example of (b) would be a filesystem backed by deduped
thinly-provisioned storage that can't do anything about ENOSPC because
it doesn't control it in the first place.
Another option would be to split up the copy case into "I expect to
overwrite a lot of the target file soon, so (c) try to commit space
for that or (d) try to make it time-efficient". Of course, (d) is
irrelevant on filesystems with no random access (nvdimms, for
example).
I guess the tl;dr is that I'm highly skeptical of any use for
disallowing reflinking other than forcibly committing space in cases
where committing space actually means something.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 21:45 ` Andy Lutomirski
0 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-08 21:45 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>> On 08/09/15 20:10, Andy Lutomirski wrote:
>> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>> > <Anna.Schumaker@netapp.com> wrote:
>> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> >>> I see copy_file_range() is a reflink() on BTRFS?
>> >>> That's a bit surprising, as it avoids the copy completely.
>> >>> cp(1) for example considered doing a BTRFS clone by default,
>> >>> but didn't due to expectations that users actually wanted
>> >>> the data duplicated on disk for resilience reasons,
>> >>> and for performance reasons so that write latencies were
>> >>> restricted to the copy operation, rather than being
>> >>> introduced at usage time as the dest file is CoW'd.
>> >>>
>> >>> If reflink() is a possibility for copy_file_range()
>> >>> then could it be done optionally with a flag?
>> >>
>> >> The idea is that filesystems get to choose how to handle copies in the
>> >> default case. BTRFS could do a reflink, but NFS could do a server side
>
> Eww, different default behaviors depending on the filesystem. :)
>
>> >> copy instead. I can change the default behavior to only do a data copy
>> >> (unless the reflink flag is specified) instead, if that is desirable.
>> >>
>> >> What does everybody think?
>> >
>> > I think the best you could do is to have a hint asking politely for
>> > the data to be deep-copied. After all, some filesystems reserve the
>> > right to transparently deduplicate.
>> >
>> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>> > advantage to deep copying unless you actually want two copies for
>> > locality reasons.
>>
>> Agreed. The relink and server side copy are separate things.
>> There's no advantage to not doing a server side copy,
>> but as mentioned there may be advantages to doing deep copies on BTRFS
>> (another reason not previous mentioned in this thread, would be
>> to avoid ENOSPC errors at some time in the future).
>>
>> So having control over the deep copy seems useful.
>> It's debatable whether ALLOW_REFLINK should be on/off by default
>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>> but cp(1) at least will work with whatever is chosen.
>
> So far it looks like people are interested in at least these "make data appear
> in this other place" filesystem operations:
>
> 1. reflink
> 2. reflink, but only if the contents are the same (dedupe)
What I meant by this was: if you ask for "regular copy", you may end
up with a reflink anyway. Anyway, how can you reflink a range and
have the contents *not* be the same?
> 3. regular copy
> 4. regular copy, but make the hardware do it for us
> 5. regular copy, but require a second copy on the media (no-dedupe)
If this comes from me, I have no desire to ever use this as a flag.
If someone wants to use chattr or some new operation to say "make this
range of this file belong just to me for purpose of optimizing future
writes", then sure, go for it, with the understanding that there are
plenty of filesystems for which that doesn't even make sense.
> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>
> (Please add whatever ops I missed.)
>
> I think I can see a case for letting (4) fall back to (3) since (4) is an
> optimization of (3).
>
> However, I particularly don't like the idea of (1) falling back to (3-5).
> Either the kernel can satisfy a request or it can't, but let's not just
> assume that we should transmogrify one type of request into another. Userspace
> should decide if a reflink failure should turn into one of the copy variants,
> depending on whether the user wants to spread allocation costs over rewrites or
> pay it all up front. Also, if we allow reflink to fall back to copy, how do
> programs find out what actually took place? Or do we simply not allow them to
> find out?
>
> Also, programs that expect reflink either to finish or fail quickly might be
> surprised if it's possible for reflink to take a longer time than usual and
> with the side effect that a deep(er) copy was made.
>
> I guess if someone asks for both (1) and (3) we can do the fallback in the
> kernel, like how we handle it right now.
>
I think we should focus on what the actual legit use cases might be.
Certainly we want to support a mode that's "reflink or fail". We
could have these flags:
COPY_FILE_RANGE_ALLOW_REFLINK
COPY_FILE_RANGE_ALLOW_COPY
Setting neither gets -EINVAL. Setting both works as is. Setting just
ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
ALLOW_COPY will make a best-effort attempt not to reflink but
expressly permits reflinking in cases where either (a) plain old
write(2) might also result in a reflink or (b) there is no advantage
to not reflinking.
An example of (b) would be a filesystem backed by deduped
thinly-provisioned storage that can't do anything about ENOSPC because
it doesn't control it in the first place.
Another option would be to split up the copy case into "I expect to
overwrite a lot of the target file soon, so (c) try to commit space
for that or (d) try to make it time-efficient". Of course, (d) is
irrelevant on filesystems with no random access (nvdimms, for
example).
I guess the tl;dr is that I'm highly skeptical of any use for
disallowing reflinking other than forcibly committing space in cases
where committing space actually means something.
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 21:45 ` Andy Lutomirski
@ 2015-09-08 22:39 ` Darrick J. Wong
-1 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-08 22:39 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >> On 08/09/15 20:10, Andy Lutomirski wrote:
> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >> > <Anna.Schumaker@netapp.com> wrote:
> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >> >>> I see copy_file_range() is a reflink() on BTRFS?
> >> >>> That's a bit surprising, as it avoids the copy completely.
> >> >>> cp(1) for example considered doing a BTRFS clone by default,
> >> >>> but didn't due to expectations that users actually wanted
> >> >>> the data duplicated on disk for resilience reasons,
> >> >>> and for performance reasons so that write latencies were
> >> >>> restricted to the copy operation, rather than being
> >> >>> introduced at usage time as the dest file is CoW'd.
> >> >>>
> >> >>> If reflink() is a possibility for copy_file_range()
> >> >>> then could it be done optionally with a flag?
> >> >>
> >> >> The idea is that filesystems get to choose how to handle copies in the
> >> >> default case. BTRFS could do a reflink, but NFS could do a server side
> >
> > Eww, different default behaviors depending on the filesystem. :)
> >
> >> >> copy instead. I can change the default behavior to only do a data copy
> >> >> (unless the reflink flag is specified) instead, if that is desirable.
> >> >>
> >> >> What does everybody think?
> >> >
> >> > I think the best you could do is to have a hint asking politely for
> >> > the data to be deep-copied. After all, some filesystems reserve the
> >> > right to transparently deduplicate.
> >> >
> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >> > advantage to deep copying unless you actually want two copies for
> >> > locality reasons.
> >>
> >> Agreed. The relink and server side copy are separate things.
> >> There's no advantage to not doing a server side copy,
> >> but as mentioned there may be advantages to doing deep copies on BTRFS
> >> (another reason not previous mentioned in this thread, would be
> >> to avoid ENOSPC errors at some time in the future).
> >>
> >> So having control over the deep copy seems useful.
> >> It's debatable whether ALLOW_REFLINK should be on/off by default
> >> for copy_file_range(). I'd be inclined to have such a setting off by default,
> >> but cp(1) at least will work with whatever is chosen.
> >
> > So far it looks like people are interested in at least these "make data appear
> > in this other place" filesystem operations:
> >
> > 1. reflink
> > 2. reflink, but only if the contents are the same (dedupe)
>
> What I meant by this was: if you ask for "regular copy", you may end
> up with a reflink anyway. Anyway, how can you reflink a range and
> have the contents *not* be the same?
reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
match before, they will afterwards.
dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
Perhaps I should have said "...if the contents are the same before the call"?
>
> > 3. regular copy
> > 4. regular copy, but make the hardware do it for us
> > 5. regular copy, but require a second copy on the media (no-dedupe)
>
> If this comes from me, I have no desire to ever use this as a flag.
I meant (5) as a "disable auto-dedupe for this operation" flag, not as
a "reallocate all the shared blocks now" op...
> If someone wants to use chattr or some new operation to say "make this
> range of this file belong just to me for purpose of optimizing future
> writes", then sure, go for it, with the understanding that there are
> plenty of filesystems for which that doesn't even make sense.
"Unshare these blocks" sounds more like something fallocate could do.
So far in my XFS reflink playground, it seems that using the defrag tool to
un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
fragmented file's data to a second file and use a 'swap extents' operation,
after which the donor file is unlinked.
Hey, if this syscall turns into a more generic "do something involving two
(fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
extents" as a 7th operation, to refactor the ioctls. <smirk>
>
> > 6. regular copy, but don't CoW (eatmyothercopies) (joke)
> >
> > (Please add whatever ops I missed.)
> >
> > I think I can see a case for letting (4) fall back to (3) since (4) is an
> > optimization of (3).
> >
> > However, I particularly don't like the idea of (1) falling back to (3-5).
> > Either the kernel can satisfy a request or it can't, but let's not just
> > assume that we should transmogrify one type of request into another. Userspace
> > should decide if a reflink failure should turn into one of the copy variants,
> > depending on whether the user wants to spread allocation costs over rewrites or
> > pay it all up front. Also, if we allow reflink to fall back to copy, how do
> > programs find out what actually took place? Or do we simply not allow them to
> > find out?
> >
> > Also, programs that expect reflink either to finish or fail quickly might be
> > surprised if it's possible for reflink to take a longer time than usual and
> > with the side effect that a deep(er) copy was made.
> >
> > I guess if someone asks for both (1) and (3) we can do the fallback in the
> > kernel, like how we handle it right now.
> >
>
> I think we should focus on what the actual legit use cases might be.
> Certainly we want to support a mode that's "reflink or fail". We
> could have these flags:
>
> COPY_FILE_RANGE_ALLOW_REFLINK
> COPY_FILE_RANGE_ALLOW_COPY
>
> Setting neither gets -EINVAL. Setting both works as is. Setting just
> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
> ALLOW_COPY will make a best-effort attempt not to reflink but
> expressly permits reflinking in cases where either (a) plain old
> write(2) might also result in a reflink or (b) there is no advantage
> to not reflinking.
I don't agree with having a 'copy' flag that can reflink when we also have a
'reflink' flag. I guess I just don't like having a flag with different
meanings depending on context.
Users should be able to get the default behavior by passing '0' for flags, so
provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
an admonishment that one should only use them if they have a goooood reason.
Passing neither gets you reflink-xor-copy, which is what I think we both want
in the general case.
FORBID_REFLINK = 1
FORBID_COPY = 2
CHECK_SAME = 4
HW_COPY = 8
DEDUPE = (FORBID_COPY | CHECK_SAME)
What do you say to that?
> An example of (b) would be a filesystem backed by deduped
> thinly-provisioned storage that can't do anything about ENOSPC because
> it doesn't control it in the first place.
>
> Another option would be to split up the copy case into "I expect to
> overwrite a lot of the target file soon, so (c) try to commit space
> for that or (d) try to make it time-efficient". Of course, (d) is
> irrelevant on filesystems with no random access (nvdimms, for
> example).
>
> I guess the tl;dr is that I'm highly skeptical of any use for
> disallowing reflinking other than forcibly committing space in cases
> where committing space actually means something.
That's more or less where I was going too. :)
--D
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 22:39 ` Darrick J. Wong
0 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-08 22:39 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >> On 08/09/15 20:10, Andy Lutomirski wrote:
> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >> > <Anna.Schumaker@netapp.com> wrote:
> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >> >>> I see copy_file_range() is a reflink() on BTRFS?
> >> >>> That's a bit surprising, as it avoids the copy completely.
> >> >>> cp(1) for example considered doing a BTRFS clone by default,
> >> >>> but didn't due to expectations that users actually wanted
> >> >>> the data duplicated on disk for resilience reasons,
> >> >>> and for performance reasons so that write latencies were
> >> >>> restricted to the copy operation, rather than being
> >> >>> introduced at usage time as the dest file is CoW'd.
> >> >>>
> >> >>> If reflink() is a possibility for copy_file_range()
> >> >>> then could it be done optionally with a flag?
> >> >>
> >> >> The idea is that filesystems get to choose how to handle copies in the
> >> >> default case. BTRFS could do a reflink, but NFS could do a server side
> >
> > Eww, different default behaviors depending on the filesystem. :)
> >
> >> >> copy instead. I can change the default behavior to only do a data copy
> >> >> (unless the reflink flag is specified) instead, if that is desirable.
> >> >>
> >> >> What does everybody think?
> >> >
> >> > I think the best you could do is to have a hint asking politely for
> >> > the data to be deep-copied. After all, some filesystems reserve the
> >> > right to transparently deduplicate.
> >> >
> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >> > advantage to deep copying unless you actually want two copies for
> >> > locality reasons.
> >>
> >> Agreed. The relink and server side copy are separate things.
> >> There's no advantage to not doing a server side copy,
> >> but as mentioned there may be advantages to doing deep copies on BTRFS
> >> (another reason not previous mentioned in this thread, would be
> >> to avoid ENOSPC errors at some time in the future).
> >>
> >> So having control over the deep copy seems useful.
> >> It's debatable whether ALLOW_REFLINK should be on/off by default
> >> for copy_file_range(). I'd be inclined to have such a setting off by default,
> >> but cp(1) at least will work with whatever is chosen.
> >
> > So far it looks like people are interested in at least these "make data appear
> > in this other place" filesystem operations:
> >
> > 1. reflink
> > 2. reflink, but only if the contents are the same (dedupe)
>
> What I meant by this was: if you ask for "regular copy", you may end
> up with a reflink anyway. Anyway, how can you reflink a range and
> have the contents *not* be the same?
reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
match before, they will afterwards.
dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
Perhaps I should have said "...if the contents are the same before the call"?
>
> > 3. regular copy
> > 4. regular copy, but make the hardware do it for us
> > 5. regular copy, but require a second copy on the media (no-dedupe)
>
> If this comes from me, I have no desire to ever use this as a flag.
I meant (5) as a "disable auto-dedupe for this operation" flag, not as
a "reallocate all the shared blocks now" op...
> If someone wants to use chattr or some new operation to say "make this
> range of this file belong just to me for purpose of optimizing future
> writes", then sure, go for it, with the understanding that there are
> plenty of filesystems for which that doesn't even make sense.
"Unshare these blocks" sounds more like something fallocate could do.
So far in my XFS reflink playground, it seems that using the defrag tool to
un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
fragmented file's data to a second file and use a 'swap extents' operation,
after which the donor file is unlinked.
Hey, if this syscall turns into a more generic "do something involving two
(fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
extents" as a 7th operation, to refactor the ioctls. <smirk>
>
> > 6. regular copy, but don't CoW (eatmyothercopies) (joke)
> >
> > (Please add whatever ops I missed.)
> >
> > I think I can see a case for letting (4) fall back to (3) since (4) is an
> > optimization of (3).
> >
> > However, I particularly don't like the idea of (1) falling back to (3-5).
> > Either the kernel can satisfy a request or it can't, but let's not just
> > assume that we should transmogrify one type of request into another. Userspace
> > should decide if a reflink failure should turn into one of the copy variants,
> > depending on whether the user wants to spread allocation costs over rewrites or
> > pay it all up front. Also, if we allow reflink to fall back to copy, how do
> > programs find out what actually took place? Or do we simply not allow them to
> > find out?
> >
> > Also, programs that expect reflink either to finish or fail quickly might be
> > surprised if it's possible for reflink to take a longer time than usual and
> > with the side effect that a deep(er) copy was made.
> >
> > I guess if someone asks for both (1) and (3) we can do the fallback in the
> > kernel, like how we handle it right now.
> >
>
> I think we should focus on what the actual legit use cases might be.
> Certainly we want to support a mode that's "reflink or fail". We
> could have these flags:
>
> COPY_FILE_RANGE_ALLOW_REFLINK
> COPY_FILE_RANGE_ALLOW_COPY
>
> Setting neither gets -EINVAL. Setting both works as is. Setting just
> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
> ALLOW_COPY will make a best-effort attempt not to reflink but
> expressly permits reflinking in cases where either (a) plain old
> write(2) might also result in a reflink or (b) there is no advantage
> to not reflinking.
I don't agree with having a 'copy' flag that can reflink when we also have a
'reflink' flag. I guess I just don't like having a flag with different
meanings depending on context.
Users should be able to get the default behavior by passing '0' for flags, so
provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
an admonishment that one should only use them if they have a goooood reason.
Passing neither gets you reflink-xor-copy, which is what I think we both want
in the general case.
FORBID_REFLINK = 1
FORBID_COPY = 2
CHECK_SAME = 4
HW_COPY = 8
DEDUPE = (FORBID_COPY | CHECK_SAME)
What do you say to that?
> An example of (b) would be a filesystem backed by deduped
> thinly-provisioned storage that can't do anything about ENOSPC because
> it doesn't control it in the first place.
>
> Another option would be to split up the copy case into "I expect to
> overwrite a lot of the target file soon, so (c) try to commit space
> for that or (d) try to make it time-efficient". Of course, (d) is
> irrelevant on filesystems with no random access (nvdimms, for
> example).
>
> I guess the tl;dr is that I'm highly skeptical of any use for
> disallowing reflinking other than forcibly committing space in cases
> where committing space actually means something.
That's more or less where I was going too. :)
--D
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 22:39 ` Darrick J. Wong
@ 2015-09-08 23:08 ` Andy Lutomirski
-1 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-08 23:08 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>> >> On 08/09/15 20:10, Andy Lutomirski wrote:
>> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>> >> > <Anna.Schumaker@netapp.com> wrote:
>> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> >> >>> I see copy_file_range() is a reflink() on BTRFS?
>> >> >>> That's a bit surprising, as it avoids the copy completely.
>> >> >>> cp(1) for example considered doing a BTRFS clone by default,
>> >> >>> but didn't due to expectations that users actually wanted
>> >> >>> the data duplicated on disk for resilience reasons,
>> >> >>> and for performance reasons so that write latencies were
>> >> >>> restricted to the copy operation, rather than being
>> >> >>> introduced at usage time as the dest file is CoW'd.
>> >> >>>
>> >> >>> If reflink() is a possibility for copy_file_range()
>> >> >>> then could it be done optionally with a flag?
>> >> >>
>> >> >> The idea is that filesystems get to choose how to handle copies in the
>> >> >> default case. BTRFS could do a reflink, but NFS could do a server side
>> >
>> > Eww, different default behaviors depending on the filesystem. :)
>> >
>> >> >> copy instead. I can change the default behavior to only do a data copy
>> >> >> (unless the reflink flag is specified) instead, if that is desirable.
>> >> >>
>> >> >> What does everybody think?
>> >> >
>> >> > I think the best you could do is to have a hint asking politely for
>> >> > the data to be deep-copied. After all, some filesystems reserve the
>> >> > right to transparently deduplicate.
>> >> >
>> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>> >> > advantage to deep copying unless you actually want two copies for
>> >> > locality reasons.
>> >>
>> >> Agreed. The relink and server side copy are separate things.
>> >> There's no advantage to not doing a server side copy,
>> >> but as mentioned there may be advantages to doing deep copies on BTRFS
>> >> (another reason not previous mentioned in this thread, would be
>> >> to avoid ENOSPC errors at some time in the future).
>> >>
>> >> So having control over the deep copy seems useful.
>> >> It's debatable whether ALLOW_REFLINK should be on/off by default
>> >> for copy_file_range(). I'd be inclined to have such a setting off by default,
>> >> but cp(1) at least will work with whatever is chosen.
>> >
>> > So far it looks like people are interested in at least these "make data appear
>> > in this other place" filesystem operations:
>> >
>> > 1. reflink
>> > 2. reflink, but only if the contents are the same (dedupe)
>>
>> What I meant by this was: if you ask for "regular copy", you may end
>> up with a reflink anyway. Anyway, how can you reflink a range and
>> have the contents *not* be the same?
>
> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> match before, they will afterwards.
>
> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>
> Perhaps I should have said "...if the contents are the same before the call"?
>
Oh, I see.
Can we have a clean way to figure out whether two file ranges are the
same in a way that allows false negatives? I.e. return 1 if the
ranges are reflinks of each other and 0 if not? Pretty please? I've
implemented that in the past on btrfs by syncing the ranges and then
comparing FIEMAP output, but that's hideous.
>>
>> > 3. regular copy
>> > 4. regular copy, but make the hardware do it for us
>> > 5. regular copy, but require a second copy on the media (no-dedupe)
>>
>> If this comes from me, I have no desire to ever use this as a flag.
>
> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> a "reallocate all the shared blocks now" op...
Hmm, interesting. What effect does it have on systems that do
deferred auto-dedupe?
>>
>> I think we should focus on what the actual legit use cases might be.
>> Certainly we want to support a mode that's "reflink or fail". We
>> could have these flags:
>>
>> COPY_FILE_RANGE_ALLOW_REFLINK
>> COPY_FILE_RANGE_ALLOW_COPY
>>
>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>> ALLOW_COPY will make a best-effort attempt not to reflink but
>> expressly permits reflinking in cases where either (a) plain old
>> write(2) might also result in a reflink or (b) there is no advantage
>> to not reflinking.
>
> I don't agree with having a 'copy' flag that can reflink when we also have a
> 'reflink' flag. I guess I just don't like having a flag with different
> meanings depending on context.
>
> Users should be able to get the default behavior by passing '0' for flags, so
> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> an admonishment that one should only use them if they have a goooood reason.
> Passing neither gets you reflink-xor-copy, which is what I think we both want
> in the general case.
>
> FORBID_REFLINK = 1
> FORBID_COPY = 2
> CHECK_SAME = 4
> HW_COPY = 8
>
> DEDUPE = (FORBID_COPY | CHECK_SAME)
>
> What do you say to that?
What does HW_COPY mean?
If we have enough weird combinations, maybe having a mode instead of
flags makes sense.
--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-08 23:08 ` Andy Lutomirski
0 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-08 23:08 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>> >> On 08/09/15 20:10, Andy Lutomirski wrote:
>> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>> >> > <Anna.Schumaker@netapp.com> wrote:
>> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> >> >>> I see copy_file_range() is a reflink() on BTRFS?
>> >> >>> That's a bit surprising, as it avoids the copy completely.
>> >> >>> cp(1) for example considered doing a BTRFS clone by default,
>> >> >>> but didn't due to expectations that users actually wanted
>> >> >>> the data duplicated on disk for resilience reasons,
>> >> >>> and for performance reasons so that write latencies were
>> >> >>> restricted to the copy operation, rather than being
>> >> >>> introduced at usage time as the dest file is CoW'd.
>> >> >>>
>> >> >>> If reflink() is a possibility for copy_file_range()
>> >> >>> then could it be done optionally with a flag?
>> >> >>
>> >> >> The idea is that filesystems get to choose how to handle copies in the
>> >> >> default case. BTRFS could do a reflink, but NFS could do a server side
>> >
>> > Eww, different default behaviors depending on the filesystem. :)
>> >
>> >> >> copy instead. I can change the default behavior to only do a data copy
>> >> >> (unless the reflink flag is specified) instead, if that is desirable.
>> >> >>
>> >> >> What does everybody think?
>> >> >
>> >> > I think the best you could do is to have a hint asking politely for
>> >> > the data to be deep-copied. After all, some filesystems reserve the
>> >> > right to transparently deduplicate.
>> >> >
>> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>> >> > advantage to deep copying unless you actually want two copies for
>> >> > locality reasons.
>> >>
>> >> Agreed. The relink and server side copy are separate things.
>> >> There's no advantage to not doing a server side copy,
>> >> but as mentioned there may be advantages to doing deep copies on BTRFS
>> >> (another reason not previous mentioned in this thread, would be
>> >> to avoid ENOSPC errors at some time in the future).
>> >>
>> >> So having control over the deep copy seems useful.
>> >> It's debatable whether ALLOW_REFLINK should be on/off by default
>> >> for copy_file_range(). I'd be inclined to have such a setting off by default,
>> >> but cp(1) at least will work with whatever is chosen.
>> >
>> > So far it looks like people are interested in at least these "make data appear
>> > in this other place" filesystem operations:
>> >
>> > 1. reflink
>> > 2. reflink, but only if the contents are the same (dedupe)
>>
>> What I meant by this was: if you ask for "regular copy", you may end
>> up with a reflink anyway. Anyway, how can you reflink a range and
>> have the contents *not* be the same?
>
> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> match before, they will afterwards.
>
> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>
> Perhaps I should have said "...if the contents are the same before the call"?
>
Oh, I see.
Can we have a clean way to figure out whether two file ranges are the
same in a way that allows false negatives? I.e. return 1 if the
ranges are reflinks of each other and 0 if not? Pretty please? I've
implemented that in the past on btrfs by syncing the ranges and then
comparing FIEMAP output, but that's hideous.
>>
>> > 3. regular copy
>> > 4. regular copy, but make the hardware do it for us
>> > 5. regular copy, but require a second copy on the media (no-dedupe)
>>
>> If this comes from me, I have no desire to ever use this as a flag.
>
> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> a "reallocate all the shared blocks now" op...
Hmm, interesting. What effect does it have on systems that do
deferred auto-dedupe?
>>
>> I think we should focus on what the actual legit use cases might be.
>> Certainly we want to support a mode that's "reflink or fail". We
>> could have these flags:
>>
>> COPY_FILE_RANGE_ALLOW_REFLINK
>> COPY_FILE_RANGE_ALLOW_COPY
>>
>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>> ALLOW_COPY will make a best-effort attempt not to reflink but
>> expressly permits reflinking in cases where either (a) plain old
>> write(2) might also result in a reflink or (b) there is no advantage
>> to not reflinking.
>
> I don't agree with having a 'copy' flag that can reflink when we also have a
> 'reflink' flag. I guess I just don't like having a flag with different
> meanings depending on context.
>
> Users should be able to get the default behavior by passing '0' for flags, so
> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> an admonishment that one should only use them if they have a goooood reason.
> Passing neither gets you reflink-xor-copy, which is what I think we both want
> in the general case.
>
> FORBID_REFLINK = 1
> FORBID_COPY = 2
> CHECK_SAME = 4
> HW_COPY = 8
>
> DEDUPE = (FORBID_COPY | CHECK_SAME)
>
> What do you say to that?
What does HW_COPY mean?
If we have enough weird combinations, maybe having a mode instead of
flags makes sense.
--Andy
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 23:08 ` Andy Lutomirski
@ 2015-09-09 1:19 ` Darrick J. Wong
-1 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-09 1:19 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> >> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >> >> On 08/09/15 20:10, Andy Lutomirski wrote:
> >> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >> >> > <Anna.Schumaker@netapp.com> wrote:
> >> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >> >> >>> I see copy_file_range() is a reflink() on BTRFS?
> >> >> >>> That's a bit surprising, as it avoids the copy completely.
> >> >> >>> cp(1) for example considered doing a BTRFS clone by default,
> >> >> >>> but didn't due to expectations that users actually wanted
> >> >> >>> the data duplicated on disk for resilience reasons,
> >> >> >>> and for performance reasons so that write latencies were
> >> >> >>> restricted to the copy operation, rather than being
> >> >> >>> introduced at usage time as the dest file is CoW'd.
> >> >> >>>
> >> >> >>> If reflink() is a possibility for copy_file_range()
> >> >> >>> then could it be done optionally with a flag?
> >> >> >>
> >> >> >> The idea is that filesystems get to choose how to handle copies in the
> >> >> >> default case. BTRFS could do a reflink, but NFS could do a server side
> >> >
> >> > Eww, different default behaviors depending on the filesystem. :)
> >> >
> >> >> >> copy instead. I can change the default behavior to only do a data copy
> >> >> >> (unless the reflink flag is specified) instead, if that is desirable.
> >> >> >>
> >> >> >> What does everybody think?
> >> >> >
> >> >> > I think the best you could do is to have a hint asking politely for
> >> >> > the data to be deep-copied. After all, some filesystems reserve the
> >> >> > right to transparently deduplicate.
> >> >> >
> >> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >> >> > advantage to deep copying unless you actually want two copies for
> >> >> > locality reasons.
> >> >>
> >> >> Agreed. The relink and server side copy are separate things.
> >> >> There's no advantage to not doing a server side copy,
> >> >> but as mentioned there may be advantages to doing deep copies on BTRFS
> >> >> (another reason not previous mentioned in this thread, would be
> >> >> to avoid ENOSPC errors at some time in the future).
> >> >>
> >> >> So having control over the deep copy seems useful.
> >> >> It's debatable whether ALLOW_REFLINK should be on/off by default
> >> >> for copy_file_range(). I'd be inclined to have such a setting off by default,
> >> >> but cp(1) at least will work with whatever is chosen.
> >> >
> >> > So far it looks like people are interested in at least these "make data appear
> >> > in this other place" filesystem operations:
> >> >
> >> > 1. reflink
> >> > 2. reflink, but only if the contents are the same (dedupe)
> >>
> >> What I meant by this was: if you ask for "regular copy", you may end
> >> up with a reflink anyway. Anyway, how can you reflink a range and
> >> have the contents *not* be the same?
> >
> > reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> > match before, they will afterwards.
> >
> > dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
> >
> > Perhaps I should have said "...if the contents are the same before the call"?
> >
>
> Oh, I see.
>
> Can we have a clean way to figure out whether two file ranges are the
> same in a way that allows false negatives? I.e. return 1 if the
> ranges are reflinks of each other and 0 if not? Pretty please? I've
> implemented that in the past on btrfs by syncing the ranges and then
> comparing FIEMAP output, but that's hideous.
Another mode for this call... :)
> >>
> >> > 3. regular copy
> >> > 4. regular copy, but make the hardware do it for us
> >> > 5. regular copy, but require a second copy on the media (no-dedupe)
> >>
> >> If this comes from me, I have no desire to ever use this as a flag.
> >
> > I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> > a "reallocate all the shared blocks now" op...
>
> Hmm, interesting. What effect does it have on systems that do
> deferred auto-dedupe?
If it's a userspace deferred auto-dedupe, then hopefully the program
coordinates with the dedupe program.
Otherwise, it's only effective with a dedupe that runs in the write-path.
> >>
> >> I think we should focus on what the actual legit use cases might be.
> >> Certainly we want to support a mode that's "reflink or fail". We
> >> could have these flags:
> >>
> >> COPY_FILE_RANGE_ALLOW_REFLINK
> >> COPY_FILE_RANGE_ALLOW_COPY
> >>
> >> Setting neither gets -EINVAL. Setting both works as is. Setting just
> >> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
> >> ALLOW_COPY will make a best-effort attempt not to reflink but
> >> expressly permits reflinking in cases where either (a) plain old
> >> write(2) might also result in a reflink or (b) there is no advantage
> >> to not reflinking.
> >
> > I don't agree with having a 'copy' flag that can reflink when we also have a
> > 'reflink' flag. I guess I just don't like having a flag with different
> > meanings depending on context.
> >
> > Users should be able to get the default behavior by passing '0' for flags, so
> > provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> > an admonishment that one should only use them if they have a goooood reason.
> > Passing neither gets you reflink-xor-copy, which is what I think we both want
> > in the general case.
> >
> > FORBID_REFLINK = 1
> > FORBID_COPY = 2
> > CHECK_SAME = 4
> > HW_COPY = 8
> >
> > DEDUPE = (FORBID_COPY | CHECK_SAME)
> >
> > What do you say to that?
>
> What does HW_COPY mean?
It /probably/ means that the FS tells the storage device to copy the block
rather than streaming it through the page cache. Your autodedupe thinp device,
for example, would "copy" block X to Y by mapping both X and Y to the same
piece of media.
(Effectively the same thing as FS reflink/dedupe, but in the storage dev.)
>
> If we have enough weird combinations, maybe having a mode instead of
> flags makes sense.
Let's hope not. :)
--D
>
> --Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-09 1:19 ` Darrick J. Wong
0 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-09 1:19 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> >> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >> >> On 08/09/15 20:10, Andy Lutomirski wrote:
> >> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >> >> > <Anna.Schumaker@netapp.com> wrote:
> >> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >> >> >>> I see copy_file_range() is a reflink() on BTRFS?
> >> >> >>> That's a bit surprising, as it avoids the copy completely.
> >> >> >>> cp(1) for example considered doing a BTRFS clone by default,
> >> >> >>> but didn't due to expectations that users actually wanted
> >> >> >>> the data duplicated on disk for resilience reasons,
> >> >> >>> and for performance reasons so that write latencies were
> >> >> >>> restricted to the copy operation, rather than being
> >> >> >>> introduced at usage time as the dest file is CoW'd.
> >> >> >>>
> >> >> >>> If reflink() is a possibility for copy_file_range()
> >> >> >>> then could it be done optionally with a flag?
> >> >> >>
> >> >> >> The idea is that filesystems get to choose how to handle copies in the
> >> >> >> default case. BTRFS could do a reflink, but NFS could do a server side
> >> >
> >> > Eww, different default behaviors depending on the filesystem. :)
> >> >
> >> >> >> copy instead. I can change the default behavior to only do a data copy
> >> >> >> (unless the reflink flag is specified) instead, if that is desirable.
> >> >> >>
> >> >> >> What does everybody think?
> >> >> >
> >> >> > I think the best you could do is to have a hint asking politely for
> >> >> > the data to be deep-copied. After all, some filesystems reserve the
> >> >> > right to transparently deduplicate.
> >> >> >
> >> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >> >> > advantage to deep copying unless you actually want two copies for
> >> >> > locality reasons.
> >> >>
> >> >> Agreed. The relink and server side copy are separate things.
> >> >> There's no advantage to not doing a server side copy,
> >> >> but as mentioned there may be advantages to doing deep copies on BTRFS
> >> >> (another reason not previous mentioned in this thread, would be
> >> >> to avoid ENOSPC errors at some time in the future).
> >> >>
> >> >> So having control over the deep copy seems useful.
> >> >> It's debatable whether ALLOW_REFLINK should be on/off by default
> >> >> for copy_file_range(). I'd be inclined to have such a setting off by default,
> >> >> but cp(1) at least will work with whatever is chosen.
> >> >
> >> > So far it looks like people are interested in at least these "make data appear
> >> > in this other place" filesystem operations:
> >> >
> >> > 1. reflink
> >> > 2. reflink, but only if the contents are the same (dedupe)
> >>
> >> What I meant by this was: if you ask for "regular copy", you may end
> >> up with a reflink anyway. Anyway, how can you reflink a range and
> >> have the contents *not* be the same?
> >
> > reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> > match before, they will afterwards.
> >
> > dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
> >
> > Perhaps I should have said "...if the contents are the same before the call"?
> >
>
> Oh, I see.
>
> Can we have a clean way to figure out whether two file ranges are the
> same in a way that allows false negatives? I.e. return 1 if the
> ranges are reflinks of each other and 0 if not? Pretty please? I've
> implemented that in the past on btrfs by syncing the ranges and then
> comparing FIEMAP output, but that's hideous.
Another mode for this call... :)
> >>
> >> > 3. regular copy
> >> > 4. regular copy, but make the hardware do it for us
> >> > 5. regular copy, but require a second copy on the media (no-dedupe)
> >>
> >> If this comes from me, I have no desire to ever use this as a flag.
> >
> > I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> > a "reallocate all the shared blocks now" op...
>
> Hmm, interesting. What effect does it have on systems that do
> deferred auto-dedupe?
If it's a userspace deferred auto-dedupe, then hopefully the program
coordinates with the dedupe program.
Otherwise, it's only effective with a dedupe that runs in the write-path.
> >>
> >> I think we should focus on what the actual legit use cases might be.
> >> Certainly we want to support a mode that's "reflink or fail". We
> >> could have these flags:
> >>
> >> COPY_FILE_RANGE_ALLOW_REFLINK
> >> COPY_FILE_RANGE_ALLOW_COPY
> >>
> >> Setting neither gets -EINVAL. Setting both works as is. Setting just
> >> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
> >> ALLOW_COPY will make a best-effort attempt not to reflink but
> >> expressly permits reflinking in cases where either (a) plain old
> >> write(2) might also result in a reflink or (b) there is no advantage
> >> to not reflinking.
> >
> > I don't agree with having a 'copy' flag that can reflink when we also have a
> > 'reflink' flag. I guess I just don't like having a flag with different
> > meanings depending on context.
> >
> > Users should be able to get the default behavior by passing '0' for flags, so
> > provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> > an admonishment that one should only use them if they have a goooood reason.
> > Passing neither gets you reflink-xor-copy, which is what I think we both want
> > in the general case.
> >
> > FORBID_REFLINK = 1
> > FORBID_COPY = 2
> > CHECK_SAME = 4
> > HW_COPY = 8
> >
> > DEDUPE = (FORBID_COPY | CHECK_SAME)
> >
> > What do you say to that?
>
> What does HW_COPY mean?
It /probably/ means that the FS tells the storage device to copy the block
rather than streaming it through the page cache. Your autodedupe thinp device,
for example, would "copy" block X to Y by mapping both X and Y to the same
piece of media.
(Effectively the same thing as FS reflink/dedupe, but in the storage dev.)
>
> If we have enough weird combinations, maybe having a mode instead of
> flags makes sense.
Let's hope not. :)
--D
>
> --Andy
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 23:08 ` Andy Lutomirski
(?)
(?)
@ 2015-09-09 20:09 ` Chris Mason
[not found] ` <20150909200921.GD9511-DzB2rL6jT1BHfPKRx072akEOCMrvLtNR@public.gmane.org>
-1 siblings, 1 reply; 121+ messages in thread
From: Chris Mason @ 2015-09-09 20:09 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Darrick J. Wong, Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Michael Kerrisk-manpages, andros,
Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> >> What I meant by this was: if you ask for "regular copy", you may end
> >> up with a reflink anyway. Anyway, how can you reflink a range and
> >> have the contents *not* be the same?
> >
> > reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> > match before, they will afterwards.
> >
> > dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
> >
> > Perhaps I should have said "...if the contents are the same before the call"?
> >
>
> Oh, I see.
>
> Can we have a clean way to figure out whether two file ranges are the
> same in a way that allows false negatives? I.e. return 1 if the
> ranges are reflinks of each other and 0 if not? Pretty please? I've
> implemented that in the past on btrfs by syncing the ranges and then
> comparing FIEMAP output, but that's hideous.
I'd almost rather have a separate call, maybe unshare_file_range()?
Is that the end goal to the sharing check?
-chris
^ permalink raw reply [flat|nested] 121+ messages in thread[parent not found: <CALCETrVsWBdqvAgwxHcG=gbcWRNPG2ZziWUg1g=siKDrDu7S2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 23:08 ` Andy Lutomirski
@ 2015-09-13 23:25 ` Dave Chinner
-1 siblings, 0 replies; 121+ messages in thread
From: Dave Chinner @ 2015-09-13 23:25 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Darrick J. Wong, Pádraig Brady, Anna Schumaker,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Michael Kerrisk-manpages, andros-HgOvQuBEEgTQT0dZR+AlfA,
Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> Can we have a clean way to figure out whether two file ranges are the
> same in a way that allows false negatives? I.e. return 1 if the
> ranges are reflinks of each other and 0 if not? Pretty please? I've
> implemented that in the past on btrfs by syncing the ranges and then
> comparing FIEMAP output, but that's hideous.
That fundamentally doesn't work for userspace, because the moment
the filesystem drops it's locks on the inodes in the kernel after
doing the comparison the mappings can change. IOWs, by the time the
information gets back to userspace, it's already wrong. e.g. cp made
this mistake by trying to use FIEMAP to optimise hole detection in
files and ended up with corrupt copies.
It really doesn't matter what the syscall/ioctl interface is, trying
to make application logic decisions based on inode block mappings
from userspace is racy and not safe and will go wrong...
Cheers,
Dave.
--
Dave Chinner
david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-13 23:25 ` Dave Chinner
0 siblings, 0 replies; 121+ messages in thread
From: Dave Chinner @ 2015-09-13 23:25 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Darrick J. Wong, Pádraig Brady, Anna Schumaker, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> Can we have a clean way to figure out whether two file ranges are the
> same in a way that allows false negatives? I.e. return 1 if the
> ranges are reflinks of each other and 0 if not? Pretty please? I've
> implemented that in the past on btrfs by syncing the ranges and then
> comparing FIEMAP output, but that's hideous.
That fundamentally doesn't work for userspace, because the moment
the filesystem drops it's locks on the inodes in the kernel after
doing the comparison the mappings can change. IOWs, by the time the
information gets back to userspace, it's already wrong. e.g. cp made
this mistake by trying to use FIEMAP to optimise hole detection in
files and ended up with corrupt copies.
It really doesn't matter what the syscall/ioctl interface is, trying
to make application logic decisions based on inode block mappings
from userspace is racy and not safe and will go wrong...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-13 23:25 ` Dave Chinner
@ 2015-09-14 17:53 ` Andy Lutomirski
-1 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-14 17:53 UTC (permalink / raw)
To: Dave Chinner
Cc: Zach Brown, Coreutils, Al Viro, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
Anna Schumaker, andros-HgOvQuBEEgTQT0dZR+AlfA, Darrick J. Wong,
Pádraig Brady, Linux btrfs Developers List, Chris Mason,
Linux API, Christoph Hellwig, Michael Kerrisk-manpages,
Linux FS Devel
On Sep 13, 2015 4:25 PM, "Dave Chinner" <david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org> wrote:
>
> On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> > Can we have a clean way to figure out whether two file ranges are the
> > same in a way that allows false negatives? I.e. return 1 if the
> > ranges are reflinks of each other and 0 if not? Pretty please? I've
> > implemented that in the past on btrfs by syncing the ranges and then
> > comparing FIEMAP output, but that's hideous.
>
> That fundamentally doesn't work for userspace, because the moment
> the filesystem drops it's locks on the inodes in the kernel after
> doing the comparison the mappings can change. IOWs, by the time the
> information gets back to userspace, it's already wrong. e.g. cp made
> this mistake by trying to use FIEMAP to optimise hole detection in
> files and ended up with corrupt copies.
>
> It really doesn't matter what the syscall/ioctl interface is, trying
> to make application logic decisions based on inode block mappings
> from userspace is racy and not safe and will go wrong...
>
I agree, and that thing was just an experiment. I'd love to see a
sane and correct interface, though.
--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-14 17:53 ` Andy Lutomirski
0 siblings, 0 replies; 121+ messages in thread
From: Andy Lutomirski @ 2015-09-14 17:53 UTC (permalink / raw)
To: Dave Chinner
Cc: Zach Brown, Coreutils, Al Viro, linux-nfs, Anna Schumaker, andros,
Darrick J. Wong, Pádraig Brady, Linux btrfs Developers List,
Chris Mason, Linux API, Christoph Hellwig,
Michael Kerrisk-manpages, Linux FS Devel
On Sep 13, 2015 4:25 PM, "Dave Chinner" <david@fromorbit.com> wrote:
>
> On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> > Can we have a clean way to figure out whether two file ranges are the
> > same in a way that allows false negatives? I.e. return 1 if the
> > ranges are reflinks of each other and 0 if not? Pretty please? I've
> > implemented that in the past on btrfs by syncing the ranges and then
> > comparing FIEMAP output, but that's hideous.
>
> That fundamentally doesn't work for userspace, because the moment
> the filesystem drops it's locks on the inodes in the kernel after
> doing the comparison the mappings can change. IOWs, by the time the
> information gets back to userspace, it's already wrong. e.g. cp made
> this mistake by trying to use FIEMAP to optimise hole detection in
> files and ended up with corrupt copies.
>
> It really doesn't matter what the syscall/ioctl interface is, trying
> to make application logic decisions based on inode block mappings
> from userspace is racy and not safe and will go wrong...
>
I agree, and that thing was just an experiment. I'd love to see a
sane and correct interface, though.
--Andy
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-08 22:39 ` Darrick J. Wong
@ 2015-09-09 18:52 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-09 18:52 UTC (permalink / raw)
To: Darrick J. Wong, Andy Lutomirski
Cc: Pádraig Brady, linux-nfs, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Michael Kerrisk-manpages, andros, Christoph Hellwig, Coreutils
On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>> and for performance reasons so that write latencies were
>>>>>>> restricted to the copy operation, rather than being
>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>
>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>> then could it be done optionally with a flag?
>>>>>>
>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>
>>> Eww, different default behaviors depending on the filesystem. :)
>>>
>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>
>>>>>> What does everybody think?
>>>>>
>>>>> I think the best you could do is to have a hint asking politely for
>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>> right to transparently deduplicate.
>>>>>
>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>> advantage to deep copying unless you actually want two copies for
>>>>> locality reasons.
>>>>
>>>> Agreed. The relink and server side copy are separate things.
>>>> There's no advantage to not doing a server side copy,
>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>> (another reason not previous mentioned in this thread, would be
>>>> to avoid ENOSPC errors at some time in the future).
>>>>
>>>> So having control over the deep copy seems useful.
>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>> but cp(1) at least will work with whatever is chosen.
>>>
>>> So far it looks like people are interested in at least these "make data appear
>>> in this other place" filesystem operations:
>>>
>>> 1. reflink
>>> 2. reflink, but only if the contents are the same (dedupe)
>>
>> What I meant by this was: if you ask for "regular copy", you may end
>> up with a reflink anyway. Anyway, how can you reflink a range and
>> have the contents *not* be the same?
>
> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> match before, they will afterwards.
>
> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>
> Perhaps I should have said "...if the contents are the same before the call"?
>
>>
>>> 3. regular copy
>>> 4. regular copy, but make the hardware do it for us
>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>
>> If this comes from me, I have no desire to ever use this as a flag.
>
> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> a "reallocate all the shared blocks now" op...
>
>> If someone wants to use chattr or some new operation to say "make this
>> range of this file belong just to me for purpose of optimizing future
>> writes", then sure, go for it, with the understanding that there are
>> plenty of filesystems for which that doesn't even make sense.
>
> "Unshare these blocks" sounds more like something fallocate could do.
>
> So far in my XFS reflink playground, it seems that using the defrag tool to
> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
> fragmented file's data to a second file and use a 'swap extents' operation,
> after which the donor file is unlinked.
>
> Hey, if this syscall turns into a more generic "do something involving two
> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
> extents" as a 7th operation, to refactor the ioctls. <smirk>
>
>>
>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>
>>> (Please add whatever ops I missed.)
>>>
>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>> optimization of (3).
>>>
>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>> Either the kernel can satisfy a request or it can't, but let's not just
>>> assume that we should transmogrify one type of request into another. Userspace
>>> should decide if a reflink failure should turn into one of the copy variants,
>>> depending on whether the user wants to spread allocation costs over rewrites or
>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>> programs find out what actually took place? Or do we simply not allow them to
>>> find out?
>>>
>>> Also, programs that expect reflink either to finish or fail quickly might be
>>> surprised if it's possible for reflink to take a longer time than usual and
>>> with the side effect that a deep(er) copy was made.
>>>
>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>> kernel, like how we handle it right now.
>>>
>>
>> I think we should focus on what the actual legit use cases might be.
>> Certainly we want to support a mode that's "reflink or fail". We
>> could have these flags:
>>
>> COPY_FILE_RANGE_ALLOW_REFLINK
>> COPY_FILE_RANGE_ALLOW_COPY
>>
>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>> ALLOW_COPY will make a best-effort attempt not to reflink but
>> expressly permits reflinking in cases where either (a) plain old
>> write(2) might also result in a reflink or (b) there is no advantage
>> to not reflinking.
>
> I don't agree with having a 'copy' flag that can reflink when we also have a
> 'reflink' flag. I guess I just don't like having a flag with different
> meanings depending on context.
>
> Users should be able to get the default behavior by passing '0' for flags, so
> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> an admonishment that one should only use them if they have a goooood reason.
> Passing neither gets you reflink-xor-copy, which is what I think we both want
> in the general case.
I agree here that 0 for flags should do something useful, and I wanted to double check if reflink-xor-copy is a good default behavior.
>
> FORBID_REFLINK = 1
> FORBID_COPY = 2
I don't like the idea of using flags to forbid behavior. I think it would be more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users can tell us what they want, instead of what they don't want.
While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit of a mouthful. Does anybody have suggestions for ways that I could make this shorter?
Thanks,
Anna
> CHECK_SAME = 4
> HW_COPY = 8
>
> DEDUPE = (FORBID_COPY | CHECK_SAME)
>
> What do you say to that?
>
>> An example of (b) would be a filesystem backed by deduped
>> thinly-provisioned storage that can't do anything about ENOSPC because
>> it doesn't control it in the first place.
>>
>> Another option would be to split up the copy case into "I expect to
>> overwrite a lot of the target file soon, so (c) try to commit space
>> for that or (d) try to make it time-efficient". Of course, (d) is
>> irrelevant on filesystems with no random access (nvdimms, for
>> example).
>>
>> I guess the tl;dr is that I'm highly skeptical of any use for
>> disallowing reflinking other than forcibly committing space in cases
>> where committing space actually means something.
>
> That's more or less where I was going too. :)
>
> --D
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-09 18:52 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-09 18:52 UTC (permalink / raw)
To: Darrick J. Wong, Andy Lutomirski
Cc: Pádraig Brady, linux-nfs, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Michael Kerrisk-manpages, andros, Christoph Hellwig, Coreutils
On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>> and for performance reasons so that write latencies were
>>>>>>> restricted to the copy operation, rather than being
>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>
>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>> then could it be done optionally with a flag?
>>>>>>
>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>
>>> Eww, different default behaviors depending on the filesystem. :)
>>>
>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>
>>>>>> What does everybody think?
>>>>>
>>>>> I think the best you could do is to have a hint asking politely for
>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>> right to transparently deduplicate.
>>>>>
>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>> advantage to deep copying unless you actually want two copies for
>>>>> locality reasons.
>>>>
>>>> Agreed. The relink and server side copy are separate things.
>>>> There's no advantage to not doing a server side copy,
>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>> (another reason not previous mentioned in this thread, would be
>>>> to avoid ENOSPC errors at some time in the future).
>>>>
>>>> So having control over the deep copy seems useful.
>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>> but cp(1) at least will work with whatever is chosen.
>>>
>>> So far it looks like people are interested in at least these "make data appear
>>> in this other place" filesystem operations:
>>>
>>> 1. reflink
>>> 2. reflink, but only if the contents are the same (dedupe)
>>
>> What I meant by this was: if you ask for "regular copy", you may end
>> up with a reflink anyway. Anyway, how can you reflink a range and
>> have the contents *not* be the same?
>
> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> match before, they will afterwards.
>
> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>
> Perhaps I should have said "...if the contents are the same before the call"?
>
>>
>>> 3. regular copy
>>> 4. regular copy, but make the hardware do it for us
>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>
>> If this comes from me, I have no desire to ever use this as a flag.
>
> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> a "reallocate all the shared blocks now" op...
>
>> If someone wants to use chattr or some new operation to say "make this
>> range of this file belong just to me for purpose of optimizing future
>> writes", then sure, go for it, with the understanding that there are
>> plenty of filesystems for which that doesn't even make sense.
>
> "Unshare these blocks" sounds more like something fallocate could do.
>
> So far in my XFS reflink playground, it seems that using the defrag tool to
> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
> fragmented file's data to a second file and use a 'swap extents' operation,
> after which the donor file is unlinked.
>
> Hey, if this syscall turns into a more generic "do something involving two
> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
> extents" as a 7th operation, to refactor the ioctls. <smirk>
>
>>
>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>
>>> (Please add whatever ops I missed.)
>>>
>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>> optimization of (3).
>>>
>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>> Either the kernel can satisfy a request or it can't, but let's not just
>>> assume that we should transmogrify one type of request into another. Userspace
>>> should decide if a reflink failure should turn into one of the copy variants,
>>> depending on whether the user wants to spread allocation costs over rewrites or
>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>> programs find out what actually took place? Or do we simply not allow them to
>>> find out?
>>>
>>> Also, programs that expect reflink either to finish or fail quickly might be
>>> surprised if it's possible for reflink to take a longer time than usual and
>>> with the side effect that a deep(er) copy was made.
>>>
>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>> kernel, like how we handle it right now.
>>>
>>
>> I think we should focus on what the actual legit use cases might be.
>> Certainly we want to support a mode that's "reflink or fail". We
>> could have these flags:
>>
>> COPY_FILE_RANGE_ALLOW_REFLINK
>> COPY_FILE_RANGE_ALLOW_COPY
>>
>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>> ALLOW_COPY will make a best-effort attempt not to reflink but
>> expressly permits reflinking in cases where either (a) plain old
>> write(2) might also result in a reflink or (b) there is no advantage
>> to not reflinking.
>
> I don't agree with having a 'copy' flag that can reflink when we also have a
> 'reflink' flag. I guess I just don't like having a flag with different
> meanings depending on context.
>
> Users should be able to get the default behavior by passing '0' for flags, so
> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> an admonishment that one should only use them if they have a goooood reason.
> Passing neither gets you reflink-xor-copy, which is what I think we both want
> in the general case.
I agree here that 0 for flags should do something useful, and I wanted to double check if reflink-xor-copy is a good default behavior.
>
> FORBID_REFLINK = 1
> FORBID_COPY = 2
I don't like the idea of using flags to forbid behavior. I think it would be more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users can tell us what they want, instead of what they don't want.
While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit of a mouthful. Does anybody have suggestions for ways that I could make this shorter?
Thanks,
Anna
> CHECK_SAME = 4
> HW_COPY = 8
>
> DEDUPE = (FORBID_COPY | CHECK_SAME)
>
> What do you say to that?
>
>> An example of (b) would be a filesystem backed by deduped
>> thinly-provisioned storage that can't do anything about ENOSPC because
>> it doesn't control it in the first place.
>>
>> Another option would be to split up the copy case into "I expect to
>> overwrite a lot of the target file soon, so (c) try to commit space
>> for that or (d) try to make it time-efficient". Of course, (d) is
>> irrelevant on filesystems with no random access (nvdimms, for
>> example).
>>
>> I guess the tl;dr is that I'm highly skeptical of any use for
>> disallowing reflinking other than forcibly committing space in cases
>> where committing space actually means something.
>
> That's more or less where I was going too. :)
>
> --D
>
^ permalink raw reply [flat|nested] 121+ messages in thread
[parent not found: <55F07FD8.4020507-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-09 18:52 ` Anna Schumaker
@ 2015-09-09 21:16 ` Darrick J. Wong
-1 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-09 21:16 UTC (permalink / raw)
To: Anna Schumaker
Cc: Andy Lutomirski, Pádraig Brady,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Michael Kerrisk-manpages, andros-HgOvQuBEEgTQT0dZR+AlfA,
Christoph Hellwig, Coreutils
On Wed, Sep 09, 2015 at 02:52:08PM -0400, Anna Schumaker wrote:
> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
> > On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> >> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >>>> On 08/09/15 20:10, Andy Lutomirski wrote:
> >>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >>>>> <Anna.Schumaker-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org> wrote:
> >>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >>>>>>> I see copy_file_range() is a reflink() on BTRFS?
> >>>>>>> That's a bit surprising, as it avoids the copy completely.
> >>>>>>> cp(1) for example considered doing a BTRFS clone by default,
> >>>>>>> but didn't due to expectations that users actually wanted
> >>>>>>> the data duplicated on disk for resilience reasons,
> >>>>>>> and for performance reasons so that write latencies were
> >>>>>>> restricted to the copy operation, rather than being
> >>>>>>> introduced at usage time as the dest file is CoW'd.
> >>>>>>>
> >>>>>>> If reflink() is a possibility for copy_file_range()
> >>>>>>> then could it be done optionally with a flag?
> >>>>>>
> >>>>>> The idea is that filesystems get to choose how to handle copies in the
> >>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
> >>>
> >>> Eww, different default behaviors depending on the filesystem. :)
> >>>
> >>>>>> copy instead. I can change the default behavior to only do a data copy
> >>>>>> (unless the reflink flag is specified) instead, if that is desirable.
> >>>>>>
> >>>>>> What does everybody think?
> >>>>>
> >>>>> I think the best you could do is to have a hint asking politely for
> >>>>> the data to be deep-copied. After all, some filesystems reserve the
> >>>>> right to transparently deduplicate.
> >>>>>
> >>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >>>>> advantage to deep copying unless you actually want two copies for
> >>>>> locality reasons.
> >>>>
> >>>> Agreed. The relink and server side copy are separate things.
> >>>> There's no advantage to not doing a server side copy,
> >>>> but as mentioned there may be advantages to doing deep copies on BTRFS
> >>>> (another reason not previous mentioned in this thread, would be
> >>>> to avoid ENOSPC errors at some time in the future).
> >>>>
> >>>> So having control over the deep copy seems useful.
> >>>> It's debatable whether ALLOW_REFLINK should be on/off by default
> >>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
> >>>> but cp(1) at least will work with whatever is chosen.
> >>>
> >>> So far it looks like people are interested in at least these "make data appear
> >>> in this other place" filesystem operations:
> >>>
> >>> 1. reflink
> >>> 2. reflink, but only if the contents are the same (dedupe)
> >>
> >> What I meant by this was: if you ask for "regular copy", you may end
> >> up with a reflink anyway. Anyway, how can you reflink a range and
> >> have the contents *not* be the same?
> >
> > reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> > match before, they will afterwards.
> >
> > dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
> >
> > Perhaps I should have said "...if the contents are the same before the call"?
> >
> >>
> >>> 3. regular copy
> >>> 4. regular copy, but make the hardware do it for us
> >>> 5. regular copy, but require a second copy on the media (no-dedupe)
> >>
> >> If this comes from me, I have no desire to ever use this as a flag.
> >
> > I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> > a "reallocate all the shared blocks now" op...
> >
> >> If someone wants to use chattr or some new operation to say "make this
> >> range of this file belong just to me for purpose of optimizing future
> >> writes", then sure, go for it, with the understanding that there are
> >> plenty of filesystems for which that doesn't even make sense.
> >
> > "Unshare these blocks" sounds more like something fallocate could do.
> >
> > So far in my XFS reflink playground, it seems that using the defrag tool to
> > un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
> > fragmented file's data to a second file and use a 'swap extents' operation,
> > after which the donor file is unlinked.
> >
> > Hey, if this syscall turns into a more generic "do something involving two
> > (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
> > extents" as a 7th operation, to refactor the ioctls. <smirk>
> >
> >>
> >>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
> >>>
> >>> (Please add whatever ops I missed.)
> >>>
> >>> I think I can see a case for letting (4) fall back to (3) since (4) is an
> >>> optimization of (3).
> >>>
> >>> However, I particularly don't like the idea of (1) falling back to (3-5).
> >>> Either the kernel can satisfy a request or it can't, but let's not just
> >>> assume that we should transmogrify one type of request into another. Userspace
> >>> should decide if a reflink failure should turn into one of the copy variants,
> >>> depending on whether the user wants to spread allocation costs over rewrites or
> >>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
> >>> programs find out what actually took place? Or do we simply not allow them to
> >>> find out?
> >>>
> >>> Also, programs that expect reflink either to finish or fail quickly might be
> >>> surprised if it's possible for reflink to take a longer time than usual and
> >>> with the side effect that a deep(er) copy was made.
> >>>
> >>> I guess if someone asks for both (1) and (3) we can do the fallback in the
> >>> kernel, like how we handle it right now.
> >>>
> >>
> >> I think we should focus on what the actual legit use cases might be.
> >> Certainly we want to support a mode that's "reflink or fail". We
> >> could have these flags:
> >>
> >> COPY_FILE_RANGE_ALLOW_REFLINK
> >> COPY_FILE_RANGE_ALLOW_COPY
> >>
> >> Setting neither gets -EINVAL. Setting both works as is. Setting just
> >> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
> >> ALLOW_COPY will make a best-effort attempt not to reflink but
> >> expressly permits reflinking in cases where either (a) plain old
> >> write(2) might also result in a reflink or (b) there is no advantage
> >> to not reflinking.
> >
> > I don't agree with having a 'copy' flag that can reflink when we also have a
> > 'reflink' flag. I guess I just don't like having a flag with different
> > meanings depending on context.
> >
> > Users should be able to get the default behavior by passing '0' for flags, so
> > provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> > an admonishment that one should only use them if they have a goooood reason.
> > Passing neither gets you reflink-xor-copy, which is what I think we both want
> > in the general case.
>
> I agree here that 0 for flags should do something useful, and I wanted to
> double check if reflink-xor-copy is a good default behavior.
Ok.
> >
> > FORBID_REFLINK = 1
> > FORBID_COPY = 2
>
> I don't like the idea of using flags to forbid behavior. I think it would be
> more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users
> can tell us what they want, instead of what they don't want.
Seems fine to me.
> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit
> of a mouthful. Does anybody have suggestions for ways that I could make this
> shorter?
CFR_REFLINK_ONLY?
--D
>
> Thanks,
> Anna
>
> > CHECK_SAME = 4
> > HW_COPY = 8
> >
> > DEDUPE = (FORBID_COPY | CHECK_SAME)
> >
> > What do you say to that?
> >
> >> An example of (b) would be a filesystem backed by deduped
> >> thinly-provisioned storage that can't do anything about ENOSPC because
> >> it doesn't control it in the first place.
> >>
> >> Another option would be to split up the copy case into "I expect to
> >> overwrite a lot of the target file soon, so (c) try to commit space
> >> for that or (d) try to make it time-efficient". Of course, (d) is
> >> irrelevant on filesystems with no random access (nvdimms, for
> >> example).
> >>
> >> I guess the tl;dr is that I'm highly skeptical of any use for
> >> disallowing reflinking other than forcibly committing space in cases
> >> where committing space actually means something.
> >
> > That's more or less where I was going too. :)
> >
> > --D
> >
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-09 21:16 ` Darrick J. Wong
0 siblings, 0 replies; 121+ messages in thread
From: Darrick J. Wong @ 2015-09-09 21:16 UTC (permalink / raw)
To: Anna Schumaker
Cc: Andy Lutomirski, Pádraig Brady, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
On Wed, Sep 09, 2015 at 02:52:08PM -0400, Anna Schumaker wrote:
> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
> > On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> >> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >>>> On 08/09/15 20:10, Andy Lutomirski wrote:
> >>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >>>>> <Anna.Schumaker@netapp.com> wrote:
> >>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >>>>>>> I see copy_file_range() is a reflink() on BTRFS?
> >>>>>>> That's a bit surprising, as it avoids the copy completely.
> >>>>>>> cp(1) for example considered doing a BTRFS clone by default,
> >>>>>>> but didn't due to expectations that users actually wanted
> >>>>>>> the data duplicated on disk for resilience reasons,
> >>>>>>> and for performance reasons so that write latencies were
> >>>>>>> restricted to the copy operation, rather than being
> >>>>>>> introduced at usage time as the dest file is CoW'd.
> >>>>>>>
> >>>>>>> If reflink() is a possibility for copy_file_range()
> >>>>>>> then could it be done optionally with a flag?
> >>>>>>
> >>>>>> The idea is that filesystems get to choose how to handle copies in the
> >>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
> >>>
> >>> Eww, different default behaviors depending on the filesystem. :)
> >>>
> >>>>>> copy instead. I can change the default behavior to only do a data copy
> >>>>>> (unless the reflink flag is specified) instead, if that is desirable.
> >>>>>>
> >>>>>> What does everybody think?
> >>>>>
> >>>>> I think the best you could do is to have a hint asking politely for
> >>>>> the data to be deep-copied. After all, some filesystems reserve the
> >>>>> right to transparently deduplicate.
> >>>>>
> >>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >>>>> advantage to deep copying unless you actually want two copies for
> >>>>> locality reasons.
> >>>>
> >>>> Agreed. The relink and server side copy are separate things.
> >>>> There's no advantage to not doing a server side copy,
> >>>> but as mentioned there may be advantages to doing deep copies on BTRFS
> >>>> (another reason not previous mentioned in this thread, would be
> >>>> to avoid ENOSPC errors at some time in the future).
> >>>>
> >>>> So having control over the deep copy seems useful.
> >>>> It's debatable whether ALLOW_REFLINK should be on/off by default
> >>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
> >>>> but cp(1) at least will work with whatever is chosen.
> >>>
> >>> So far it looks like people are interested in at least these "make data appear
> >>> in this other place" filesystem operations:
> >>>
> >>> 1. reflink
> >>> 2. reflink, but only if the contents are the same (dedupe)
> >>
> >> What I meant by this was: if you ask for "regular copy", you may end
> >> up with a reflink anyway. Anyway, how can you reflink a range and
> >> have the contents *not* be the same?
> >
> > reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
> > match before, they will afterwards.
> >
> > dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
> >
> > Perhaps I should have said "...if the contents are the same before the call"?
> >
> >>
> >>> 3. regular copy
> >>> 4. regular copy, but make the hardware do it for us
> >>> 5. regular copy, but require a second copy on the media (no-dedupe)
> >>
> >> If this comes from me, I have no desire to ever use this as a flag.
> >
> > I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> > a "reallocate all the shared blocks now" op...
> >
> >> If someone wants to use chattr or some new operation to say "make this
> >> range of this file belong just to me for purpose of optimizing future
> >> writes", then sure, go for it, with the understanding that there are
> >> plenty of filesystems for which that doesn't even make sense.
> >
> > "Unshare these blocks" sounds more like something fallocate could do.
> >
> > So far in my XFS reflink playground, it seems that using the defrag tool to
> > un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
> > fragmented file's data to a second file and use a 'swap extents' operation,
> > after which the donor file is unlinked.
> >
> > Hey, if this syscall turns into a more generic "do something involving two
> > (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
> > extents" as a 7th operation, to refactor the ioctls. <smirk>
> >
> >>
> >>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
> >>>
> >>> (Please add whatever ops I missed.)
> >>>
> >>> I think I can see a case for letting (4) fall back to (3) since (4) is an
> >>> optimization of (3).
> >>>
> >>> However, I particularly don't like the idea of (1) falling back to (3-5).
> >>> Either the kernel can satisfy a request or it can't, but let's not just
> >>> assume that we should transmogrify one type of request into another. Userspace
> >>> should decide if a reflink failure should turn into one of the copy variants,
> >>> depending on whether the user wants to spread allocation costs over rewrites or
> >>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
> >>> programs find out what actually took place? Or do we simply not allow them to
> >>> find out?
> >>>
> >>> Also, programs that expect reflink either to finish or fail quickly might be
> >>> surprised if it's possible for reflink to take a longer time than usual and
> >>> with the side effect that a deep(er) copy was made.
> >>>
> >>> I guess if someone asks for both (1) and (3) we can do the fallback in the
> >>> kernel, like how we handle it right now.
> >>>
> >>
> >> I think we should focus on what the actual legit use cases might be.
> >> Certainly we want to support a mode that's "reflink or fail". We
> >> could have these flags:
> >>
> >> COPY_FILE_RANGE_ALLOW_REFLINK
> >> COPY_FILE_RANGE_ALLOW_COPY
> >>
> >> Setting neither gets -EINVAL. Setting both works as is. Setting just
> >> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
> >> ALLOW_COPY will make a best-effort attempt not to reflink but
> >> expressly permits reflinking in cases where either (a) plain old
> >> write(2) might also result in a reflink or (b) there is no advantage
> >> to not reflinking.
> >
> > I don't agree with having a 'copy' flag that can reflink when we also have a
> > 'reflink' flag. I guess I just don't like having a flag with different
> > meanings depending on context.
> >
> > Users should be able to get the default behavior by passing '0' for flags, so
> > provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
> > an admonishment that one should only use them if they have a goooood reason.
> > Passing neither gets you reflink-xor-copy, which is what I think we both want
> > in the general case.
>
> I agree here that 0 for flags should do something useful, and I wanted to
> double check if reflink-xor-copy is a good default behavior.
Ok.
> >
> > FORBID_REFLINK = 1
> > FORBID_COPY = 2
>
> I don't like the idea of using flags to forbid behavior. I think it would be
> more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users
> can tell us what they want, instead of what they don't want.
Seems fine to me.
> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit
> of a mouthful. Does anybody have suggestions for ways that I could make this
> shorter?
CFR_REFLINK_ONLY?
--D
>
> Thanks,
> Anna
>
> > CHECK_SAME = 4
> > HW_COPY = 8
> >
> > DEDUPE = (FORBID_COPY | CHECK_SAME)
> >
> > What do you say to that?
> >
> >> An example of (b) would be a filesystem backed by deduped
> >> thinly-provisioned storage that can't do anything about ENOSPC because
> >> it doesn't control it in the first place.
> >>
> >> Another option would be to split up the copy case into "I expect to
> >> overwrite a lot of the target file soon, so (c) try to commit space
> >> for that or (d) try to make it time-efficient". Of course, (d) is
> >> irrelevant on filesystems with no random access (nvdimms, for
> >> example).
> >>
> >> I guess the tl;dr is that I'm highly skeptical of any use for
> >> disallowing reflinking other than forcibly committing space in cases
> >> where committing space actually means something.
> >
> > That's more or less where I was going too. :)
> >
> > --D
> >
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-09 21:16 ` Darrick J. Wong
@ 2015-09-10 15:10 ` Anna Schumaker
-1 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-10 15:10 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Andy Lutomirski, Pádraig Brady, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils, Austin S Hemmelgarn
On 09/09/2015 05:16 PM, Darrick J. Wong wrote:
> On Wed, Sep 09, 2015 at 02:52:08PM -0400, Anna Schumaker wrote:
>> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
>>> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>>>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>>>> and for performance reasons so that write latencies were
>>>>>>>>> restricted to the copy operation, rather than being
>>>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>>>
>>>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>>>> then could it be done optionally with a flag?
>>>>>>>>
>>>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>>>
>>>>> Eww, different default behaviors depending on the filesystem. :)
>>>>>
>>>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>>>
>>>>>>>> What does everybody think?
>>>>>>>
>>>>>>> I think the best you could do is to have a hint asking politely for
>>>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>>>> right to transparently deduplicate.
>>>>>>>
>>>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>>>> advantage to deep copying unless you actually want two copies for
>>>>>>> locality reasons.
>>>>>>
>>>>>> Agreed. The relink and server side copy are separate things.
>>>>>> There's no advantage to not doing a server side copy,
>>>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>>>> (another reason not previous mentioned in this thread, would be
>>>>>> to avoid ENOSPC errors at some time in the future).
>>>>>>
>>>>>> So having control over the deep copy seems useful.
>>>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>>>> but cp(1) at least will work with whatever is chosen.
>>>>>
>>>>> So far it looks like people are interested in at least these "make data appear
>>>>> in this other place" filesystem operations:
>>>>>
>>>>> 1. reflink
>>>>> 2. reflink, but only if the contents are the same (dedupe)
>>>>
>>>> What I meant by this was: if you ask for "regular copy", you may end
>>>> up with a reflink anyway. Anyway, how can you reflink a range and
>>>> have the contents *not* be the same?
>>>
>>> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
>>> match before, they will afterwards.
>>>
>>> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>>>
>>> Perhaps I should have said "...if the contents are the same before the call"?
>>>
>>>>
>>>>> 3. regular copy
>>>>> 4. regular copy, but make the hardware do it for us
>>>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>>>
>>>> If this comes from me, I have no desire to ever use this as a flag.
>>>
>>> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
>>> a "reallocate all the shared blocks now" op...
>>>
>>>> If someone wants to use chattr or some new operation to say "make this
>>>> range of this file belong just to me for purpose of optimizing future
>>>> writes", then sure, go for it, with the understanding that there are
>>>> plenty of filesystems for which that doesn't even make sense.
>>>
>>> "Unshare these blocks" sounds more like something fallocate could do.
>>>
>>> So far in my XFS reflink playground, it seems that using the defrag tool to
>>> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
>>> fragmented file's data to a second file and use a 'swap extents' operation,
>>> after which the donor file is unlinked.
>>>
>>> Hey, if this syscall turns into a more generic "do something involving two
>>> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
>>> extents" as a 7th operation, to refactor the ioctls. <smirk>
>>>
>>>>
>>>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>>>
>>>>> (Please add whatever ops I missed.)
>>>>>
>>>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>>>> optimization of (3).
>>>>>
>>>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>>>> Either the kernel can satisfy a request or it can't, but let's not just
>>>>> assume that we should transmogrify one type of request into another. Userspace
>>>>> should decide if a reflink failure should turn into one of the copy variants,
>>>>> depending on whether the user wants to spread allocation costs over rewrites or
>>>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>>>> programs find out what actually took place? Or do we simply not allow them to
>>>>> find out?
>>>>>
>>>>> Also, programs that expect reflink either to finish or fail quickly might be
>>>>> surprised if it's possible for reflink to take a longer time than usual and
>>>>> with the side effect that a deep(er) copy was made.
>>>>>
>>>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>>>> kernel, like how we handle it right now.
>>>>>
>>>>
>>>> I think we should focus on what the actual legit use cases might be.
>>>> Certainly we want to support a mode that's "reflink or fail". We
>>>> could have these flags:
>>>>
>>>> COPY_FILE_RANGE_ALLOW_REFLINK
>>>> COPY_FILE_RANGE_ALLOW_COPY
>>>>
>>>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>>>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>>>> ALLOW_COPY will make a best-effort attempt not to reflink but
>>>> expressly permits reflinking in cases where either (a) plain old
>>>> write(2) might also result in a reflink or (b) there is no advantage
>>>> to not reflinking.
>>>
>>> I don't agree with having a 'copy' flag that can reflink when we also have a
>>> 'reflink' flag. I guess I just don't like having a flag with different
>>> meanings depending on context.
>>>
>>> Users should be able to get the default behavior by passing '0' for flags, so
>>> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
>>> an admonishment that one should only use them if they have a goooood reason.
>>> Passing neither gets you reflink-xor-copy, which is what I think we both want
>>> in the general case.
>>
>> I agree here that 0 for flags should do something useful, and I wanted to
>> double check if reflink-xor-copy is a good default behavior.
>
> Ok.
>
>>>
>>> FORBID_REFLINK = 1
>>> FORBID_COPY = 2
>>
>> I don't like the idea of using flags to forbid behavior. I think it would be
>> more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users
>> can tell us what they want, instead of what they don't want.
>
> Seems fine to me.
>
>> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit
>> of a mouthful. Does anybody have suggestions for ways that I could make this
>> shorter?
>
> CFR_REFLINK_ONLY?
That could work! Although I might do as Austin suggests and drop the _ONLY part, and then make the man page clear about what's going on.
Would you expect to trigger a NFS server side copy by passing the pagecache copy flag? Or would that only happen if I pass flags=0?
Anna
>
> --D
>
>>
>> Thanks,
>> Anna
>>
>>> CHECK_SAME = 4
>>> HW_COPY = 8
>>>
>>> DEDUPE = (FORBID_COPY | CHECK_SAME)
>>>
>>> What do you say to that?
>>>
>>>> An example of (b) would be a filesystem backed by deduped
>>>> thinly-provisioned storage that can't do anything about ENOSPC because
>>>> it doesn't control it in the first place.
>>>>
>>>> Another option would be to split up the copy case into "I expect to
>>>> overwrite a lot of the target file soon, so (c) try to commit space
>>>> for that or (d) try to make it time-efficient". Of course, (d) is
>>>> irrelevant on filesystems with no random access (nvdimms, for
>>>> example).
>>>>
>>>> I guess the tl;dr is that I'm highly skeptical of any use for
>>>> disallowing reflinking other than forcibly committing space in cases
>>>> where committing space actually means something.
>>>
>>> That's more or less where I was going too. :)
>>>
>>> --D
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-api" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-10 15:10 ` Anna Schumaker
0 siblings, 0 replies; 121+ messages in thread
From: Anna Schumaker @ 2015-09-10 15:10 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Andy Lutomirski, Pádraig Brady, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils, Austin S Hemmelgarn
On 09/09/2015 05:16 PM, Darrick J. Wong wrote:
> On Wed, Sep 09, 2015 at 02:52:08PM -0400, Anna Schumaker wrote:
>> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
>>> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>>>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>>>> and for performance reasons so that write latencies were
>>>>>>>>> restricted to the copy operation, rather than being
>>>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>>>
>>>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>>>> then could it be done optionally with a flag?
>>>>>>>>
>>>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>>>
>>>>> Eww, different default behaviors depending on the filesystem. :)
>>>>>
>>>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>>>
>>>>>>>> What does everybody think?
>>>>>>>
>>>>>>> I think the best you could do is to have a hint asking politely for
>>>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>>>> right to transparently deduplicate.
>>>>>>>
>>>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>>>> advantage to deep copying unless you actually want two copies for
>>>>>>> locality reasons.
>>>>>>
>>>>>> Agreed. The relink and server side copy are separate things.
>>>>>> There's no advantage to not doing a server side copy,
>>>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>>>> (another reason not previous mentioned in this thread, would be
>>>>>> to avoid ENOSPC errors at some time in the future).
>>>>>>
>>>>>> So having control over the deep copy seems useful.
>>>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>>>> but cp(1) at least will work with whatever is chosen.
>>>>>
>>>>> So far it looks like people are interested in at least these "make data appear
>>>>> in this other place" filesystem operations:
>>>>>
>>>>> 1. reflink
>>>>> 2. reflink, but only if the contents are the same (dedupe)
>>>>
>>>> What I meant by this was: if you ask for "regular copy", you may end
>>>> up with a reflink anyway. Anyway, how can you reflink a range and
>>>> have the contents *not* be the same?
>>>
>>> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
>>> match before, they will afterwards.
>>>
>>> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>>>
>>> Perhaps I should have said "...if the contents are the same before the call"?
>>>
>>>>
>>>>> 3. regular copy
>>>>> 4. regular copy, but make the hardware do it for us
>>>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>>>
>>>> If this comes from me, I have no desire to ever use this as a flag.
>>>
>>> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
>>> a "reallocate all the shared blocks now" op...
>>>
>>>> If someone wants to use chattr or some new operation to say "make this
>>>> range of this file belong just to me for purpose of optimizing future
>>>> writes", then sure, go for it, with the understanding that there are
>>>> plenty of filesystems for which that doesn't even make sense.
>>>
>>> "Unshare these blocks" sounds more like something fallocate could do.
>>>
>>> So far in my XFS reflink playground, it seems that using the defrag tool to
>>> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
>>> fragmented file's data to a second file and use a 'swap extents' operation,
>>> after which the donor file is unlinked.
>>>
>>> Hey, if this syscall turns into a more generic "do something involving two
>>> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
>>> extents" as a 7th operation, to refactor the ioctls. <smirk>
>>>
>>>>
>>>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>>>
>>>>> (Please add whatever ops I missed.)
>>>>>
>>>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>>>> optimization of (3).
>>>>>
>>>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>>>> Either the kernel can satisfy a request or it can't, but let's not just
>>>>> assume that we should transmogrify one type of request into another. Userspace
>>>>> should decide if a reflink failure should turn into one of the copy variants,
>>>>> depending on whether the user wants to spread allocation costs over rewrites or
>>>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>>>> programs find out what actually took place? Or do we simply not allow them to
>>>>> find out?
>>>>>
>>>>> Also, programs that expect reflink either to finish or fail quickly might be
>>>>> surprised if it's possible for reflink to take a longer time than usual and
>>>>> with the side effect that a deep(er) copy was made.
>>>>>
>>>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>>>> kernel, like how we handle it right now.
>>>>>
>>>>
>>>> I think we should focus on what the actual legit use cases might be.
>>>> Certainly we want to support a mode that's "reflink or fail". We
>>>> could have these flags:
>>>>
>>>> COPY_FILE_RANGE_ALLOW_REFLINK
>>>> COPY_FILE_RANGE_ALLOW_COPY
>>>>
>>>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>>>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>>>> ALLOW_COPY will make a best-effort attempt not to reflink but
>>>> expressly permits reflinking in cases where either (a) plain old
>>>> write(2) might also result in a reflink or (b) there is no advantage
>>>> to not reflinking.
>>>
>>> I don't agree with having a 'copy' flag that can reflink when we also have a
>>> 'reflink' flag. I guess I just don't like having a flag with different
>>> meanings depending on context.
>>>
>>> Users should be able to get the default behavior by passing '0' for flags, so
>>> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
>>> an admonishment that one should only use them if they have a goooood reason.
>>> Passing neither gets you reflink-xor-copy, which is what I think we both want
>>> in the general case.
>>
>> I agree here that 0 for flags should do something useful, and I wanted to
>> double check if reflink-xor-copy is a good default behavior.
>
> Ok.
>
>>>
>>> FORBID_REFLINK = 1
>>> FORBID_COPY = 2
>>
>> I don't like the idea of using flags to forbid behavior. I think it would be
>> more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users
>> can tell us what they want, instead of what they don't want.
>
> Seems fine to me.
>
>> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit
>> of a mouthful. Does anybody have suggestions for ways that I could make this
>> shorter?
>
> CFR_REFLINK_ONLY?
That could work! Although I might do as Austin suggests and drop the _ONLY part, and then make the man page clear about what's going on.
Would you expect to trigger a NFS server side copy by passing the pagecache copy flag? Or would that only happen if I pass flags=0?
Anna
>
> --D
>
>>
>> Thanks,
>> Anna
>>
>>> CHECK_SAME = 4
>>> HW_COPY = 8
>>>
>>> DEDUPE = (FORBID_COPY | CHECK_SAME)
>>>
>>> What do you say to that?
>>>
>>>> An example of (b) would be a filesystem backed by deduped
>>>> thinly-provisioned storage that can't do anything about ENOSPC because
>>>> it doesn't control it in the first place.
>>>>
>>>> Another option would be to split up the copy case into "I expect to
>>>> overwrite a lot of the target file soon, so (c) try to commit space
>>>> for that or (d) try to make it time-efficient". Of course, (d) is
>>>> irrelevant on filesystems with no random access (nvdimms, for
>>>> example).
>>>>
>>>> I guess the tl;dr is that I'm highly skeptical of any use for
>>>> disallowing reflinking other than forcibly committing space in cases
>>>> where committing space actually means something.
>>>
>>> That's more or less where I was going too. :)
>>>
>>> --D
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-api" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 121+ messages in thread
[parent not found: <55F19D7F.5090907-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-10 15:10 ` Anna Schumaker
@ 2015-09-10 15:49 ` Austin S Hemmelgarn
-1 siblings, 0 replies; 121+ messages in thread
From: Austin S Hemmelgarn @ 2015-09-10 15:49 UTC (permalink / raw)
To: Anna Schumaker, Darrick J. Wong
Cc: Andy Lutomirski, Pádraig Brady,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Michael Kerrisk-manpages, andros-HgOvQuBEEgTQT0dZR+AlfA,
Christoph Hellwig, Coreutils
[-- Attachment #1: Type: text/plain, Size: 10045 bytes --]
On 2015-09-10 11:10, Anna Schumaker wrote:
> On 09/09/2015 05:16 PM, Darrick J. Wong wrote:
>> On Wed, Sep 09, 2015 at 02:52:08PM -0400, Anna Schumaker wrote:
>>> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
>>>> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>>>>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>>>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>>>>> <Anna.Schumaker-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org> wrote:
>>>>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>>>>> and for performance reasons so that write latencies were
>>>>>>>>>> restricted to the copy operation, rather than being
>>>>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>>>>
>>>>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>>>>> then could it be done optionally with a flag?
>>>>>>>>>
>>>>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>>>>
>>>>>> Eww, different default behaviors depending on the filesystem. :)
>>>>>>
>>>>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>>>>
>>>>>>>>> What does everybody think?
>>>>>>>>
>>>>>>>> I think the best you could do is to have a hint asking politely for
>>>>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>>>>> right to transparently deduplicate.
>>>>>>>>
>>>>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>>>>> advantage to deep copying unless you actually want two copies for
>>>>>>>> locality reasons.
>>>>>>>
>>>>>>> Agreed. The relink and server side copy are separate things.
>>>>>>> There's no advantage to not doing a server side copy,
>>>>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>>>>> (another reason not previous mentioned in this thread, would be
>>>>>>> to avoid ENOSPC errors at some time in the future).
>>>>>>>
>>>>>>> So having control over the deep copy seems useful.
>>>>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>>>>> but cp(1) at least will work with whatever is chosen.
>>>>>>
>>>>>> So far it looks like people are interested in at least these "make data appear
>>>>>> in this other place" filesystem operations:
>>>>>>
>>>>>> 1. reflink
>>>>>> 2. reflink, but only if the contents are the same (dedupe)
>>>>>
>>>>> What I meant by this was: if you ask for "regular copy", you may end
>>>>> up with a reflink anyway. Anyway, how can you reflink a range and
>>>>> have the contents *not* be the same?
>>>>
>>>> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
>>>> match before, they will afterwards.
>>>>
>>>> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>>>>
>>>> Perhaps I should have said "...if the contents are the same before the call"?
>>>>
>>>>>
>>>>>> 3. regular copy
>>>>>> 4. regular copy, but make the hardware do it for us
>>>>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>>>>
>>>>> If this comes from me, I have no desire to ever use this as a flag.
>>>>
>>>> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
>>>> a "reallocate all the shared blocks now" op...
>>>>
>>>>> If someone wants to use chattr or some new operation to say "make this
>>>>> range of this file belong just to me for purpose of optimizing future
>>>>> writes", then sure, go for it, with the understanding that there are
>>>>> plenty of filesystems for which that doesn't even make sense.
>>>>
>>>> "Unshare these blocks" sounds more like something fallocate could do.
>>>>
>>>> So far in my XFS reflink playground, it seems that using the defrag tool to
>>>> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
>>>> fragmented file's data to a second file and use a 'swap extents' operation,
>>>> after which the donor file is unlinked.
>>>>
>>>> Hey, if this syscall turns into a more generic "do something involving two
>>>> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
>>>> extents" as a 7th operation, to refactor the ioctls. <smirk>
>>>>
>>>>>
>>>>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>>>>
>>>>>> (Please add whatever ops I missed.)
>>>>>>
>>>>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>>>>> optimization of (3).
>>>>>>
>>>>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>>>>> Either the kernel can satisfy a request or it can't, but let's not just
>>>>>> assume that we should transmogrify one type of request into another. Userspace
>>>>>> should decide if a reflink failure should turn into one of the copy variants,
>>>>>> depending on whether the user wants to spread allocation costs over rewrites or
>>>>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>>>>> programs find out what actually took place? Or do we simply not allow them to
>>>>>> find out?
>>>>>>
>>>>>> Also, programs that expect reflink either to finish or fail quickly might be
>>>>>> surprised if it's possible for reflink to take a longer time than usual and
>>>>>> with the side effect that a deep(er) copy was made.
>>>>>>
>>>>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>>>>> kernel, like how we handle it right now.
>>>>>>
>>>>>
>>>>> I think we should focus on what the actual legit use cases might be.
>>>>> Certainly we want to support a mode that's "reflink or fail". We
>>>>> could have these flags:
>>>>>
>>>>> COPY_FILE_RANGE_ALLOW_REFLINK
>>>>> COPY_FILE_RANGE_ALLOW_COPY
>>>>>
>>>>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>>>>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>>>>> ALLOW_COPY will make a best-effort attempt not to reflink but
>>>>> expressly permits reflinking in cases where either (a) plain old
>>>>> write(2) might also result in a reflink or (b) there is no advantage
>>>>> to not reflinking.
>>>>
>>>> I don't agree with having a 'copy' flag that can reflink when we also have a
>>>> 'reflink' flag. I guess I just don't like having a flag with different
>>>> meanings depending on context.
>>>>
>>>> Users should be able to get the default behavior by passing '0' for flags, so
>>>> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
>>>> an admonishment that one should only use them if they have a goooood reason.
>>>> Passing neither gets you reflink-xor-copy, which is what I think we both want
>>>> in the general case.
>>>
>>> I agree here that 0 for flags should do something useful, and I wanted to
>>> double check if reflink-xor-copy is a good default behavior.
>>
>> Ok.
>>
>>>>
>>>> FORBID_REFLINK = 1
>>>> FORBID_COPY = 2
>>>
>>> I don't like the idea of using flags to forbid behavior. I think it would be
>>> more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users
>>> can tell us what they want, instead of what they don't want.
>>
>> Seems fine to me.
>>
>>> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit
>>> of a mouthful. Does anybody have suggestions for ways that I could make this
>>> shorter?
>>
>> CFR_REFLINK_ONLY?
>
> That could work! Although I might do as Austin suggests and drop the _ONLY part, and then make the man page clear about what's going on.
>
> Would you expect to trigger a NFS server side copy by passing the pagecache copy flag? Or would that only happen if I pass flags=0?
Personally, I would think that an NFS server side copy could be counted
under the 'hardware assisted' flag. From the point of view of an NFS
client, the NFS server is a (usually) opaque piece of storage hardware,
similar to a local disk drive in that you pass commands to it and get
responses, the only real difference is that NFS is a much higher level
protocol than for example SCSI.
>>
>> --D
>>
>>>
>>> Thanks,
>>> Anna
>>>
>>>> CHECK_SAME = 4
>>>> HW_COPY = 8
>>>>
>>>> DEDUPE = (FORBID_COPY | CHECK_SAME)
>>>>
>>>> What do you say to that?
>>>>
>>>>> An example of (b) would be a filesystem backed by deduped
>>>>> thinly-provisioned storage that can't do anything about ENOSPC because
>>>>> it doesn't control it in the first place.
>>>>>
>>>>> Another option would be to split up the copy case into "I expect to
>>>>> overwrite a lot of the target file soon, so (c) try to commit space
>>>>> for that or (d) try to make it time-efficient". Of course, (d) is
>>>>> irrelevant on filesystems with no random access (nvdimms, for
>>>>> example).
>>>>>
>>>>> I guess the tl;dr is that I'm highly skeptical of any use for
>>>>> disallowing reflinking other than forcibly committing space in cases
>>>>> where committing space actually means something.
>>>>
>>>> That's more or less where I was going too. :)
>>>>
>>>> --D
>>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-api" in
>>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 3019 bytes --]
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
@ 2015-09-10 15:49 ` Austin S Hemmelgarn
0 siblings, 0 replies; 121+ messages in thread
From: Austin S Hemmelgarn @ 2015-09-10 15:49 UTC (permalink / raw)
To: Anna Schumaker, Darrick J. Wong
Cc: Andy Lutomirski, Pádraig Brady, linux-nfs,
Linux btrfs Developers List, Linux FS Devel, Linux API,
Zach Brown, Al Viro, Chris Mason, Michael Kerrisk-manpages,
andros, Christoph Hellwig, Coreutils
[-- Attachment #1: Type: text/plain, Size: 9992 bytes --]
On 2015-09-10 11:10, Anna Schumaker wrote:
> On 09/09/2015 05:16 PM, Darrick J. Wong wrote:
>> On Wed, Sep 09, 2015 at 02:52:08PM -0400, Anna Schumaker wrote:
>>> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
>>>> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>>>>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>>>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>>>>> and for performance reasons so that write latencies were
>>>>>>>>>> restricted to the copy operation, rather than being
>>>>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>>>>
>>>>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>>>>> then could it be done optionally with a flag?
>>>>>>>>>
>>>>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>>>>
>>>>>> Eww, different default behaviors depending on the filesystem. :)
>>>>>>
>>>>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>>>>
>>>>>>>>> What does everybody think?
>>>>>>>>
>>>>>>>> I think the best you could do is to have a hint asking politely for
>>>>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>>>>> right to transparently deduplicate.
>>>>>>>>
>>>>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>>>>> advantage to deep copying unless you actually want two copies for
>>>>>>>> locality reasons.
>>>>>>>
>>>>>>> Agreed. The relink and server side copy are separate things.
>>>>>>> There's no advantage to not doing a server side copy,
>>>>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>>>>> (another reason not previous mentioned in this thread, would be
>>>>>>> to avoid ENOSPC errors at some time in the future).
>>>>>>>
>>>>>>> So having control over the deep copy seems useful.
>>>>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>>>>> but cp(1) at least will work with whatever is chosen.
>>>>>>
>>>>>> So far it looks like people are interested in at least these "make data appear
>>>>>> in this other place" filesystem operations:
>>>>>>
>>>>>> 1. reflink
>>>>>> 2. reflink, but only if the contents are the same (dedupe)
>>>>>
>>>>> What I meant by this was: if you ask for "regular copy", you may end
>>>>> up with a reflink anyway. Anyway, how can you reflink a range and
>>>>> have the contents *not* be the same?
>>>>
>>>> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
>>>> match before, they will afterwards.
>>>>
>>>> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>>>>
>>>> Perhaps I should have said "...if the contents are the same before the call"?
>>>>
>>>>>
>>>>>> 3. regular copy
>>>>>> 4. regular copy, but make the hardware do it for us
>>>>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>>>>
>>>>> If this comes from me, I have no desire to ever use this as a flag.
>>>>
>>>> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
>>>> a "reallocate all the shared blocks now" op...
>>>>
>>>>> If someone wants to use chattr or some new operation to say "make this
>>>>> range of this file belong just to me for purpose of optimizing future
>>>>> writes", then sure, go for it, with the understanding that there are
>>>>> plenty of filesystems for which that doesn't even make sense.
>>>>
>>>> "Unshare these blocks" sounds more like something fallocate could do.
>>>>
>>>> So far in my XFS reflink playground, it seems that using the defrag tool to
>>>> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
>>>> fragmented file's data to a second file and use a 'swap extents' operation,
>>>> after which the donor file is unlinked.
>>>>
>>>> Hey, if this syscall turns into a more generic "do something involving two
>>>> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
>>>> extents" as a 7th operation, to refactor the ioctls. <smirk>
>>>>
>>>>>
>>>>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>>>>
>>>>>> (Please add whatever ops I missed.)
>>>>>>
>>>>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>>>>> optimization of (3).
>>>>>>
>>>>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>>>>> Either the kernel can satisfy a request or it can't, but let's not just
>>>>>> assume that we should transmogrify one type of request into another. Userspace
>>>>>> should decide if a reflink failure should turn into one of the copy variants,
>>>>>> depending on whether the user wants to spread allocation costs over rewrites or
>>>>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>>>>> programs find out what actually took place? Or do we simply not allow them to
>>>>>> find out?
>>>>>>
>>>>>> Also, programs that expect reflink either to finish or fail quickly might be
>>>>>> surprised if it's possible for reflink to take a longer time than usual and
>>>>>> with the side effect that a deep(er) copy was made.
>>>>>>
>>>>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>>>>> kernel, like how we handle it right now.
>>>>>>
>>>>>
>>>>> I think we should focus on what the actual legit use cases might be.
>>>>> Certainly we want to support a mode that's "reflink or fail". We
>>>>> could have these flags:
>>>>>
>>>>> COPY_FILE_RANGE_ALLOW_REFLINK
>>>>> COPY_FILE_RANGE_ALLOW_COPY
>>>>>
>>>>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>>>>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>>>>> ALLOW_COPY will make a best-effort attempt not to reflink but
>>>>> expressly permits reflinking in cases where either (a) plain old
>>>>> write(2) might also result in a reflink or (b) there is no advantage
>>>>> to not reflinking.
>>>>
>>>> I don't agree with having a 'copy' flag that can reflink when we also have a
>>>> 'reflink' flag. I guess I just don't like having a flag with different
>>>> meanings depending on context.
>>>>
>>>> Users should be able to get the default behavior by passing '0' for flags, so
>>>> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
>>>> an admonishment that one should only use them if they have a goooood reason.
>>>> Passing neither gets you reflink-xor-copy, which is what I think we both want
>>>> in the general case.
>>>
>>> I agree here that 0 for flags should do something useful, and I wanted to
>>> double check if reflink-xor-copy is a good default behavior.
>>
>> Ok.
>>
>>>>
>>>> FORBID_REFLINK = 1
>>>> FORBID_COPY = 2
>>>
>>> I don't like the idea of using flags to forbid behavior. I think it would be
>>> more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users
>>> can tell us what they want, instead of what they don't want.
>>
>> Seems fine to me.
>>
>>> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit
>>> of a mouthful. Does anybody have suggestions for ways that I could make this
>>> shorter?
>>
>> CFR_REFLINK_ONLY?
>
> That could work! Although I might do as Austin suggests and drop the _ONLY part, and then make the man page clear about what's going on.
>
> Would you expect to trigger a NFS server side copy by passing the pagecache copy flag? Or would that only happen if I pass flags=0?
Personally, I would think that an NFS server side copy could be counted
under the 'hardware assisted' flag. From the point of view of an NFS
client, the NFS server is a (usually) opaque piece of storage hardware,
similar to a local disk drive in that you pass commands to it and get
responses, the only real difference is that NFS is a much higher level
protocol than for example SCSI.
>>
>> --D
>>
>>>
>>> Thanks,
>>> Anna
>>>
>>>> CHECK_SAME = 4
>>>> HW_COPY = 8
>>>>
>>>> DEDUPE = (FORBID_COPY | CHECK_SAME)
>>>>
>>>> What do you say to that?
>>>>
>>>>> An example of (b) would be a filesystem backed by deduped
>>>>> thinly-provisioned storage that can't do anything about ENOSPC because
>>>>> it doesn't control it in the first place.
>>>>>
>>>>> Another option would be to split up the copy case into "I expect to
>>>>> overwrite a lot of the target file soon, so (c) try to commit space
>>>>> for that or (d) try to make it time-efficient". Of course, (d) is
>>>>> irrelevant on filesystems with no random access (nvdimms, for
>>>>> example).
>>>>>
>>>>> I guess the tl;dr is that I'm highly skeptical of any use for
>>>>> disallowing reflinking other than forcibly committing space in cases
>>>>> where committing space actually means something.
>>>>
>>>> That's more or less where I was going too. :)
>>>>
>>>> --D
>>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-api" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 3019 bytes --]
^ permalink raw reply [flat|nested] 121+ messages in thread
* Re: [PATCH v1 0/8] VFS: In-kernel copy system call
2015-09-09 18:52 ` Anna Schumaker
(?)
(?)
@ 2015-09-10 11:40 ` Austin S Hemmelgarn
-1 siblings, 0 replies; 121+ messages in thread
From: Austin S Hemmelgarn @ 2015-09-10 11:40 UTC (permalink / raw)
To: Anna Schumaker, Darrick J. Wong, Andy Lutomirski
Cc: Pádraig Brady, linux-nfs, Linux btrfs Developers List,
Linux FS Devel, Linux API, Zach Brown, Al Viro, Chris Mason,
Michael Kerrisk-manpages, andros, Christoph Hellwig, Coreutils
[-- Attachment #1: Type: text/plain, Size: 8794 bytes --]
On 2015-09-09 14:52, Anna Schumaker wrote:
> On 09/08/2015 06:39 PM, Darrick J. Wong wrote:
>> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
>>>> On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>>>>> On 08/09/15 20:10, Andy Lutomirski wrote:
>>>>>> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>>>>>>> I see copy_file_range() is a reflink() on BTRFS?
>>>>>>>> That's a bit surprising, as it avoids the copy completely.
>>>>>>>> cp(1) for example considered doing a BTRFS clone by default,
>>>>>>>> but didn't due to expectations that users actually wanted
>>>>>>>> the data duplicated on disk for resilience reasons,
>>>>>>>> and for performance reasons so that write latencies were
>>>>>>>> restricted to the copy operation, rather than being
>>>>>>>> introduced at usage time as the dest file is CoW'd.
>>>>>>>>
>>>>>>>> If reflink() is a possibility for copy_file_range()
>>>>>>>> then could it be done optionally with a flag?
>>>>>>>
>>>>>>> The idea is that filesystems get to choose how to handle copies in the
>>>>>>> default case. BTRFS could do a reflink, but NFS could do a server side
>>>>
>>>> Eww, different default behaviors depending on the filesystem. :)
>>>>
>>>>>>> copy instead. I can change the default behavior to only do a data copy
>>>>>>> (unless the reflink flag is specified) instead, if that is desirable.
>>>>>>>
>>>>>>> What does everybody think?
>>>>>>
>>>>>> I think the best you could do is to have a hint asking politely for
>>>>>> the data to be deep-copied. After all, some filesystems reserve the
>>>>>> right to transparently deduplicate.
>>>>>>
>>>>>> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>>>>>> advantage to deep copying unless you actually want two copies for
>>>>>> locality reasons.
>>>>>
>>>>> Agreed. The relink and server side copy are separate things.
>>>>> There's no advantage to not doing a server side copy,
>>>>> but as mentioned there may be advantages to doing deep copies on BTRFS
>>>>> (another reason not previous mentioned in this thread, would be
>>>>> to avoid ENOSPC errors at some time in the future).
>>>>>
>>>>> So having control over the deep copy seems useful.
>>>>> It's debatable whether ALLOW_REFLINK should be on/off by default
>>>>> for copy_file_range(). I'd be inclined to have such a setting off by default,
>>>>> but cp(1) at least will work with whatever is chosen.
>>>>
>>>> So far it looks like people are interested in at least these "make data appear
>>>> in this other place" filesystem operations:
>>>>
>>>> 1. reflink
>>>> 2. reflink, but only if the contents are the same (dedupe)
>>>
>>> What I meant by this was: if you ask for "regular copy", you may end
>>> up with a reflink anyway. Anyway, how can you reflink a range and
>>> have the contents *not* be the same?
>>
>> reflink forcibly remaps fd_dest's range to fd_src's range. If they didn't
>> match before, they will afterwards.
>>
>> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>>
>> Perhaps I should have said "...if the contents are the same before the call"?
>>
>>>
>>>> 3. regular copy
>>>> 4. regular copy, but make the hardware do it for us
>>>> 5. regular copy, but require a second copy on the media (no-dedupe)
>>>
>>> If this comes from me, I have no desire to ever use this as a flag.
>>
>> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
>> a "reallocate all the shared blocks now" op...
>>
>>> If someone wants to use chattr or some new operation to say "make this
>>> range of this file belong just to me for purpose of optimizing future
>>> writes", then sure, go for it, with the understanding that there are
>>> plenty of filesystems for which that doesn't even make sense.
>>
>> "Unshare these blocks" sounds more like something fallocate could do.
>>
>> So far in my XFS reflink playground, it seems that using the defrag tool to
>> un-cow a file makes most sense. AFAICT the XFS and ext4 defraggers copy a
>> fragmented file's data to a second file and use a 'swap extents' operation,
>> after which the donor file is unlinked.
>>
>> Hey, if this syscall turns into a more generic "do something involving two
>> (fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
>> extents" as a 7th operation, to refactor the ioctls. <smirk>
>>
>>>
>>>> 6. regular copy, but don't CoW (eatmyothercopies) (joke)
>>>>
>>>> (Please add whatever ops I missed.)
>>>>
>>>> I think I can see a case for letting (4) fall back to (3) since (4) is an
>>>> optimization of (3).
>>>>
>>>> However, I particularly don't like the idea of (1) falling back to (3-5).
>>>> Either the kernel can satisfy a request or it can't, but let's not just
>>>> assume that we should transmogrify one type of request into another. Userspace
>>>> should decide if a reflink failure should turn into one of the copy variants,
>>>> depending on whether the user wants to spread allocation costs over rewrites or
>>>> pay it all up front. Also, if we allow reflink to fall back to copy, how do
>>>> programs find out what actually took place? Or do we simply not allow them to
>>>> find out?
>>>>
>>>> Also, programs that expect reflink either to finish or fail quickly might be
>>>> surprised if it's possible for reflink to take a longer time than usual and
>>>> with the side effect that a deep(er) copy was made.
>>>>
>>>> I guess if someone asks for both (1) and (3) we can do the fallback in the
>>>> kernel, like how we handle it right now.
>>>>
>>>
>>> I think we should focus on what the actual legit use cases might be.
>>> Certainly we want to support a mode that's "reflink or fail". We
>>> could have these flags:
>>>
>>> COPY_FILE_RANGE_ALLOW_REFLINK
>>> COPY_FILE_RANGE_ALLOW_COPY
>>>
>>> Setting neither gets -EINVAL. Setting both works as is. Setting just
>>> ALLOW_REFLINK will fail if a reflink can't be supported. Setting just
>>> ALLOW_COPY will make a best-effort attempt not to reflink but
>>> expressly permits reflinking in cases where either (a) plain old
>>> write(2) might also result in a reflink or (b) there is no advantage
>>> to not reflinking.
>>
>> I don't agree with having a 'copy' flag that can reflink when we also have a
>> 'reflink' flag. I guess I just don't like having a flag with different
>> meanings depending on context.
>>
>> Users should be able to get the default behavior by passing '0' for flags, so
>> provide FORBID_REFLINK and FORBID_COPY flags to turn off those behaviors, with
>> an admonishment that one should only use them if they have a goooood reason.
>> Passing neither gets you reflink-xor-copy, which is what I think we both want
>> in the general case.
>
> I agree here that 0 for flags should do something useful, and I wanted to double check if reflink-xor-copy is a good default behavior.
>
>>
>> FORBID_REFLINK = 1
>> FORBID_COPY = 2
>
> I don't like the idea of using flags to forbid behavior. I think it would be more straightforward to have flags like REFLINK_ONLY or COPY_ONLY so users can tell us what they want, instead of what they don't want.
>
> While I'm thinking about flags, COPY_FILE_RANGE_REFLINK_ONLY would be a bit of a mouthful. Does anybody have suggestions for ways that I could make this shorter?
>
If (and only if) it's _very_ well documented, you could probably drop
the _ONLY part.
>
>> CHECK_SAME = 4
>> HW_COPY = 8
>>
>> DEDUPE = (FORBID_COPY | CHECK_SAME)
>>
>> What do you say to that?
>>
>>> An example of (b) would be a filesystem backed by deduped
>>> thinly-provisioned storage that can't do anything about ENOSPC because
>>> it doesn't control it in the first place.
>>>
>>> Another option would be to split up the copy case into "I expect to
>>> overwrite a lot of the target file soon, so (c) try to commit space
>>> for that or (d) try to make it time-efficient". Of course, (d) is
>>> irrelevant on filesystems with no random access (nvdimms, for
>>> example).
>>>
>>> I guess the tl;dr is that I'm highly skeptical of any use for
>>> disallowing reflinking other than forcibly committing space in cases
>>> where committing space actually means something.
>>
>> That's more or less where I was going too. :)
>>
>> --D
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 3019 bytes --]
^ permalink raw reply [flat|nested] 121+ messages in thread