From: Anna Schumaker <Anna.Schumaker@netapp.com>
To: linux-nfs@vger.kernel.org, linux-btrfs@vger.kernel.org,
linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org,
zab@zabbo.net, viro@zeniv.linux.org.uk, clm@fb.com,
darrick.wong@oracle.com, mtk.manpages@gmail.com,
andros@netapp.com, hch@infradead.org
Subject: [PATCH v3 1/9] vfs: add copy_file_range syscall and vfs helper
Date: Fri, 25 Sep 2015 16:48:07 -0400 [thread overview]
Message-ID: <1443214096-12769-2-git-send-email-Anna.Schumaker@Netapp.com> (raw)
In-Reply-To: <1443214096-12769-1-git-send-email-Anna.Schumaker@Netapp.com>
From: Zach Brown <zab@redhat.com>
Add a copy_file_range() system call for offloading copies between
regular files.
This gives an interface to underlying layers of the storage stack which
can copy without reading and writing all the data. There are a few
candidates that should support copy offloading in the nearer term:
- btrfs shares extent references with its clone ioctl
- NFS has patches to add a COPY command which copies on the server
- SCSI has a family of XCOPY commands which copy in the device
This system call avoids the complexity of also accelerating the creation
of the destination file by operating on an existing destination file
descriptor, not a path.
Currently the high level vfs entry point limits copy offloading to files
on the same mount and super (and not in the same file). This can be
relaxed if we get implementations which can copy between file systems
safely.
Signed-off-by: Zach Brown <zab@redhat.com>
[Anna Schumaker: Change -EINVAL to -EBADF during file verification]
[Anna Schumaker: Change flags parameter from int to unsigned int]
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
v3:
- Change flags parameter to take an unsigned int instead of an int
---
fs/read_write.c | 129 ++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/uapi/asm-generic/unistd.h | 2 +
kernel/sys_ni.c | 1 +
4 files changed, 135 insertions(+)
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3f..dd10750 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -1327,3 +1328,131 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows return partial success. When it does so is up to
+ * the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode_in;
+ struct inode *inode_out;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op || !file_in->f_op->copy_file_range)
+ return -EBADF;
+
+ inode_in = file_inode(file_in);
+ inode_out = file_inode(file_out);
+
+ /* make sure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ /* forbid ranges in the same file */
+ if (inode_in == inode_out)
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
+ len, flags);
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret;
+
+ f_in = fdget(fd_in);
+ f_out = fdget(fd_out);
+ if (!f_in.file || !f_out.file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+out:
+ fdput(f_in);
+ fdput(f_out);
+ return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 72d8a84..6220307 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1642,6 +1642,7 @@ struct file_operations {
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
};
struct inode_operations {
@@ -1695,6 +1696,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
+ loff_t, size_t, unsigned int);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index ee12400..078bd21 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -713,6 +713,8 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
#define __NR_membarrier 283
__SYSCALL(__NR_membarrier, sys_membarrier)
+#define __NR_copy_file_range 283
+__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
#undef __NR_syscalls
#define __NR_syscalls 284
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a02decf..83c5c82 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
2.5.3
next prev parent reply other threads:[~2015-09-25 20:48 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-09-25 20:48 [PATCH v3 0/9] VFS: In-kernel copy system call Anna Schumaker
2015-09-25 20:48 ` Anna Schumaker [this message]
2015-09-25 20:48 ` [PATCH v3 2/9] x86: add sys_copy_file_range to syscall tables Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 3/9] btrfs: add .copy_file_range file operation Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 4/9] vfs: Copy should check len after file open mode Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 5/9] vfs: Copy shouldn't forbid ranges inside the same file Anna Schumaker
2015-09-28 18:46 ` Darrick J. Wong
[not found] ` <1443214096-12769-1-git-send-email-Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-25 20:48 ` [PATCH v3 6/9] vfs: Copy should use file_out rather than file_in Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 7/9] vfs: Remove copy_file_range mountpoint checks Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 8/9] vfs: copy_file_range() can do a pagecache copy with splice Anna Schumaker
[not found] ` <1443214096-12769-9-git-send-email-Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-25 21:58 ` Pádraig Brady
2015-09-28 18:31 ` Darrick J. Wong
[not found] ` <20150928183152.GB850-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
2015-09-28 18:35 ` Anna Schumaker
2015-09-26 0:14 ` Andy Lutomirski
2015-09-28 14:28 ` Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 9/9] btrfs: btrfs_copy_file_range() only supports reflinks Anna Schumaker
2015-09-25 20:48 ` [PATCH v3 10/9] copy_file_range.2: New page documenting copy_file_range() Anna Schumaker
2015-09-28 18:40 ` Darrick J. Wong
2015-09-28 20:42 ` Anna Schumaker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1443214096-12769-2-git-send-email-Anna.Schumaker@Netapp.com \
--to=anna.schumaker@netapp.com \
--cc=andros@netapp.com \
--cc=clm@fb.com \
--cc=darrick.wong@oracle.com \
--cc=hch@infradead.org \
--cc=linux-api@vger.kernel.org \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=mtk.manpages@gmail.com \
--cc=viro@zeniv.linux.org.uk \
--cc=zab@zabbo.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).