From: Zach Brown <zab@redhat.com>
To: "Martin K. Petersen" <martin.petersen@oracle.com>,
Trond Myklebust <Trond.Myklebust@netapp.com>,
linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-btrfs@vger.kernel.org, linux-nfs@vger.kernel.org
Subject: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
Date: Tue, 14 May 2013 14:15:23 -0700 [thread overview]
Message-ID: <1368566126-17610-2-git-send-email-zab@redhat.com> (raw)
In-Reply-To: <1368566126-17610-1-git-send-email-zab@redhat.com>
This adds a syscall and vfs entry point for copy_range which offloads
data copying between existing files.
The syscall is a thin wrapper around the vfs entry point. Its arguments
are inspired by sys_splice().
The behaviour of the vfs helper is derived from the current btrfs
CLONE_RANGE ioctl.
---
fs/Makefile | 2 +-
fs/copy_range.c | 127 ++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
5 files changed, 135 insertions(+), 2 deletions(-)
create mode 100644 fs/copy_range.c
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3..1be83b3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o
+ stack.o fs_struct.o statfs.o copy_range.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/copy_range.c b/fs/copy_range.c
new file mode 100644
index 0000000..3000b9f
--- /dev/null
+++ b/fs/copy_range.c
@@ -0,0 +1,127 @@
+/*
+ * "copy_range": offload data copying between existing files
+ *
+ * Copyright (C) 2013 Zach Brown <zab@redhat.com>
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/fsnotify.h>
+
+/**
+ * vfs_copy_range - copy range of bytes from source file to existing file
+ * @file_in: source regular file
+ * @pos_in: starting byte offset to copy from the source file
+ * @file_out: destination regular file
+ * @pos_out: starting byte offset to copy to in the destination file
+ * @count: number of bytes to copy
+ *
+ * Returns number of bytes successfully copied from the start of the range or
+ * a negative errno error value.  A @count of zero returns 0 before any
+ * other check is made.
+ *
+ * The number of bytes successfully written can be less than the input
+ * count if an error is encountered.  In this partial success case the
+ * contents of the destination range after the copied bytes can be a mix
+ * of pre-existing bytes, bytes from the source range, or zeros,
+ * depending on the implementation.
+ *
+ * The source range must be entirely within i_size in the source file.
+ * A destination range outside of the size of the destination file will
+ * extend its size.
+ *
+ * Errors generated directly by this function (rw_verify_area(),
+ * mnt_want_write_file() and the ->copy_range method may return others):
+ *  -EBADF   @file_in is not open for reading or @file_out is not open
+ *           for writing
+ *  -EINVAL  @file_out is O_APPEND, @file_in has no ->copy_range method,
+ *           an offset plus @count wraps, the source range extends past
+ *           i_size, either file is not a regular file, or both files
+ *           refer to the same inode
+ *  -EISDIR  either file is a directory
+ *  -EXDEV   the files are on different superblocks or mounts
+ */
+ssize_t vfs_copy_range(struct file *file_in, loff_t pos_in,
+		       struct file *file_out, loff_t pos_out,
+		       size_t count)
+{
+	struct inode *inode_in;
+	struct inode *inode_out;
+	ssize_t ret;
+
+	if (count == 0)
+		return 0;
+
+	/*
+	 * copy_range allows full ssize_t count, ignoring MAX_RW_COUNT: only
+	 * a negative return is acted on, a non-negative one is discarded.
+	 */
+	ret = rw_verify_area(READ, file_in, &pos_in, count);
+	if (ret >= 0)
+		ret = rw_verify_area(WRITE, file_out, &pos_out, count);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * A descriptor opened in the wrong mode is a bad descriptor rather
+	 * than a bad argument, so report -EBADF like the syscall wrapper
+	 * does for descriptors that do not resolve to a file at all.
+	 */
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	/* explicit offsets make no sense for append-only destinations */
+	if ((file_out->f_flags & O_APPEND) ||
+	    !file_in->f_op || !file_in->f_op->copy_range)
+		return -EINVAL;
+
+	inode_in = file_inode(file_in);
+	inode_out = file_inode(file_out);
+
+	/* make sure offsets don't wrap and the input is inside i_size */
+	if (pos_in + count < pos_in || pos_out + count < pos_out ||
+	    pos_in + count > i_size_read(inode_in))
+		return -EINVAL;
+
+	/* XXX do we want this test?  btrfs_ioctl_clone_range() */
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	/* the method is taken from file_in, so both ends must share a mount */
+	if (inode_in->i_sb != inode_out->i_sb ||
+	    file_in->f_path.mnt != file_out->f_path.mnt)
+		return -EXDEV;
+
+	/* forbid ranges in the same file for now */
+	if (inode_in == inode_out)
+		return -EINVAL;
+
+	ret = mnt_want_write_file(file_out);
+	if (ret)
+		return ret;
+
+	ret = file_in->f_op->copy_range(file_in, pos_in, file_out, pos_out,
+					count);
+	/* byte accounting and notifications only when something was copied */
+	if (ret > 0) {
+		fsnotify_access(file_in);
+		add_rchar(current, ret);
+		fsnotify_modify(file_out);
+		add_wchar(current, ret);
+	}
+	/* the attempt is counted as one read and one write call regardless */
+	inc_syscr(current);
+	inc_syscw(current);
+
+	mnt_drop_write_file(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(vfs_copy_range);
+
+/*
+ * copy_range syscall: thin wrapper around vfs_copy_range().
+ *
+ * Both offsets are read from user memory (-EFAULT if either read fails)
+ * and passed by value; nothing is written back through @upos_in or
+ * @upos_out.  If either descriptor does not resolve to a file the call
+ * fails with -EBADF, otherwise the result of vfs_copy_range() is returned.
+ */
+SYSCALL_DEFINE5(copy_range, int, fd_in, loff_t __user *, upos_in,
+		int, fd_out, loff_t __user *, upos_out, size_t, count)
+{
+	struct fd src;
+	struct fd dst;
+	loff_t off_in;
+	loff_t off_out;
+	ssize_t copied = -EBADF;
+
+	if (get_user(off_in, upos_in))
+		return -EFAULT;
+	if (get_user(off_out, upos_out))
+		return -EFAULT;
+
+	/* both lookups are always performed so both can always be put */
+	src = fdget(fd_in);
+	dst = fdget(fd_out);
+
+	if (src.file && dst.file)
+		copied = vfs_copy_range(src.file, off_in, dst.file, off_out,
+					count);
+
+	fdput(src);
+	fdput(dst);
+
+	return copied;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 43db02e..6214893 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1543,6 +1543,7 @@ struct file_operations {
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
int (*show_fdinfo)(struct seq_file *m, struct file *f);
+ ssize_t (*copy_range)(struct file *, loff_t, struct file *, loff_t, size_t);
};
struct inode_operations {
@@ -1588,6 +1589,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern ssize_t vfs_copy_range(struct file *, loff_t , struct file *, loff_t,
+ size_t);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 0cc74c4..3935d1c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -692,9 +692,11 @@ __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
__SYSCALL(__NR_kcmp, sys_kcmp)
#define __NR_finit_module 273
__SYSCALL(__NR_finit_module, sys_finit_module)
+#define __NR_copy_range 274
+__SYSCALL(__NR_copy_range, sys_copy_range)
#undef __NR_syscalls
-#define __NR_syscalls 274
+#define __NR_syscalls 275
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052..af7808a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -151,6 +151,7 @@ cond_syscall(sys_process_vm_readv);
cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
+cond_syscall(sys_copy_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
1.7.11.7
next prev parent reply other threads:[~2013-05-14 21:15 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-05-14 21:15 [RFC v0 0/4] sys_copy_range() rough draft Zach Brown
2013-05-14 21:15 ` Zach Brown [this message]
2013-05-15 19:44 ` [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point Eric Wong
2013-05-15 20:03 ` Zach Brown
2013-05-16 21:16 ` Ric Wheeler
2013-05-21 19:47 ` Eric Wong
2013-05-21 19:50 ` Zach Brown
2013-05-14 21:15 ` [RFC v0 2/4] x86: add sys_copy_range to syscall tables Zach Brown
2013-05-14 21:15 ` [RFC v0 3/4] btrfs: add .copy_range file operation Zach Brown
[not found] ` <1368566126-17610-1-git-send-email-zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2013-05-14 21:15 ` [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support Zach Brown
[not found] ` <1368566126-17610-5-git-send-email-zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2013-05-15 20:19 ` J. Bruce Fields
[not found] ` <20130515201949.GD25994-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2013-05-15 20:21 ` Myklebust, Trond
2013-05-15 20:24 ` J. Bruce Fields
2013-05-14 21:42 ` [RFC v0 0/4] sys_copy_range() rough draft Dave Chinner
2013-05-14 22:04 ` Zach Brown
2013-05-15 1:01 ` Dave Chinner
-- strict thread matches above, loose matches on Subject: below --
2013-05-15 17:50 [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point Steve French
2013-05-15 18:54 ` J. Bruce Fields
[not found] ` <20130515185429.GA25994-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2013-05-15 19:39 ` Zach Brown
[not found] ` <CAH2r5ms0P8Hgv1mUpyHA32Er38iiaC1HHC4fhxvz2SBFy6Sucw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-15 19:36 ` Zach Brown
[not found] ` <20130515193600.GA318-fypN+1c5dIyjpB87vu3CluTW4wlIGRCZ@public.gmane.org>
2013-05-15 20:08 ` Steve French
2013-05-15 20:16 ` Chris Mason
[not found] ` <20130515201614.24668.83788-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2013-05-15 20:21 ` Steve French
2013-05-15 20:25 ` Chris Mason
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1368566126-17610-2-git-send-email-zab@redhat.com \
--to=zab@redhat.com \
--cc=Trond.Myklebust@netapp.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=martin.petersen@oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).