linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: Anna Schumaker <Anna.Schumaker@netapp.com>
Cc: linux-nfs@vger.kernel.org, linux-btrfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org,
	zab@zabbo.net, viro@zeniv.linux.org.uk, clm@fb.com,
	mtk.manpages@gmail.com, andros@netapp.com, hch@infradead.org
Subject: Re: [PATCH v1 1/9] vfs: add copy_file_range syscall and vfs helper
Date: Fri, 4 Sep 2015 14:50:55 -0700	[thread overview]
Message-ID: <20150904215055.GD10391@birch.djwong.org> (raw)
In-Reply-To: <1441397823-1203-2-git-send-email-Anna.Schumaker@Netapp.com>

On Fri, Sep 04, 2015 at 04:16:55PM -0400, Anna Schumaker wrote:
> From: Zach Brown <zab@redhat.com>
> 
> Add a copy_file_range() system call for offloading copies between
> regular files.
> 
> This gives an interface to underlying layers of the storage stack which
> can copy without reading and writing all the data.  There are a few
> candidates that should support copy offloading in the nearer term:
> 
> - btrfs shares extent references with its clone ioctl
> - NFS has patches to add a COPY command which copies on the server
> - SCSI has a family of XCOPY commands which copy in the device
> 
> This system call avoids the complexity of also accelerating the creation
> of the destination file by operating on an existing destination file
> descriptor, not a path.
> 
> Currently the high level vfs entry point limits copy offloading to files
> on the same mount and super (and not in the same file).  This can be
> relaxed if we get implementations which can copy between file systems
> safely.
> 
> Signed-off-by: Zach Brown <zab@redhat.com>
> [Anna Schumaker:  Change -EINVAL to -EBADF during file verification]
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
>  fs/read_write.c                   | 129 ++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h                |   3 +
>  include/uapi/asm-generic/unistd.h |   4 +-
>  kernel/sys_ni.c                   |   1 +
>  4 files changed, 136 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 819ef3f..82c4933 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -16,6 +16,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/splice.h>
>  #include <linux/compat.h>
> +#include <linux/mount.h>
>  #include "internal.h"
>  
>  #include <asm/uaccess.h>
> @@ -1327,3 +1328,131 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
>  	return do_sendfile(out_fd, in_fd, NULL, count, 0);
>  }
>  #endif
> +
> +/*
> + * copy_file_range() differs from regular file read and write in that it
> + * specifically allows returning partial success.  When it does so is up to
> + * the copy_file_range method.
> + */
> +ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> +			    struct file *file_out, loff_t pos_out,
> +			    size_t len, int flags)
> +{
> +	struct inode *inode_in;
> +	struct inode *inode_out;
> +	ssize_t ret;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	if (len == 0)
> +		return 0;
> +
> +	/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT  */
> +	ret = rw_verify_area(READ, file_in, &pos_in, len);
> +	if (ret >= 0)
> +		ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (!(file_in->f_mode & FMODE_READ) ||
> +	    !(file_out->f_mode & FMODE_WRITE) ||
> +	    (file_out->f_flags & O_APPEND) ||
> +	    !file_in->f_op || !file_in->f_op->copy_file_range)
> +		return -EBADF;
> +
> +	inode_in = file_inode(file_in);
> +	inode_out = file_inode(file_out);
> +
> +	/* make sure offsets don't wrap and the input is inside i_size */
> +	if (pos_in + len < pos_in || pos_out + len < pos_out ||
> +	    pos_in + len > i_size_read(inode_in))
> +		return -EINVAL;
> +
> +	/* this could be relaxed once a method supports cross-fs copies */
> +	if (inode_in->i_sb != inode_out->i_sb ||
> +	    file_in->f_path.mnt != file_out->f_path.mnt)
> +		return -EXDEV;
> +
> +	/* forbid ranges in the same file */
> +	if (inode_in == inode_out)
> +		return -EINVAL;

btrfs does, and XFS will, support the case of a file sharing blocks with itself.

--D

> +
> +	ret = mnt_want_write_file(file_out);
> +	if (ret)
> +		return ret;
> +
> +	ret = file_in->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
> +					     len, flags);
> +	if (ret > 0) {
> +		fsnotify_access(file_in);
> +		add_rchar(current, ret);
> +		fsnotify_modify(file_out);
> +		add_wchar(current, ret);
> +	}
> +	inc_syscr(current);
> +	inc_syscw(current);
> +
> +	mnt_drop_write_file(file_out);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range);
> +
> +SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
> +		int, fd_out, loff_t __user *, off_out,
> +		size_t, len, unsigned int, flags)
> +{
> +	loff_t pos_in;
> +	loff_t pos_out;
> +	struct fd f_in;
> +	struct fd f_out;
> +	ssize_t ret;
> +
> +	f_in = fdget(fd_in);
> +	f_out = fdget(fd_out);
> +	if (!f_in.file || !f_out.file) {
> +		ret = -EBADF;
> +		goto out;
> +	}
> +
> +	ret = -EFAULT;
> +	if (off_in) {
> +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_in = f_in.file->f_pos;
> +	}
> +
> +	if (off_out) {
> +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_out = f_out.file->f_pos;
> +	}
> +
> +	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
> +				  flags);
> +	if (ret > 0) {
> +		pos_in += ret;
> +		pos_out += ret;
> +
> +		if (off_in) {
> +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_in.file->f_pos = pos_in;
> +		}
> +
> +		if (off_out) {
> +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_out.file->f_pos = pos_out;
> +		}
> +	}
> +out:
> +	fdput(f_in);
> +	fdput(f_out);
> +	return ret;
> +}
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index cc008c3..c97aed8 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1631,6 +1631,7 @@ struct file_operations {
>  #ifndef CONFIG_MMU
>  	unsigned (*mmap_capabilities)(struct file *);
>  #endif
> +	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, int);
>  };
>  
>  struct inode_operations {
> @@ -1684,6 +1685,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
>  		unsigned long, loff_t *);
>  extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
>  		unsigned long, loff_t *);
> +extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> +				   loff_t, size_t, int);
>  
>  struct super_operations {
>     	struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index e016bd9..2b60f0c 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
>  __SYSCALL(__NR_bpf, sys_bpf)
>  #define __NR_execveat 281
>  __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
> +#define __NR_copy_file_range 282
> +__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 282
> +#define __NR_syscalls 283
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 7995ef5..4e01cd9 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -173,6 +173,7 @@ cond_syscall(sys_setfsuid);
>  cond_syscall(sys_setfsgid);
>  cond_syscall(sys_capget);
>  cond_syscall(sys_capset);
> +cond_syscall(sys_copy_file_range);
>  
>  /* arch-specific weak syscall entries */
>  cond_syscall(sys_pciconfig_read);
> -- 
> 2.5.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

  reply	other threads:[~2015-09-04 21:50 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-04 20:16 [PATCH v1 0/8] VFS: In-kernel copy system call Anna Schumaker
2015-09-04 20:16 ` [PATCH v1 2/8] x86: add sys_copy_file_range to syscall tables Anna Schumaker
2015-09-04 20:16 ` [PATCH v1 3/8] btrfs: add .copy_file_range file operation Anna Schumaker
     [not found]   ` <1441397823-1203-4-git-send-email-Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-04 21:02     ` Josef Bacik
2015-09-09  8:39   ` David Sterba
     [not found] ` <1441397823-1203-1-git-send-email-Anna.Schumaker-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-04 20:16   ` [PATCH v1 1/9] vfs: add copy_file_range syscall and vfs helper Anna Schumaker
2015-09-04 21:50     ` Darrick J. Wong [this message]
2015-09-04 20:16   ` [PATCH v1 4/8] btrfs: Add mountpoint checking during btrfs_copy_file_range Anna Schumaker
2015-09-09  9:18     ` David Sterba
2015-09-09 15:56       ` Anna Schumaker
2015-09-04 20:16   ` [PATCH v1 5/8] vfs: Remove copy_file_range mountpoint checks Anna Schumaker
2015-09-04 20:17   ` [PATCH v1 6/8] vfs: Copy should check len after file open mode Anna Schumaker
2015-09-04 20:17   ` [PATCH v1 8/8] vfs: Fall back on splice if no copy function defined Anna Schumaker
2015-09-04 21:08     ` Darrick J. Wong
     [not found]       ` <20150904210813.GA30681-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
2015-09-08 14:57         ` Anna Schumaker
2015-09-04 20:17   ` [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range() Anna Schumaker
2015-09-04 21:38     ` Darrick J. Wong
     [not found]       ` <20150904213856.GC10391-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
2015-09-04 22:31         ` Andreas Dilger
     [not found]           ` <95674806-645C-410C-8A4B-A46F03AFFE20-m1MBpc4rdrD3fQ9qLvQP4Q@public.gmane.org>
2015-09-08 15:05             ` Anna Schumaker
2015-09-08 15:04         ` Anna Schumaker
2015-09-08 20:39           ` Darrick J. Wong
2015-09-09  9:16             ` David Sterba
     [not found]             ` <20150908203918.GB30681-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
2015-09-09 11:38               ` Austin S Hemmelgarn
2015-09-09 17:17                 ` Darrick J. Wong
     [not found]                   ` <20150909171757.GE10391-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
2015-09-09 17:31                     ` Anna Schumaker
     [not found]                       ` <55F06CEC.5040208-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-09 18:12                         ` Darrick J. Wong
2015-09-09 19:25                           ` Anna Schumaker
2015-09-10 15:42                     ` David Sterba
     [not found]                       ` <20150910154251.GM8891-1ReQVI26iDCaZKY3DrU6dA@public.gmane.org>
2015-09-10 16:43                         ` Darrick J. Wong
2015-09-04 22:25   ` [PATCH v1 0/8] VFS: In-kernel copy system call Andreas Dilger
     [not found]     ` <4B41043F-5D85-42D6-8F20-2DCC45930EF4-m1MBpc4rdrD3fQ9qLvQP4Q@public.gmane.org>
2015-09-05  8:33       ` Al Viro
     [not found]         ` <20150905083342.GG22011-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2015-09-08 15:08           ` Anna Schumaker
2015-09-08 20:45             ` Darrick J. Wong
     [not found]               ` <20150908204517.GC30681-PTl6brltDGh4DFYR7WNSRA@public.gmane.org>
2015-09-08 20:49                 ` Anna Schumaker
2015-09-08 15:07     ` Anna Schumaker
2015-09-08 15:21   ` Pádraig Brady
     [not found]     ` <55EEFCEE.5090000-V8g9lnOeT5ydJdNcDFJN0w@public.gmane.org>
2015-09-08 18:23       ` Anna Schumaker
     [not found]         ` <55EF279B.3020101-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-08 19:10           ` Andy Lutomirski
     [not found]             ` <CALCETrXxRB-LXVb+=nkwfj0zEjWuXXTctkSAc9Oec0fgyOQ5Yg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-09-08 20:03               ` Pádraig Brady
     [not found]                 ` <55EF3EFD.3080302-V8g9lnOeT5ydJdNcDFJN0w@public.gmane.org>
2015-09-08 21:29                   ` Darrick J. Wong
2015-09-08 21:45                     ` Andy Lutomirski
2015-09-08 22:39                       ` Darrick J. Wong
2015-09-08 23:08                         ` Andy Lutomirski
2015-09-09  1:19                           ` Darrick J. Wong
2015-09-09 20:09                           ` Chris Mason
     [not found]                             ` <20150909200921.GD9511-DzB2rL6jT1BHfPKRx072akEOCMrvLtNR@public.gmane.org>
2015-09-09 20:26                               ` Trond Myklebust
     [not found]                                 ` <CAHQdGtTSZ1beMMF4DJv=OuA1j2ww0xzJj3+9HMRAf3UpCCLaZg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-09-09 20:38                                   ` Chris Mason
     [not found]                                     ` <20150909203805.GE9511-DzB2rL6jT1BHfPKRx072akEOCMrvLtNR@public.gmane.org>
2015-09-09 20:41                                       ` Anna Schumaker
     [not found]                                         ` <55F0997E.1040105-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-09 21:42                                           ` Darrick J. Wong
2015-09-09 20:37                               ` Andy Lutomirski
     [not found]                                 ` <CALCETrXPcxHWGwqhtkGStVabWDOsRbBy+VzrN+XxVZA_F9O0qA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-09-09 20:42                                   ` Chris Mason
     [not found]                           ` <CALCETrVsWBdqvAgwxHcG=gbcWRNPG2ZziWUg1g=siKDrDu7S2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-09-13 23:25                             ` Dave Chinner
2015-09-14 17:53                               ` Andy Lutomirski
2015-09-09 18:52                         ` Anna Schumaker
     [not found]                           ` <55F07FD8.4020507-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-09 21:16                             ` Darrick J. Wong
2015-09-10 15:10                               ` Anna Schumaker
     [not found]                                 ` <55F19D7F.5090907-ZwjVKphTwtPQT0dZR+AlfA@public.gmane.org>
2015-09-10 15:49                                   ` Austin S Hemmelgarn
2015-09-10 11:40                           ` Austin S Hemmelgarn
2015-09-04 20:17 ` [PATCH v1 7/8] vfs: Copy should use file_out rather than file_in Anna Schumaker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150904215055.GD10391@birch.djwong.org \
    --to=darrick.wong@oracle.com \
    --cc=Anna.Schumaker@netapp.com \
    --cc=andros@netapp.com \
    --cc=clm@fb.com \
    --cc=hch@infradead.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=mtk.manpages@gmail.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=zab@zabbo.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).