[PATCH 1/1] [RFC] 64bit copy_file

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/1] [RFC] 64bit copy_file_range system call
@ 2017-06-14 17:06 Olga Kornievskaia
  2017-06-14 17:24 ` Anna Schumaker
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-14 17:06 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-nfs

This is a proposal to allow 64bit count to copy and return as
a result of a copy_file_range. No attempt was made to share code
with the 32bit function because 32bit interface should probably
get depreciated.

Why use 64bit? Current uses of 32bit are by clone_file_range()
which could use 64bit count and NFS copy_file_range also supports
64bit count value.

Also with this proposal off-the-bet allow the userland to copy
between different mount points.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
 include/linux/fs.h                     |   4 +
 include/linux/syscalls.h               |   3 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/sys_ni.c                        |   1 +
 7 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac21..1f5f1e7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+385	i386	copy_file_range64	sys_copy_file_range64
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..e658bbe 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	64	copy_file_range64	sys_copy_file_range64
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d44..e2665c1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	return ret;
 }
 
+u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
+			    struct file *file_out, loff_t pos_out,
+			    u64 len, unsigned int flags)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	u64 ret;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	ret = rw_verify_area(READ, file_in, &pos_in, len);
+	if (unlikely(ret))
+		return ret;
+
+	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+	if (unlikely(ret))
+		return ret;
+
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE) ||
+	    (file_out->f_flags & O_APPEND))
+		return -EBADF;
+
+	if (len == 0)
+		return 0;
+
+	file_start_write(file_out);
+
+	/*
+	 * Try cloning first, this is supported by more file systems, and
+	 * more efficient if both clone and copy are supported (e.g. NFS).
+	 */
+	if (file_in->f_op->clone_file_range) {
+		ret = file_in->f_op->clone_file_range(file_in, pos_in,
+				file_out, pos_out, len);
+		if (ret == 0) {
+			ret = len;
+			goto done;
+		}
+	}
+
+	if (file_out->f_op->copy_file_range64) {
+		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
+					file_out, pos_out, len, flags);
+		if (ret != -EOPNOTSUPP)
+			goto done;
+	}
+
+	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+done:
+	if (ret > 0) {
+		fsnotify_access(file_in);
+		add_rchar(current, ret);
+		fsnotify_modify(file_out);
+		add_wchar(current, ret);
+	}
+
+	inc_syscr(current);
+	inc_syscw(current);
+
+	file_end_write(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range64);
+
+SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
+		int, fd_out, loff_t __user *, off_out,
+		u64, len, unsigned int, flags)
+{
+	loff_t pos_in;
+	loff_t pos_out;
+	struct fd f_in;
+	struct fd f_out;
+	u64 ret = -EBADF;
+
+	f_in = fdget(fd_in);
+	if (!f_in.file)
+		goto out2;
+
+	f_out = fdget(fd_out);
+	if (!f_out.file)
+		goto out1;
+
+	ret = -EFAULT;
+	if (off_in) {
+		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+			goto out;
+	} else {
+		pos_in = f_in.file->f_pos;
+	}
+
+	if (off_out) {
+		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+			goto out;
+	} else {
+		pos_out = f_out.file->f_pos;
+	}
+
+	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
+				  flags);
+	if (ret > 0) {
+		pos_in += ret;
+		pos_out += ret;
+
+		if (off_in) {
+			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+				ret = -EFAULT;
+		} else {
+			f_in.file->f_pos = pos_in;
+		}
+
+		if (off_out) {
+			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+				ret = -EFAULT;
+		} else {
+			f_out.file->f_pos = pos_out;
+		}
+	}
+
+out:
+	fdput(f_out);
+out1:
+	fdput(f_in);
+out2:
+	return ret;
+}
+
 static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
 {
 	struct inode *inode = file_inode(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9..2727a98 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1690,6 +1690,8 @@ struct file_operations {
 			u64);
 	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
 			u64);
+	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
+			loff_t, u64, unsigned int);
 };
 
 struct inode_operations {
@@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 					 loff_t len, bool *is_same);
 extern int vfs_dedupe_file_range(struct file *file,
 				 struct file_dedupe_range *same);
+extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
+			       loff_t, u64, unsigned int);
 
 struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9..f7ea78e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
+				    int fd_out, loff_t __user *off_out,
+				    u64 len, unsigned int flags);
 
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..e385a76 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -731,9 +731,11 @@
 __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 #define __NR_statx 291
 __SYSCALL(__NR_statx,     sys_statx)
+#define __NR_copy_file_range64 292
+__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
 
 #undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef85..8e0cfd4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
 cond_syscall(sys_capget);
 cond_syscall(sys_capset);
 cond_syscall(sys_copy_file_range);
+cond_syscall(sys_copy_file_range64);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia
@ 2017-06-14 17:24 ` Anna Schumaker
  2017-06-14 18:53   ` Olga Kornievskaia
  2017-06-15  8:15 ` Christoph Hellwig
  2017-06-24 13:23 ` Jeff Layton
  2 siblings, 1 reply; 35+ messages in thread
From: Anna Schumaker @ 2017-06-14 17:24 UTC (permalink / raw)
  To: Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs

Hi Olga,

On 06/14/2017 01:06 PM, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
> 
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
> 
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
>  include/linux/fs.h                     |   4 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/sys_ni.c                        |   1 +
>  7 files changed, 149 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac21..1f5f1e7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
>  382	i386	pkey_free		sys_pkey_free
>  383	i386	statx			sys_statx
>  384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
> +385	i386	copy_file_range64	sys_copy_file_range64
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183..e658bbe 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	64	copy_file_range64	sys_copy_file_range64
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d44..e2665c1 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	return ret;
>  }
>  
> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> +			    struct file *file_out, loff_t pos_out,
> +			    u64 len, unsigned int flags)
> +{
> +	struct inode *inode_in = file_inode(file_in);
> +	struct inode *inode_out = file_inode(file_out);
> +	u64 ret;
> +
> +	if (flags != 0)
> +		return -EINVAL;
> +
> +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> +		return -EISDIR;
> +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> +		return -EINVAL;
> +
> +	ret = rw_verify_area(READ, file_in, &pos_in, len);
> +	if (unlikely(ret))
> +		return ret;

Doesn't rw_verify_area() only accept 32-bit ranges?  You might have to use clone_verify_area() instead (maybe consider renaming that function to something generic?)

Thanks,
Anna

> +
> +	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	if (!(file_in->f_mode & FMODE_READ) ||
> +	    !(file_out->f_mode & FMODE_WRITE) ||
> +	    (file_out->f_flags & O_APPEND))
> +		return -EBADF;
> +
> +	if (len == 0)
> +		return 0;
> +
> +	file_start_write(file_out);
> +
> +	/*
> +	 * Try cloning first, this is supported by more file systems, and
> +	 * more efficient if both clone and copy are supported (e.g. NFS).
> +	 */
> +	if (file_in->f_op->clone_file_range) {
> +		ret = file_in->f_op->clone_file_range(file_in, pos_in,
> +				file_out, pos_out, len);
> +		if (ret == 0) {
> +			ret = len;
> +			goto done;
> +		}
> +	}
> +
> +	if (file_out->f_op->copy_file_range64) {
> +		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> +					file_out, pos_out, len, flags);
> +		if (ret != -EOPNOTSUPP)
> +			goto done;
> +	}
> +
> +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
> +	if (ret > 0) {
> +		fsnotify_access(file_in);
> +		add_rchar(current, ret);
> +		fsnotify_modify(file_out);
> +		add_wchar(current, ret);
> +	}
> +
> +	inc_syscr(current);
> +	inc_syscw(current);
> +
> +	file_end_write(file_out);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range64);
> +
> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> +		int, fd_out, loff_t __user *, off_out,
> +		u64, len, unsigned int, flags)
> +{
> +	loff_t pos_in;
> +	loff_t pos_out;
> +	struct fd f_in;
> +	struct fd f_out;
> +	u64 ret = -EBADF;
> +
> +	f_in = fdget(fd_in);
> +	if (!f_in.file)
> +		goto out2;
> +
> +	f_out = fdget(fd_out);
> +	if (!f_out.file)
> +		goto out1;
> +
> +	ret = -EFAULT;
> +	if (off_in) {
> +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_in = f_in.file->f_pos;
> +	}
> +
> +	if (off_out) {
> +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_out = f_out.file->f_pos;
> +	}
> +
> +	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> +				  flags);
> +	if (ret > 0) {
> +		pos_in += ret;
> +		pos_out += ret;
> +
> +		if (off_in) {
> +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_in.file->f_pos = pos_in;
> +		}
> +
> +		if (off_out) {
> +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_out.file->f_pos = pos_out;
> +		}
> +	}
> +
> +out:
> +	fdput(f_out);
> +out1:
> +	fdput(f_in);
> +out2:
> +	return ret;
> +}
> +
>  static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>  {
>  	struct inode *inode = file_inode(file);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9..2727a98 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1690,6 +1690,8 @@ struct file_operations {
>  			u64);
>  	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>  			u64);
> +	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> +			loff_t, u64, unsigned int);
>  };
>  
>  struct inode_operations {
> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>  					 loff_t len, bool *is_same);
>  extern int vfs_dedupe_file_range(struct file *file,
>  				 struct file_dedupe_range *same);
> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> +			       loff_t, u64, unsigned int);
>  
>  struct super_operations {
>     	struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9..f7ea78e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> +				    int fd_out, loff_t __user *off_out,
> +				    u64 len, unsigned int flags);
>  
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..e385a76 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,11 @@
>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>  #define __NR_statx 291
>  __SYSCALL(__NR_statx,     sys_statx)
> +#define __NR_copy_file_range64 292
> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef85..8e0cfd4 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>  cond_syscall(sys_capget);
>  cond_syscall(sys_capset);
>  cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_copy_file_range64);
>  
>  /* arch-specific weak syscall entries */
>  cond_syscall(sys_pciconfig_read);
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 17:24 ` Anna Schumaker
@ 2017-06-14 18:53   ` Olga Kornievskaia
  2017-06-14 19:32     ` Amir Goldstein
                       ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-14 18:53 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-nfs

This is a proposal to allow 64bit count to copy and return as
a result of a copy_file_range. No attempt was made to share code
with the 32bit function because 32bit interface should probably
get depreciated.

Why use 64bit? Current uses of 32bit are by clone_file_range()
which could use 64bit count and NFS copy_file_range also supports
64bit count value.

Also with this proposal off-the-bet allow the userland to copy
between different mount points.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
 include/linux/fs.h                     |   4 +
 include/linux/syscalls.h               |   3 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/sys_ni.c                        |   1 +
 7 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac21..1f5f1e7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+385	i386	copy_file_range64	sys_copy_file_range64
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..e658bbe 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	64	copy_file_range64	sys_copy_file_range64
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/read_write.c b/fs/read_write.c
index e3ee985..c9c072b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	return ret;
 }
 
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
 {
 	struct inode *inode = file_inode(file);
 
@@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
 	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
 }
 
+u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
+			    struct file *file_out, loff_t pos_out,
+			    u64 len, unsigned int flags)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	u64 ret;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	ret = rw64_verify_area(file_in, pos_in, len, false);
+	if (unlikely(ret))
+		return ret;
+
+	ret = rw64_verify_area(file_out, pos_out, len, true);
+	if (unlikely(ret))
+		return ret;
+
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE) ||
+	    (file_out->f_flags & O_APPEND))
+		return -EBADF;
+
+	if (len == 0)
+		return 0;
+
+	file_start_write(file_out);
+
+	/*
+	 * Try cloning first, this is supported by more file systems, and
+	 * more efficient if both clone and copy are supported (e.g. NFS).
+	 */
+	if (file_in->f_op->clone_file_range) {
+		ret = file_in->f_op->clone_file_range(file_in, pos_in,
+				file_out, pos_out, len);
+		if (ret == 0) {
+			ret = len;
+			goto done;
+		}
+	}
+
+	if (file_out->f_op->copy_file_range64) {
+		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
+					file_out, pos_out, len, flags);
+		if (ret != -EOPNOTSUPP)
+			goto done;
+	}
+
+	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+done:
+	if (ret > 0) {
+		fsnotify_access(file_in);
+		add_rchar(current, ret);
+		fsnotify_modify(file_out);
+		add_wchar(current, ret);
+	}
+
+	inc_syscr(current);
+	inc_syscw(current);
+
+	file_end_write(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range64);
+
+SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
+		int, fd_out, loff_t __user *, off_out,
+		u64, len, unsigned int, flags)
+{
+	loff_t pos_in;
+	loff_t pos_out;
+	struct fd f_in;
+	struct fd f_out;
+	u64 ret = -EBADF;
+
+	f_in = fdget(fd_in);
+	if (!f_in.file)
+		goto out2;
+
+	f_out = fdget(fd_out);
+	if (!f_out.file)
+		goto out1;
+
+	ret = -EFAULT;
+	if (off_in) {
+		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+			goto out;
+	} else {
+		pos_in = f_in.file->f_pos;
+	}
+
+	if (off_out) {
+		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+			goto out;
+	} else {
+		pos_out = f_out.file->f_pos;
+	}
+
+	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
+				  flags);
+	if (ret > 0) {
+		pos_in += ret;
+		pos_out += ret;
+
+		if (off_in) {
+			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+				ret = -EFAULT;
+		} else {
+			f_in.file->f_pos = pos_in;
+		}
+
+		if (off_out) {
+			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+				ret = -EFAULT;
+		} else {
+			f_out.file->f_pos = pos_out;
+		}
+	}
+
+out:
+	fdput(f_out);
+out1:
+	fdput(f_in);
+out2:
+	return ret;
+}
+
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
  * sense, and then flush all dirty data.  Caller must ensure that the
@@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
 	if (!file_in->f_op->clone_file_range)
 		return -EOPNOTSUPP;
 
-	ret = clone_verify_area(file_in, pos_in, len, false);
+	ret = rw64_verify_area(file_in, pos_in, len, false);
 	if (ret)
 		return ret;
 
-	ret = clone_verify_area(file_out, pos_out, len, true);
+	ret = rw64_verify_area(file_out, pos_out, len, true);
 	if (ret)
 		return ret;
 
@@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 	if (!S_ISREG(src->i_mode))
 		goto out;
 
-	ret = clone_verify_area(file, off, len, false);
+	ret = rw64_verify_area(file, off, len, false);
 	if (ret < 0)
 		goto out;
 	ret = 0;
@@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 		}
 
 		dst_off = info->dest_offset;
-		ret = clone_verify_area(dst_file, dst_off, len, true);
+		ret = rw64_verify_area(dst_file, dst_off, len, true);
 		if (ret < 0) {
 			info->status = ret;
 			goto next_file;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9..41fce43 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1690,6 +1690,8 @@ struct file_operations {
 			u64);
 	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
 			u64);
+	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
+			loff_t, u64, unsigned int);
 };
 
 struct inode_operations {
@@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 					 loff_t len, bool *is_same);
 extern int vfs_dedupe_file_range(struct file *file,
 				 struct file_dedupe_range *same);
+extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
+			       loff_t, u64, unsigned int);
 
 struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9..f7ea78e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
+				    int fd_out, loff_t __user *off_out,
+				    u64 len, unsigned int flags);
 
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..e385a76 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -731,9 +731,11 @@
 __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 #define __NR_statx 291
 __SYSCALL(__NR_statx,     sys_statx)
+#define __NR_copy_file_range64 292
+__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
 
 #undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef85..8e0cfd4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
 cond_syscall(sys_capget);
 cond_syscall(sys_capset);
 cond_syscall(sys_copy_file_range);
+cond_syscall(sys_copy_file_range64);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 18:53   ` Olga Kornievskaia
@ 2017-06-14 19:32     ` Amir Goldstein
  2017-06-14 20:08       ` Olga Kornievskaia
  2017-06-14 21:50     ` Randy Dunlap
  2017-06-15  3:59     ` Darrick J. Wong
  2 siblings, 1 reply; 35+ messages in thread
From: Amir Goldstein @ 2017-06-14 19:32 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: linux-fsdevel, Linux NFS Mailing List, Christoph Hellwig

On Wed, Jun 14, 2017 at 9:53 PM, Olga Kornievskaia <kolga@netapp.com> wrote:
>
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
>
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
>
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
...
> +
> +       /*
> +        * Try cloning first, this is supported by more file systems, and
> +        * more efficient if both clone and copy are supported (e.g. NFS).
> +        */
> +       if (file_in->f_op->clone_file_range) {
> +               ret = file_in->f_op->clone_file_range(file_in, pos_in,
> +                               file_out, pos_out, len);
> +               if (ret == 0) {
> +                       ret = len;
> +                       goto done;
> +               }
> +       }
> +
> +       if (file_out->f_op->copy_file_range64) {
> +               ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> +                                       file_out, pos_out, len, flags);
> +               if (ret != -EOPNOTSUPP)
> +                       goto done;
> +       }
> +
> +       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +                       len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +

Olga,

I know this is copied from vfs_copy_file_range() as is, so my comment
applies to the origin function as well, but it is easier to change the new
function without changing userspace behavior, so..

A while back, either Dave Chinner or Christoph suggested that I use
vfs_copy_file_range() from ovl_copy_up_data() instead of calling
vfs_clone_file_range() and falling back to do_splice_direct() loop.
Then Christoph added the clone_file_range attempt into
vfs_copy_file_range(), which was one part of the work.

However, ovl_copy_up_data() uses a killable loop of do_splice_direct()
calls with smaller chunks, so it is not the same as vfs_copy_file_range().

I wonder if it makes sense to use a similar killable loop in the new
function?
I wonder, for reference, if NFS implementation of copy_file_range64()
is killable?

Amir.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 19:32     ` Amir Goldstein
@ 2017-06-14 20:08       ` Olga Kornievskaia
  0 siblings, 0 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-14 20:08 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Olga Kornievskaia, linux-fsdevel, Linux NFS Mailing List,
	Christoph Hellwig

On Wed, Jun 14, 2017 at 3:32 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Wed, Jun 14, 2017 at 9:53 PM, Olga Kornievskaia <kolga@netapp.com> wrote:
>>
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> ---
> ...
>> +
>> +       /*
>> +        * Try cloning first, this is supported by more file systems, and
>> +        * more efficient if both clone and copy are supported (e.g. NFS).
>> +        */
>> +       if (file_in->f_op->clone_file_range) {
>> +               ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> +                               file_out, pos_out, len);
>> +               if (ret == 0) {
>> +                       ret = len;
>> +                       goto done;
>> +               }
>> +       }
>> +
>> +       if (file_out->f_op->copy_file_range64) {
>> +               ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> +                                       file_out, pos_out, len, flags);
>> +               if (ret != -EOPNOTSUPP)
>> +                       goto done;
>> +       }
>> +
>> +       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> +                       len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> +
>
> Olga,

Hi Amir,

Thanks for the comments.

> I know this is copied from vfs_copy_file_range() as is, so my comment
> applies to the origin function as well, but it is easier to change the new
> function without changing userspace behavior, so..
>
> A while back, either Dave Chinner or Christoph suggested that I use
> vfs_copy_file_range() from ovl_copy_up_data() instead of calling
> vfs_clone_file_range() and falling back to do_splice_direct() loop.
> Then Christoph added the clone_file_range attempt into
> vfs_copy_file_range(), which was one part of the work.
>
> However, ovl_copy_up_data() uses a killable loop of do_splice_direct()
> calls with smaller chunks, so it is not the same as vfs_copy_file_range().
>
> I wonder if it makes sense to use a similar killable loop in the new
> function?

I'm currently not proposing to change the semantics of
copy_file_range() call to in some cases return a partial copy and thus
making the application responsible for looping to finish the copy.
Basically copy_file_range() follows the same semantics as read/write
where it is possible that not all requested bytes are completed.
However, if it seems more desirable to shift the responsibility to
completing the copy into the kernel, then I can definitely add the
same looping as does ovl_copy_up_data().

> I wonder, for reference, if NFS implementation of copy_file_range64()
> is killable?

I believe the current implementation of "intra" (same server) copy is
not killable. The outstanding proposal to add asynchronous copy as
well as "inter" (server to server) copy is killable.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 18:53   ` Olga Kornievskaia
  2017-06-14 19:32     ` Amir Goldstein
@ 2017-06-14 21:50     ` Randy Dunlap
  2017-06-15 13:07       ` Olga Kornievskaia
  2017-06-15  3:59     ` Darrick J. Wong
  2 siblings, 1 reply; 35+ messages in thread
From: Randy Dunlap @ 2017-06-14 21:50 UTC (permalink / raw)
  To: Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs

On 06/14/17 11:53, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.

      deprecated.

> 
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
> 
> Also with this proposal off-the-bet allow the userland to copy

what does "off-the-bet" mean?

> between different mount points.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
>  include/linux/fs.h                     |   4 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/sys_ni.c                        |   1 +
>  7 files changed, 154 insertions(+), 6 deletions(-)


-- 
~Randy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 21:50     ` Randy Dunlap
@ 2017-06-15 13:07       ` Olga Kornievskaia
  0 siblings, 0 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-15 13:07 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Wed, Jun 14, 2017 at 5:50 PM, Randy Dunlap <rdunlap@infradead.org> wrote:
> On 06/14/17 11:53, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>
>       deprecated.
>
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>
> what does "off-the-bet" mean?

from the beginning.

>
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> ---
>>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
>>  include/linux/fs.h                     |   4 +
>>  include/linux/syscalls.h               |   3 +
>>  include/uapi/asm-generic/unistd.h      |   4 +-
>>  kernel/sys_ni.c                        |   1 +
>>  7 files changed, 154 insertions(+), 6 deletions(-)
>
>
> --
> ~Randy
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 18:53   ` Olga Kornievskaia
  2017-06-14 19:32     ` Amir Goldstein
  2017-06-14 21:50     ` Randy Dunlap
@ 2017-06-15  3:59     ` Darrick J. Wong
  2017-06-15 14:39       ` Olga Kornievskaia
  2 siblings, 1 reply; 35+ messages in thread
From: Darrick J. Wong @ 2017-06-15  3:59 UTC (permalink / raw)
  To: Olga Kornievskaia; +Cc: linux-fsdevel, linux-nfs

On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
> 
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
> 
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
>  include/linux/fs.h                     |   4 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/sys_ni.c                        |   1 +
>  7 files changed, 154 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac21..1f5f1e7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
>  382	i386	pkey_free		sys_pkey_free
>  383	i386	statx			sys_statx
>  384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
> +385	i386	copy_file_range64	sys_copy_file_range64
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183..e658bbe 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	64	copy_file_range64	sys_copy_file_range64
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/read_write.c b/fs/read_write.c
> index e3ee985..c9c072b 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	return ret;
>  }
>  
> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>  {
>  	struct inode *inode = file_inode(file);
>  
> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>  	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>  }
>  
> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> +			    struct file *file_out, loff_t pos_out,
> +			    u64 len, unsigned int flags)

AFAICT the difference between vfs_copy_file_range() and
vfs_copy_file_range64() is that you've replaced 'size_t len' with
'u64 len', correct?

sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
the userspace interface is concerned, this only makes a difference on
32-bit userlands, right?  So, uh, how many 32-bit user programs need to
c_f_r more than 2GB at a time?

Follow up question -- does c_f_r not work for > 2GB of data when
userspace is 64-bit?

--D

> +{
> +	struct inode *inode_in = file_inode(file_in);
> +	struct inode *inode_out = file_inode(file_out);
> +	u64 ret;
> +
> +	if (flags != 0)
> +		return -EINVAL;
> +
> +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> +		return -EISDIR;
> +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> +		return -EINVAL;
> +
> +	ret = rw64_verify_area(file_in, pos_in, len, false);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	ret = rw64_verify_area(file_out, pos_out, len, true);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	if (!(file_in->f_mode & FMODE_READ) ||
> +	    !(file_out->f_mode & FMODE_WRITE) ||
> +	    (file_out->f_flags & O_APPEND))
> +		return -EBADF;
> +
> +	if (len == 0)
> +		return 0;
> +
> +	file_start_write(file_out);
> +
> +	/*
> +	 * Try cloning first, this is supported by more file systems, and
> +	 * more efficient if both clone and copy are supported (e.g. NFS).
> +	 */
> +	if (file_in->f_op->clone_file_range) {
> +		ret = file_in->f_op->clone_file_range(file_in, pos_in,
> +				file_out, pos_out, len);
> +		if (ret == 0) {
> +			ret = len;
> +			goto done;
> +		}
> +	}
> +
> +	if (file_out->f_op->copy_file_range64) {
> +		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> +					file_out, pos_out, len, flags);
> +		if (ret != -EOPNOTSUPP)
> +			goto done;
> +	}
> +
> +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
> +	if (ret > 0) {
> +		fsnotify_access(file_in);
> +		add_rchar(current, ret);
> +		fsnotify_modify(file_out);
> +		add_wchar(current, ret);
> +	}
> +
> +	inc_syscr(current);
> +	inc_syscw(current);
> +
> +	file_end_write(file_out);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range64);
> +
> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> +		int, fd_out, loff_t __user *, off_out,
> +		u64, len, unsigned int, flags)
> +{
> +	loff_t pos_in;
> +	loff_t pos_out;
> +	struct fd f_in;
> +	struct fd f_out;
> +	u64 ret = -EBADF;
> +
> +	f_in = fdget(fd_in);
> +	if (!f_in.file)
> +		goto out2;
> +
> +	f_out = fdget(fd_out);
> +	if (!f_out.file)
> +		goto out1;
> +
> +	ret = -EFAULT;
> +	if (off_in) {
> +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_in = f_in.file->f_pos;
> +	}
> +
> +	if (off_out) {
> +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_out = f_out.file->f_pos;
> +	}
> +
> +	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> +				  flags);
> +	if (ret > 0) {
> +		pos_in += ret;
> +		pos_out += ret;
> +
> +		if (off_in) {
> +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_in.file->f_pos = pos_in;
> +		}
> +
> +		if (off_out) {
> +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_out.file->f_pos = pos_out;
> +		}
> +	}
> +
> +out:
> +	fdput(f_out);
> +out1:
> +	fdput(f_in);
> +out2:
> +	return ret;
> +}
> +
>  /*
>   * Check that the two inodes are eligible for cloning, the ranges make
>   * sense, and then flush all dirty data.  Caller must ensure that the
> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
>  	if (!file_in->f_op->clone_file_range)
>  		return -EOPNOTSUPP;
>  
> -	ret = clone_verify_area(file_in, pos_in, len, false);
> +	ret = rw64_verify_area(file_in, pos_in, len, false);
>  	if (ret)
>  		return ret;
>  
> -	ret = clone_verify_area(file_out, pos_out, len, true);
> +	ret = rw64_verify_area(file_out, pos_out, len, true);
>  	if (ret)
>  		return ret;
>  
> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>  	if (!S_ISREG(src->i_mode))
>  		goto out;
>  
> -	ret = clone_verify_area(file, off, len, false);
> +	ret = rw64_verify_area(file, off, len, false);
>  	if (ret < 0)
>  		goto out;
>  	ret = 0;
> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>  		}
>  
>  		dst_off = info->dest_offset;
> -		ret = clone_verify_area(dst_file, dst_off, len, true);
> +		ret = rw64_verify_area(dst_file, dst_off, len, true);
>  		if (ret < 0) {
>  			info->status = ret;
>  			goto next_file;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9..41fce43 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1690,6 +1690,8 @@ struct file_operations {
>  			u64);
>  	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>  			u64);
> +	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> +			loff_t, u64, unsigned int);
>  };
>  
>  struct inode_operations {
> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>  					 loff_t len, bool *is_same);
>  extern int vfs_dedupe_file_range(struct file *file,
>  				 struct file_dedupe_range *same);
> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> +			       loff_t, u64, unsigned int);
>  
>  struct super_operations {
>     	struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9..f7ea78e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> +				    int fd_out, loff_t __user *off_out,
> +				    u64 len, unsigned int flags);
>  
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..e385a76 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,11 @@
>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>  #define __NR_statx 291
>  __SYSCALL(__NR_statx,     sys_statx)
> +#define __NR_copy_file_range64 292
> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef85..8e0cfd4 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>  cond_syscall(sys_capget);
>  cond_syscall(sys_capset);
>  cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_copy_file_range64);
>  
>  /* arch-specific weak syscall entries */
>  cond_syscall(sys_pciconfig_read);
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15  3:59     ` Darrick J. Wong
@ 2017-06-15 14:39       ` Olga Kornievskaia
  2017-06-15 20:53         ` Darrick J. Wong
  0 siblings, 1 reply; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-15 14:39 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> ---
>>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
>>  include/linux/fs.h                     |   4 +
>>  include/linux/syscalls.h               |   3 +
>>  include/uapi/asm-generic/unistd.h      |   4 +-
>>  kernel/sys_ni.c                        |   1 +
>>  7 files changed, 154 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 448ac21..1f5f1e7 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -391,3 +391,4 @@
>>  382  i386    pkey_free               sys_pkey_free
>>  383  i386    statx                   sys_statx
>>  384  i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
>> +385  i386    copy_file_range64       sys_copy_file_range64
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 5aef183..e658bbe 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -339,6 +339,7 @@
>>  330  common  pkey_alloc              sys_pkey_alloc
>>  331  common  pkey_free               sys_pkey_free
>>  332  common  statx                   sys_statx
>> +333  64      copy_file_range64       sys_copy_file_range64
>>
>>  #
>>  # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index e3ee985..c9c072b 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>       return ret;
>>  }
>>
>> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>  {
>>       struct inode *inode = file_inode(file);
>>
>> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>       return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>>  }
>>
>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> +                         struct file *file_out, loff_t pos_out,
>> +                         u64 len, unsigned int flags)
>
> AFAICT the difference between vfs_copy_file_range() and
> vfs_copy_file_range64() is that you've replaced 'size_t len' with
> 'u64 len', correct?

That and I'm asking to allow for cross-device copy to be allowed by
the system call.

> sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
> the userspace interface is concerned, this only makes a difference on
> 32-bit userlands, right?  So, uh, how many 32-bit user programs need to
> c_f_r more than 2GB at a time?

Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm
under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit
machine. If I'm wrong, then this patch is indeed not needed.

> Follow up question -- does c_f_r not work for > 2GB of data when
> userspace is 64-bit?

Current copy_file_range would work for any file size, it would just
need to be called in a loop by the application. With the given
proposal, there wouldn't be a need for the application to loop due to
the API size limitation. It might still loop if a partial copy was
done.

>
> --D
>
>> +{
>> +     struct inode *inode_in = file_inode(file_in);
>> +     struct inode *inode_out = file_inode(file_out);
>> +     u64 ret;
>> +
>> +     if (flags != 0)
>> +             return -EINVAL;
>> +
>> +     if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> +             return -EISDIR;
>> +     if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> +             return -EINVAL;
>> +
>> +     ret = rw64_verify_area(file_in, pos_in, len, false);
>> +     if (unlikely(ret))
>> +             return ret;
>> +
>> +     ret = rw64_verify_area(file_out, pos_out, len, true);
>> +     if (unlikely(ret))
>> +             return ret;
>> +
>> +     if (!(file_in->f_mode & FMODE_READ) ||
>> +         !(file_out->f_mode & FMODE_WRITE) ||
>> +         (file_out->f_flags & O_APPEND))
>> +             return -EBADF;
>> +
>> +     if (len == 0)
>> +             return 0;
>> +
>> +     file_start_write(file_out);
>> +
>> +     /*
>> +      * Try cloning first, this is supported by more file systems, and
>> +      * more efficient if both clone and copy are supported (e.g. NFS).
>> +      */
>> +     if (file_in->f_op->clone_file_range) {
>> +             ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> +                             file_out, pos_out, len);
>> +             if (ret == 0) {
>> +                     ret = len;
>> +                     goto done;
>> +             }
>> +     }
>> +
>> +     if (file_out->f_op->copy_file_range64) {
>> +             ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> +                                     file_out, pos_out, len, flags);
>> +             if (ret != -EOPNOTSUPP)
>> +                     goto done;
>> +     }
>> +
>> +     ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> +                     len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> +
>> +done:
>> +     if (ret > 0) {
>> +             fsnotify_access(file_in);
>> +             add_rchar(current, ret);
>> +             fsnotify_modify(file_out);
>> +             add_wchar(current, ret);
>> +     }
>> +
>> +     inc_syscr(current);
>> +     inc_syscw(current);
>> +
>> +     file_end_write(file_out);
>> +
>> +     return ret;
>> +}
>> +EXPORT_SYMBOL(vfs_copy_file_range64);
>> +
>> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> +             int, fd_out, loff_t __user *, off_out,
>> +             u64, len, unsigned int, flags)
>> +{
>> +     loff_t pos_in;
>> +     loff_t pos_out;
>> +     struct fd f_in;
>> +     struct fd f_out;
>> +     u64 ret = -EBADF;
>> +
>> +     f_in = fdget(fd_in);
>> +     if (!f_in.file)
>> +             goto out2;
>> +
>> +     f_out = fdget(fd_out);
>> +     if (!f_out.file)
>> +             goto out1;
>> +
>> +     ret = -EFAULT;
>> +     if (off_in) {
>> +             if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> +                     goto out;
>> +     } else {
>> +             pos_in = f_in.file->f_pos;
>> +     }
>> +
>> +     if (off_out) {
>> +             if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> +                     goto out;
>> +     } else {
>> +             pos_out = f_out.file->f_pos;
>> +     }
>> +
>> +     ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> +                               flags);
>> +     if (ret > 0) {
>> +             pos_in += ret;
>> +             pos_out += ret;
>> +
>> +             if (off_in) {
>> +                     if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> +                             ret = -EFAULT;
>> +             } else {
>> +                     f_in.file->f_pos = pos_in;
>> +             }
>> +
>> +             if (off_out) {
>> +                     if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> +                             ret = -EFAULT;
>> +             } else {
>> +                     f_out.file->f_pos = pos_out;
>> +             }
>> +     }
>> +
>> +out:
>> +     fdput(f_out);
>> +out1:
>> +     fdput(f_in);
>> +out2:
>> +     return ret;
>> +}
>> +
>>  /*
>>   * Check that the two inodes are eligible for cloning, the ranges make
>>   * sense, and then flush all dirty data.  Caller must ensure that the
>> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
>>       if (!file_in->f_op->clone_file_range)
>>               return -EOPNOTSUPP;
>>
>> -     ret = clone_verify_area(file_in, pos_in, len, false);
>> +     ret = rw64_verify_area(file_in, pos_in, len, false);
>>       if (ret)
>>               return ret;
>>
>> -     ret = clone_verify_area(file_out, pos_out, len, true);
>> +     ret = rw64_verify_area(file_out, pos_out, len, true);
>>       if (ret)
>>               return ret;
>>
>> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>>       if (!S_ISREG(src->i_mode))
>>               goto out;
>>
>> -     ret = clone_verify_area(file, off, len, false);
>> +     ret = rw64_verify_area(file, off, len, false);
>>       if (ret < 0)
>>               goto out;
>>       ret = 0;
>> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>>               }
>>
>>               dst_off = info->dest_offset;
>> -             ret = clone_verify_area(dst_file, dst_off, len, true);
>> +             ret = rw64_verify_area(dst_file, dst_off, len, true);
>>               if (ret < 0) {
>>                       info->status = ret;
>>                       goto next_file;
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 803e5a9..41fce43 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1690,6 +1690,8 @@ struct file_operations {
>>                       u64);
>>       ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>>                       u64);
>> +     u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> +                     loff_t, u64, unsigned int);
>>  };
>>
>>  struct inode_operations {
>> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>>                                        loff_t len, bool *is_same);
>>  extern int vfs_dedupe_file_range(struct file *file,
>>                                struct file_dedupe_range *same);
>> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> +                            loff_t, u64, unsigned int);
>>
>>  struct super_operations {
>>       struct inode *(*alloc_inode)(struct super_block *sb);
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index 980c3c9..f7ea78e 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>>  asmlinkage long sys_pkey_free(int pkey);
>>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>>                         unsigned mask, struct statx __user *buffer);
>> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> +                                 int fd_out, loff_t __user *off_out,
>> +                                 u64 len, unsigned int flags);
>>
>>  #endif
>> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> index 061185a..e385a76 100644
>> --- a/include/uapi/asm-generic/unistd.h
>> +++ b/include/uapi/asm-generic/unistd.h
>> @@ -731,9 +731,11 @@
>>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>>  #define __NR_statx 291
>>  __SYSCALL(__NR_statx,     sys_statx)
>> +#define __NR_copy_file_range64 292
>> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>>
>>  #undef __NR_syscalls
>> -#define __NR_syscalls 292
>> +#define __NR_syscalls 293
>>
>>  /*
>>   * All syscalls below here should go away really,
>> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> index 8acef85..8e0cfd4 100644
>> --- a/kernel/sys_ni.c
>> +++ b/kernel/sys_ni.c
>> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>>  cond_syscall(sys_capget);
>>  cond_syscall(sys_capset);
>>  cond_syscall(sys_copy_file_range);
>> +cond_syscall(sys_copy_file_range64);
>>
>>  /* arch-specific weak syscall entries */
>>  cond_syscall(sys_pciconfig_read);
>> --
>> 1.8.3.1
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15 14:39       ` Olga Kornievskaia
@ 2017-06-15 20:53         ` Darrick J. Wong
  2017-06-19 18:34           ` Olga Kornievskaia
  2017-06-20 12:44           ` Steve Dickson
  0 siblings, 2 replies; 35+ messages in thread
From: Darrick J. Wong @ 2017-06-15 20:53 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote:
> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
> > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
> >> This is a proposal to allow 64bit count to copy and return as
> >> a result of a copy_file_range. No attempt was made to share code
> >> with the 32bit function because 32bit interface should probably
> >> get depreciated.
> >>
> >> Why use 64bit? Current uses of 32bit are by clone_file_range()
> >> which could use 64bit count and NFS copy_file_range also supports
> >> 64bit count value.
> >>
> >> Also with this proposal off-the-bet allow the userland to copy
> >> between different mount points.
> >>
> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> >> ---
> >>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
> >>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
> >>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
> >>  include/linux/fs.h                     |   4 +
> >>  include/linux/syscalls.h               |   3 +
> >>  include/uapi/asm-generic/unistd.h      |   4 +-
> >>  kernel/sys_ni.c                        |   1 +
> >>  7 files changed, 154 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> >> index 448ac21..1f5f1e7 100644
> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> >> @@ -391,3 +391,4 @@
> >>  382  i386    pkey_free               sys_pkey_free
> >>  383  i386    statx                   sys_statx
> >>  384  i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
> >> +385  i386    copy_file_range64       sys_copy_file_range64
> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> >> index 5aef183..e658bbe 100644
> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> >> @@ -339,6 +339,7 @@
> >>  330  common  pkey_alloc              sys_pkey_alloc
> >>  331  common  pkey_free               sys_pkey_free
> >>  332  common  statx                   sys_statx
> >> +333  64      copy_file_range64       sys_copy_file_range64
> >>
> >>  #
> >>  # x32-specific system call numbers start at 512 to avoid cache impact
> >> diff --git a/fs/read_write.c b/fs/read_write.c
> >> index e3ee985..c9c072b 100644
> >> --- a/fs/read_write.c
> >> +++ b/fs/read_write.c
> >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> >>       return ret;
> >>  }
> >>
> >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> >>  {
> >>       struct inode *inode = file_inode(file);
> >>
> >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> >>       return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
> >>  }
> >>
> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> >> +                         struct file *file_out, loff_t pos_out,
> >> +                         u64 len, unsigned int flags)
> >
> > AFAICT the difference between vfs_copy_file_range() and
> > vfs_copy_file_range64() is that you've replaced 'size_t len' with
> > 'u64 len', correct?
> 
> That and I'm asking to allow for cross-device copy to be allowed by
> the system call.

I was fine letting each fs decide if it was going to allow cross-device
copy or not, but Christoph argued that since we don't generally allow
operations across mountpoints we shouldn't allow cross-mountpoint
{clone,copy}_file_range either.

Roughly translated, XFS has no cross-device copy abilities, there aren't
any plans to add it, so I don't feel motivated to argue for it... but if
the nfs community is so motivated (it sounds like it) then y'all could
try one more time.

> > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
> > the userspace interface is concerned, this only makes a difference on
> > 32-bit userlands, right?  So, uh, how many 32-bit user programs need to
> > c_f_r more than 2GB at a time?
> 
> Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm
> under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit
> machine. If I'm wrong, then this patch is indeed not needed.

>From include/uapi/asm-generic/posix_types.h:

/*
 * Most 32 bit architectures use "unsigned int" size_t,
 * and all 64 bit architectures use "unsigned long" size_t.
 */
#ifndef __kernel_size_t
#if __BITS_PER_LONG != 64
typedef unsigned int	__kernel_size_t;
typedef int		__kernel_ssize_t;
typedef int		__kernel_ptrdiff_t;
#else
typedef __kernel_ulong_t __kernel_size_t;
typedef __kernel_long_t	__kernel_ssize_t;
typedef __kernel_long_t	__kernel_ptrdiff_t;
#endif
#endif

Note that 'long' is 64-bit, at least on x64.

> > Follow up question -- does c_f_r not work for > 2GB of data when
> > userspace is 64-bit?
> 
> Current copy_file_range would work for any file size, it would just
> need to be called in a loop by the application. With the given
> proposal, there wouldn't be a need for the application to loop due to
> the API size limitation. It might still loop if a partial copy was
> done.

...which is what happens if we fall back to do_splice_direct, since a
single call won't pagecache-copy more than INT_MAX bytes from one file
to another.  Therefore, any serious user of this syscall has to check
the arguments and loop if it wants the whole range done, similar to how
one is (supposed) to call write().

--D

> 
> >
> > --D
> >
> >> +{
> >> +     struct inode *inode_in = file_inode(file_in);
> >> +     struct inode *inode_out = file_inode(file_out);
> >> +     u64 ret;
> >> +
> >> +     if (flags != 0)
> >> +             return -EINVAL;
> >> +
> >> +     if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> >> +             return -EISDIR;
> >> +     if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> >> +             return -EINVAL;
> >> +
> >> +     ret = rw64_verify_area(file_in, pos_in, len, false);
> >> +     if (unlikely(ret))
> >> +             return ret;
> >> +
> >> +     ret = rw64_verify_area(file_out, pos_out, len, true);
> >> +     if (unlikely(ret))
> >> +             return ret;
> >> +
> >> +     if (!(file_in->f_mode & FMODE_READ) ||
> >> +         !(file_out->f_mode & FMODE_WRITE) ||
> >> +         (file_out->f_flags & O_APPEND))
> >> +             return -EBADF;
> >> +
> >> +     if (len == 0)
> >> +             return 0;
> >> +
> >> +     file_start_write(file_out);
> >> +
> >> +     /*
> >> +      * Try cloning first, this is supported by more file systems, and
> >> +      * more efficient if both clone and copy are supported (e.g. NFS).
> >> +      */
> >> +     if (file_in->f_op->clone_file_range) {
> >> +             ret = file_in->f_op->clone_file_range(file_in, pos_in,
> >> +                             file_out, pos_out, len);
> >> +             if (ret == 0) {
> >> +                     ret = len;
> >> +                     goto done;
> >> +             }
> >> +     }
> >> +
> >> +     if (file_out->f_op->copy_file_range64) {
> >> +             ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> >> +                                     file_out, pos_out, len, flags);
> >> +             if (ret != -EOPNOTSUPP)
> >> +                     goto done;
> >> +     }
> >> +
> >> +     ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> >> +                     len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> >> +
> >> +done:
> >> +     if (ret > 0) {
> >> +             fsnotify_access(file_in);
> >> +             add_rchar(current, ret);
> >> +             fsnotify_modify(file_out);
> >> +             add_wchar(current, ret);
> >> +     }
> >> +
> >> +     inc_syscr(current);
> >> +     inc_syscw(current);
> >> +
> >> +     file_end_write(file_out);
> >> +
> >> +     return ret;
> >> +}
> >> +EXPORT_SYMBOL(vfs_copy_file_range64);
> >> +
> >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> >> +             int, fd_out, loff_t __user *, off_out,
> >> +             u64, len, unsigned int, flags)
> >> +{
> >> +     loff_t pos_in;
> >> +     loff_t pos_out;
> >> +     struct fd f_in;
> >> +     struct fd f_out;
> >> +     u64 ret = -EBADF;
> >> +
> >> +     f_in = fdget(fd_in);
> >> +     if (!f_in.file)
> >> +             goto out2;
> >> +
> >> +     f_out = fdget(fd_out);
> >> +     if (!f_out.file)
> >> +             goto out1;
> >> +
> >> +     ret = -EFAULT;
> >> +     if (off_in) {
> >> +             if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> >> +                     goto out;
> >> +     } else {
> >> +             pos_in = f_in.file->f_pos;
> >> +     }
> >> +
> >> +     if (off_out) {
> >> +             if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> >> +                     goto out;
> >> +     } else {
> >> +             pos_out = f_out.file->f_pos;
> >> +     }
> >> +
> >> +     ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> >> +                               flags);
> >> +     if (ret > 0) {
> >> +             pos_in += ret;
> >> +             pos_out += ret;
> >> +
> >> +             if (off_in) {
> >> +                     if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> >> +                             ret = -EFAULT;
> >> +             } else {
> >> +                     f_in.file->f_pos = pos_in;
> >> +             }
> >> +
> >> +             if (off_out) {
> >> +                     if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> >> +                             ret = -EFAULT;
> >> +             } else {
> >> +                     f_out.file->f_pos = pos_out;
> >> +             }
> >> +     }
> >> +
> >> +out:
> >> +     fdput(f_out);
> >> +out1:
> >> +     fdput(f_in);
> >> +out2:
> >> +     return ret;
> >> +}
> >> +
> >>  /*
> >>   * Check that the two inodes are eligible for cloning, the ranges make
> >>   * sense, and then flush all dirty data.  Caller must ensure that the
> >> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
> >>       if (!file_in->f_op->clone_file_range)
> >>               return -EOPNOTSUPP;
> >>
> >> -     ret = clone_verify_area(file_in, pos_in, len, false);
> >> +     ret = rw64_verify_area(file_in, pos_in, len, false);
> >>       if (ret)
> >>               return ret;
> >>
> >> -     ret = clone_verify_area(file_out, pos_out, len, true);
> >> +     ret = rw64_verify_area(file_out, pos_out, len, true);
> >>       if (ret)
> >>               return ret;
> >>
> >> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
> >>       if (!S_ISREG(src->i_mode))
> >>               goto out;
> >>
> >> -     ret = clone_verify_area(file, off, len, false);
> >> +     ret = rw64_verify_area(file, off, len, false);
> >>       if (ret < 0)
> >>               goto out;
> >>       ret = 0;
> >> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
> >>               }
> >>
> >>               dst_off = info->dest_offset;
> >> -             ret = clone_verify_area(dst_file, dst_off, len, true);
> >> +             ret = rw64_verify_area(dst_file, dst_off, len, true);
> >>               if (ret < 0) {
> >>                       info->status = ret;
> >>                       goto next_file;
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 803e5a9..41fce43 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -1690,6 +1690,8 @@ struct file_operations {
> >>                       u64);
> >>       ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> >>                       u64);
> >> +     u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> >> +                     loff_t, u64, unsigned int);
> >>  };
> >>
> >>  struct inode_operations {
> >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> >>                                        loff_t len, bool *is_same);
> >>  extern int vfs_dedupe_file_range(struct file *file,
> >>                                struct file_dedupe_range *same);
> >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> >> +                            loff_t, u64, unsigned int);
> >>
> >>  struct super_operations {
> >>       struct inode *(*alloc_inode)(struct super_block *sb);
> >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> >> index 980c3c9..f7ea78e 100644
> >> --- a/include/linux/syscalls.h
> >> +++ b/include/linux/syscalls.h
> >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> >>  asmlinkage long sys_pkey_free(int pkey);
> >>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> >>                         unsigned mask, struct statx __user *buffer);
> >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> >> +                                 int fd_out, loff_t __user *off_out,
> >> +                                 u64 len, unsigned int flags);
> >>
> >>  #endif
> >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> >> index 061185a..e385a76 100644
> >> --- a/include/uapi/asm-generic/unistd.h
> >> +++ b/include/uapi/asm-generic/unistd.h
> >> @@ -731,9 +731,11 @@
> >>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
> >>  #define __NR_statx 291
> >>  __SYSCALL(__NR_statx,     sys_statx)
> >> +#define __NR_copy_file_range64 292
> >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
> >>
> >>  #undef __NR_syscalls
> >> -#define __NR_syscalls 292
> >> +#define __NR_syscalls 293
> >>
> >>  /*
> >>   * All syscalls below here should go away really,
> >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> >> index 8acef85..8e0cfd4 100644
> >> --- a/kernel/sys_ni.c
> >> +++ b/kernel/sys_ni.c
> >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> >>  cond_syscall(sys_capget);
> >>  cond_syscall(sys_capset);
> >>  cond_syscall(sys_copy_file_range);
> >> +cond_syscall(sys_copy_file_range64);
> >>
> >>  /* arch-specific weak syscall entries */
> >>  cond_syscall(sys_pciconfig_read);
> >> --
> >> 1.8.3.1
> >>
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15 20:53         ` Darrick J. Wong
@ 2017-06-19 18:34           ` Olga Kornievskaia
  2017-06-19 19:39             ` Christoph Hellwig
  2017-06-20 12:44           ` Steve Dickson
  1 sibling, 1 reply; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-19 18:34 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Thu, Jun 15, 2017 at 4:53 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote:
>> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
>> <darrick.wong@oracle.com> wrote:
>> > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
>> >> This is a proposal to allow 64bit count to copy and return as
>> >> a result of a copy_file_range. No attempt was made to share code
>> >> with the 32bit function because 32bit interface should probably
>> >> get depreciated.
>> >>
>> >> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> >> which could use 64bit count and NFS copy_file_range also supports
>> >> 64bit count value.
>> >>
>> >> Also with this proposal off-the-bet allow the userland to copy
>> >> between different mount points.
>> >>
>> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> >> ---
>> >>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>> >>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>> >>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
>> >>  include/linux/fs.h                     |   4 +
>> >>  include/linux/syscalls.h               |   3 +
>> >>  include/uapi/asm-generic/unistd.h      |   4 +-
>> >>  kernel/sys_ni.c                        |   1 +
>> >>  7 files changed, 154 insertions(+), 6 deletions(-)
>> >>
>> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> >> index 448ac21..1f5f1e7 100644
>> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> >> @@ -391,3 +391,4 @@
>> >>  382  i386    pkey_free               sys_pkey_free
>> >>  383  i386    statx                   sys_statx
>> >>  384  i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
>> >> +385  i386    copy_file_range64       sys_copy_file_range64
>> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> >> index 5aef183..e658bbe 100644
>> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> >> @@ -339,6 +339,7 @@
>> >>  330  common  pkey_alloc              sys_pkey_alloc
>> >>  331  common  pkey_free               sys_pkey_free
>> >>  332  common  statx                   sys_statx
>> >> +333  64      copy_file_range64       sys_copy_file_range64
>> >>
>> >>  #
>> >>  # x32-specific system call numbers start at 512 to avoid cache impact
>> >> diff --git a/fs/read_write.c b/fs/read_write.c
>> >> index e3ee985..c9c072b 100644
>> >> --- a/fs/read_write.c
>> >> +++ b/fs/read_write.c
>> >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> >>       return ret;
>> >>  }
>> >>
>> >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> >>  {
>> >>       struct inode *inode = file_inode(file);
>> >>
>> >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> >>       return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>> >>  }
>> >>
>> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> >> +                         struct file *file_out, loff_t pos_out,
>> >> +                         u64 len, unsigned int flags)
>> >
>> > AFAICT the difference between vfs_copy_file_range() and
>> > vfs_copy_file_range64() is that you've replaced 'size_t len' with
>> > 'u64 len', correct?
>>
>> That and I'm asking to allow for cross-device copy to be allowed by
>> the system call.
>
> I was fine letting each fs decide if it was going to allow cross-device
> copy or not, but Christoph argued that since we don't generally allow
> operations across mountpoints we shouldn't allow cross-mountpoint
> {clone,copy}_file_range either.
>
> Roughly translated, XFS has no cross-device copy abilities, there aren't
> any plans to add it, so I don't feel motivated to argue for it... but if
> the nfs community is so motivated (it sounds like it) then y'all could
> try one more time.

Yes NFS community does need one for doing a server-to-server copy
(performance) feature.

There was an argument that none of the VFS operations allow for
cross-device operations. What about splice, doesn't it allow to cross
mount points? So why not allow copy_file_range to do it too?

>> > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
>> > the userspace interface is concerned, this only makes a difference on
>> > 32-bit userlands, right?  So, uh, how many 32-bit user programs need to
>> > c_f_r more than 2GB at a time?
>>
>> Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm
>> under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit
>> machine. If I'm wrong, then this patch is indeed not needed.
>
> From include/uapi/asm-generic/posix_types.h:
>
> /*
>  * Most 32 bit architectures use "unsigned int" size_t,
>  * and all 64 bit architectures use "unsigned long" size_t.
>  */
> #ifndef __kernel_size_t
> #if __BITS_PER_LONG != 64
> typedef unsigned int    __kernel_size_t;
> typedef int             __kernel_ssize_t;
> typedef int             __kernel_ptrdiff_t;
> #else
> typedef __kernel_ulong_t __kernel_size_t;
> typedef __kernel_long_t __kernel_ssize_t;
> typedef __kernel_long_t __kernel_ptrdiff_t;
> #endif
> #endif
>
> Note that 'long' is 64-bit, at least on x64.

Ok, thank you for pointing this out. I see now that it's not important
to change the size_t to allow for larger than 4GB copies.

>
>> > Follow up question -- does c_f_r not work for > 2GB of data when
>> > userspace is 64-bit?
>>
>> Current copy_file_range would work for any file size, it would just
>> need to be called in a loop by the application. With the given
>> proposal, there wouldn't be a need for the application to loop due to
>> the API size limitation. It might still loop if a partial copy was
>> done.
>
> ...which is what happens if we fall back to do_splice_direct, since a
> single call won't pagecache-copy more than INT_MAX bytes from one file
> to another.  Therefore, any serious user of this syscall has to check
> the arguments and loop if it wants the whole range done, similar to how
> one is (supposed) to call write().
>
> --D
>
>>
>> >
>> > --D
>> >
>> >> +{
>> >> +     struct inode *inode_in = file_inode(file_in);
>> >> +     struct inode *inode_out = file_inode(file_out);
>> >> +     u64 ret;
>> >> +
>> >> +     if (flags != 0)
>> >> +             return -EINVAL;
>> >> +
>> >> +     if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> >> +             return -EISDIR;
>> >> +     if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> >> +             return -EINVAL;
>> >> +
>> >> +     ret = rw64_verify_area(file_in, pos_in, len, false);
>> >> +     if (unlikely(ret))
>> >> +             return ret;
>> >> +
>> >> +     ret = rw64_verify_area(file_out, pos_out, len, true);
>> >> +     if (unlikely(ret))
>> >> +             return ret;
>> >> +
>> >> +     if (!(file_in->f_mode & FMODE_READ) ||
>> >> +         !(file_out->f_mode & FMODE_WRITE) ||
>> >> +         (file_out->f_flags & O_APPEND))
>> >> +             return -EBADF;
>> >> +
>> >> +     if (len == 0)
>> >> +             return 0;
>> >> +
>> >> +     file_start_write(file_out);
>> >> +
>> >> +     /*
>> >> +      * Try cloning first, this is supported by more file systems, and
>> >> +      * more efficient if both clone and copy are supported (e.g. NFS).
>> >> +      */
>> >> +     if (file_in->f_op->clone_file_range) {
>> >> +             ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> >> +                             file_out, pos_out, len);
>> >> +             if (ret == 0) {
>> >> +                     ret = len;
>> >> +                     goto done;
>> >> +             }
>> >> +     }
>> >> +
>> >> +     if (file_out->f_op->copy_file_range64) {
>> >> +             ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> >> +                                     file_out, pos_out, len, flags);
>> >> +             if (ret != -EOPNOTSUPP)
>> >> +                     goto done;
>> >> +     }
>> >> +
>> >> +     ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> >> +                     len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> >> +
>> >> +done:
>> >> +     if (ret > 0) {
>> >> +             fsnotify_access(file_in);
>> >> +             add_rchar(current, ret);
>> >> +             fsnotify_modify(file_out);
>> >> +             add_wchar(current, ret);
>> >> +     }
>> >> +
>> >> +     inc_syscr(current);
>> >> +     inc_syscw(current);
>> >> +
>> >> +     file_end_write(file_out);
>> >> +
>> >> +     return ret;
>> >> +}
>> >> +EXPORT_SYMBOL(vfs_copy_file_range64);
>> >> +
>> >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> >> +             int, fd_out, loff_t __user *, off_out,
>> >> +             u64, len, unsigned int, flags)
>> >> +{
>> >> +     loff_t pos_in;
>> >> +     loff_t pos_out;
>> >> +     struct fd f_in;
>> >> +     struct fd f_out;
>> >> +     u64 ret = -EBADF;
>> >> +
>> >> +     f_in = fdget(fd_in);
>> >> +     if (!f_in.file)
>> >> +             goto out2;
>> >> +
>> >> +     f_out = fdget(fd_out);
>> >> +     if (!f_out.file)
>> >> +             goto out1;
>> >> +
>> >> +     ret = -EFAULT;
>> >> +     if (off_in) {
>> >> +             if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> >> +                     goto out;
>> >> +     } else {
>> >> +             pos_in = f_in.file->f_pos;
>> >> +     }
>> >> +
>> >> +     if (off_out) {
>> >> +             if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> >> +                     goto out;
>> >> +     } else {
>> >> +             pos_out = f_out.file->f_pos;
>> >> +     }
>> >> +
>> >> +     ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> >> +                               flags);
>> >> +     if (ret > 0) {
>> >> +             pos_in += ret;
>> >> +             pos_out += ret;
>> >> +
>> >> +             if (off_in) {
>> >> +                     if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> >> +                             ret = -EFAULT;
>> >> +             } else {
>> >> +                     f_in.file->f_pos = pos_in;
>> >> +             }
>> >> +
>> >> +             if (off_out) {
>> >> +                     if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> >> +                             ret = -EFAULT;
>> >> +             } else {
>> >> +                     f_out.file->f_pos = pos_out;
>> >> +             }
>> >> +     }
>> >> +
>> >> +out:
>> >> +     fdput(f_out);
>> >> +out1:
>> >> +     fdput(f_in);
>> >> +out2:
>> >> +     return ret;
>> >> +}
>> >> +
>> >>  /*
>> >>   * Check that the two inodes are eligible for cloning, the ranges make
>> >>   * sense, and then flush all dirty data.  Caller must ensure that the
>> >> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
>> >>       if (!file_in->f_op->clone_file_range)
>> >>               return -EOPNOTSUPP;
>> >>
>> >> -     ret = clone_verify_area(file_in, pos_in, len, false);
>> >> +     ret = rw64_verify_area(file_in, pos_in, len, false);
>> >>       if (ret)
>> >>               return ret;
>> >>
>> >> -     ret = clone_verify_area(file_out, pos_out, len, true);
>> >> +     ret = rw64_verify_area(file_out, pos_out, len, true);
>> >>       if (ret)
>> >>               return ret;
>> >>
>> >> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>> >>       if (!S_ISREG(src->i_mode))
>> >>               goto out;
>> >>
>> >> -     ret = clone_verify_area(file, off, len, false);
>> >> +     ret = rw64_verify_area(file, off, len, false);
>> >>       if (ret < 0)
>> >>               goto out;
>> >>       ret = 0;
>> >> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>> >>               }
>> >>
>> >>               dst_off = info->dest_offset;
>> >> -             ret = clone_verify_area(dst_file, dst_off, len, true);
>> >> +             ret = rw64_verify_area(dst_file, dst_off, len, true);
>> >>               if (ret < 0) {
>> >>                       info->status = ret;
>> >>                       goto next_file;
>> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> >> index 803e5a9..41fce43 100644
>> >> --- a/include/linux/fs.h
>> >> +++ b/include/linux/fs.h
>> >> @@ -1690,6 +1690,8 @@ struct file_operations {
>> >>                       u64);
>> >>       ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> >>                       u64);
>> >> +     u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> >> +                     loff_t, u64, unsigned int);
>> >>  };
>> >>
>> >>  struct inode_operations {
>> >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>> >>                                        loff_t len, bool *is_same);
>> >>  extern int vfs_dedupe_file_range(struct file *file,
>> >>                                struct file_dedupe_range *same);
>> >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> >> +                            loff_t, u64, unsigned int);
>> >>
>> >>  struct super_operations {
>> >>       struct inode *(*alloc_inode)(struct super_block *sb);
>> >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> >> index 980c3c9..f7ea78e 100644
>> >> --- a/include/linux/syscalls.h
>> >> +++ b/include/linux/syscalls.h
>> >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>> >>  asmlinkage long sys_pkey_free(int pkey);
>> >>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>> >>                         unsigned mask, struct statx __user *buffer);
>> >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> >> +                                 int fd_out, loff_t __user *off_out,
>> >> +                                 u64 len, unsigned int flags);
>> >>
>> >>  #endif
>> >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> >> index 061185a..e385a76 100644
>> >> --- a/include/uapi/asm-generic/unistd.h
>> >> +++ b/include/uapi/asm-generic/unistd.h
>> >> @@ -731,9 +731,11 @@
>> >>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>> >>  #define __NR_statx 291
>> >>  __SYSCALL(__NR_statx,     sys_statx)
>> >> +#define __NR_copy_file_range64 292
>> >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>> >>
>> >>  #undef __NR_syscalls
>> >> -#define __NR_syscalls 292
>> >> +#define __NR_syscalls 293
>> >>
>> >>  /*
>> >>   * All syscalls below here should go away really,
>> >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> >> index 8acef85..8e0cfd4 100644
>> >> --- a/kernel/sys_ni.c
>> >> +++ b/kernel/sys_ni.c
>> >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>> >>  cond_syscall(sys_capget);
>> >>  cond_syscall(sys_capset);
>> >>  cond_syscall(sys_copy_file_range);
>> >> +cond_syscall(sys_copy_file_range64);
>> >>
>> >>  /* arch-specific weak syscall entries */
>> >>  cond_syscall(sys_pciconfig_read);
>> >> --
>> >> 1.8.3.1
>> >>
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-19 18:34           ` Olga Kornievskaia
@ 2017-06-19 19:39             ` Christoph Hellwig
  2017-06-19 19:48               ` Olga Kornievskaia
                                 ` (3 more replies)
  0 siblings, 4 replies; 35+ messages in thread
From: Christoph Hellwig @ 2017-06-19 19:39 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org,
	linux-nfs

On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
> Yes NFS community does need one for doing a server-to-server copy
> (performance) feature.

s/NFS community/NetApp/

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-19 19:39             ` Christoph Hellwig
@ 2017-06-19 19:48               ` Olga Kornievskaia
  2017-06-20 12:38               ` Steve Dickson
                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-19 19:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org,
	linux-nfs

On Mon, Jun 19, 2017 at 3:39 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
>
> s/NFS community/NetApp/

NetApp = users. Is there another reason for having a feature in the
kernel if not for the users?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-19 19:39             ` Christoph Hellwig
  2017-06-19 19:48               ` Olga Kornievskaia
@ 2017-06-20 12:38               ` Steve Dickson
  2017-06-20 19:33               ` Chuck Lever
  2017-06-24 11:29               ` Mkrtchyan, Tigran
  3 siblings, 0 replies; 35+ messages in thread
From: Steve Dickson @ 2017-06-20 12:38 UTC (permalink / raw)
  To: Christoph Hellwig, Olga Kornievskaia
  Cc: Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org,
	linux-nfs

Hello,

On 06/19/2017 03:39 PM, Christoph Hellwig wrote:
> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
> 
> s/NFS community/NetApp/
Wrong... Red Hat is also very interested in this work
being that this support is in both Fedora and upcoming 
RHEL releases... 

I must admin I just don't understand what there is *any*
push back on a feature that would dramatically increase
the I/O speed of NFS. That would be good for the
*entire* Linux community... IMHO.

I'm coming to the party late... So is the only
sticking point is to have this vfs call 
write to a different filesystem, correct?
Why? (please point to a email that already
explains it) Does that break some type of
boundary or is it because it was never
needed before??  

Again, looking at the big picture... Allowing
NFS to copy files by only sending meta-data 
is HUGE! It is such a win for the entire 
community so I'm so surprise this community 
is not trying to help the process instead
of push it back.

my two cents,

steved.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-19 19:39             ` Christoph Hellwig
  2017-06-19 19:48               ` Olga Kornievskaia
  2017-06-20 12:38               ` Steve Dickson
@ 2017-06-20 19:33               ` Chuck Lever
  2017-06-20 19:37                 ` Olga Kornievskaia
  2017-06-24 11:29               ` Mkrtchyan, Tigran
  3 siblings, 1 reply; 35+ messages in thread
From: Chuck Lever @ 2017-06-20 19:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Olga Kornievskaia, Darrick J. Wong, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, Linux NFS Mailing List


> On Jun 19, 2017, at 3:39 PM, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
> 
> s/NFS community/NetApp/

I don't have an opinion on the API details, but Oracle is interested
in server-to-server copy, especially if there is an implementation
of it in the Linux NFS server.


--
Chuck Lever

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-20 19:33               ` Chuck Lever
@ 2017-06-20 19:37                 ` Olga Kornievskaia
  0 siblings, 0 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-20 19:37 UTC (permalink / raw)
  To: Chuck Lever
  Cc: Christoph Hellwig, Darrick J. Wong, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, Linux NFS Mailing List

On Tue, Jun 20, 2017 at 3:33 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
>
>> On Jun 19, 2017, at 3:39 PM, Christoph Hellwig <hch@infradead.org> wrote:
>>
>> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>>> Yes NFS community does need one for doing a server-to-server copy
>>> (performance) feature.
>>
>> s/NFS community/NetApp/
>
> I don't have an opinion on the API details, but Oracle is interested
> in server-to-server copy, especially if there is an implementation
> of it in the Linux NFS server.

The proposed patch set is the linux client AND linux server. We need
removal of the VFS cross-device check on both side (client and
destination server) to make the server-to-server copy work. On the
server side, we also call vfs_copy_file_range() that (when check is
removed) ends up reading from the NFS source file and writing to the
local file system.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-19 19:39             ` Christoph Hellwig
                                 ` (2 preceding siblings ...)
  2017-06-20 19:33               ` Chuck Lever
@ 2017-06-24 11:29               ` Mkrtchyan, Tigran
  3 siblings, 0 replies; 35+ messages in thread
From: Mkrtchyan, Tigran @ 2017-06-24 11:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Olga Kornievskaia, Darrick J. Wong, Olga Kornievskaia,
	linux-fsdevel, linux-nfs

----- Original Message -----
> From: "Christoph Hellwig" <hch@infradead.org>
> To: "Olga Kornievskaia" <aglo@umich.edu>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>, "Olga Kornievskaia" <kolga@netapp.com>, linux-fsdevel@vger.kernel.org,
> "linux-nfs" <linux-nfs@vger.kernel.org>
> Sent: Monday, June 19, 2017 9:39:03 PM
> Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
> 
> s/NFS community/NetApp/

As being probably the first person who have raised the need in server-to-server copy 
in NFS (see slide 15 http://nfsv4bat.org/Documents/ConnectAThon/2008/managedstorage.pdf),
let me disagree with you.

Having end-user hat on me I can say, that over a decade we are moving scientific
data around the world with server-side-copy, or 3rd-party copy as we call
it. While we more-and-more use NFS (pNFS) for data access, we always need a
second protocol, like FTP or WebDAV, to perform efficient data movement.

I am not deep in kernel internals, but there must be a very good reason to block
such functionality in NFS. Such functionality will drastically simplify our
workflow. And I am sure, we are not the only users of it. 

Tigran.

> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15 20:53         ` Darrick J. Wong
  2017-06-19 18:34           ` Olga Kornievskaia
@ 2017-06-20 12:44           ` Steve Dickson
  1 sibling, 0 replies; 35+ messages in thread
From: Steve Dickson @ 2017-06-20 12:44 UTC (permalink / raw)
  To: Darrick J. Wong, Olga Kornievskaia
  Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs



On 06/15/2017 04:53 PM, Darrick J. Wong wrote:
> On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote:
>> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
>> <darrick.wong@oracle.com> wrote:
>>> On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
>>>> This is a proposal to allow 64bit count to copy and return as
>>>> a result of a copy_file_range. No attempt was made to share code
>>>> with the 32bit function because 32bit interface should probably
>>>> get depreciated.
>>>>
>>>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>>>> which could use 64bit count and NFS copy_file_range also supports
>>>> 64bit count value.
>>>>
>>>> Also with this proposal off-the-bet allow the userland to copy
>>>> between different mount points.
>>>>
>>>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>>>> ---
>>>>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>>>>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>>>>  fs/read_write.c                        | 146 +++++++++++++++++++++++++++++++--
>>>>  include/linux/fs.h                     |   4 +
>>>>  include/linux/syscalls.h               |   3 +
>>>>  include/uapi/asm-generic/unistd.h      |   4 +-
>>>>  kernel/sys_ni.c                        |   1 +
>>>>  7 files changed, 154 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>>>> index 448ac21..1f5f1e7 100644
>>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>>>> @@ -391,3 +391,4 @@
>>>>  382  i386    pkey_free               sys_pkey_free
>>>>  383  i386    statx                   sys_statx
>>>>  384  i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
>>>> +385  i386    copy_file_range64       sys_copy_file_range64
>>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>>>> index 5aef183..e658bbe 100644
>>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>>> @@ -339,6 +339,7 @@
>>>>  330  common  pkey_alloc              sys_pkey_alloc
>>>>  331  common  pkey_free               sys_pkey_free
>>>>  332  common  statx                   sys_statx
>>>> +333  64      copy_file_range64       sys_copy_file_range64
>>>>
>>>>  #
>>>>  # x32-specific system call numbers start at 512 to avoid cache impact
>>>> diff --git a/fs/read_write.c b/fs/read_write.c
>>>> index e3ee985..c9c072b 100644
>>>> --- a/fs/read_write.c
>>>> +++ b/fs/read_write.c
>>>> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>>>       return ret;
>>>>  }
>>>>
>>>> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>>> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>>>  {
>>>>       struct inode *inode = file_inode(file);
>>>>
>>>> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>>>       return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>>>>  }
>>>>
>>>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>>>> +                         struct file *file_out, loff_t pos_out,
>>>> +                         u64 len, unsigned int flags)
>>>
>>> AFAICT the difference between vfs_copy_file_range() and
>>> vfs_copy_file_range64() is that you've replaced 'size_t len' with
>>> 'u64 len', correct?
>>
>> That and I'm asking to allow for cross-device copy to be allowed by
>> the system call.
> 
> I was fine letting each fs decide if it was going to allow cross-device
> copy or not, but Christoph argued that since we don't generally allow
> operations across mountpoints we shouldn't allow cross-mountpoint
> {clone,copy}_file_range either.
> 
> Roughly translated, XFS has no cross-device copy abilities, there aren't
> any plans to add it, so I don't feel motivated to argue for it... but if
> the nfs community is so motivated (it sounds like it) then y'all could
> try one more time.
> 
So since XFS (or any other local fs) does not have this ability,
NFS does not needed it?? That can't be the reason for the
push back that is denying NFS this ability.. 

steved.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia
  2017-06-14 17:24 ` Anna Schumaker
@ 2017-06-15  8:15 ` Christoph Hellwig
  2017-06-15 13:07   ` Olga Kornievskaia
  2017-06-24 13:23 ` Jeff Layton
  2 siblings, 1 reply; 35+ messages in thread
From: Christoph Hellwig @ 2017-06-15  8:15 UTC (permalink / raw)
  To: Olga Kornievskaia; +Cc: linux-fsdevel, linux-nfs

On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
> 
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.

Please provide a very good use case that actually matters to a large
portion of the Linux users.

> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.

Totally different issue that we've discussed N times.  Trying to sneak
it in behind peoples backs isn't going to improve the situation in any way.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15  8:15 ` Christoph Hellwig
@ 2017-06-15 13:07   ` Olga Kornievskaia
  2017-06-15 18:35     ` Andreas Dilger
  0 siblings, 1 reply; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-15 13:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>
> Please provide a very good use case that actually matters to a large
> portion of the Linux users.

Reason: it will not hurt any user but it will help for
server-to-server NFS copy because for each invocation of of
copy_file_range() it requires that the client sends COPY_NOTIFY, then
COPY and then a copy triggers a establishment of clientid/session
before the reading of the source file can happen. All that could be
saved by have a 64bit value.

>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>
> Totally different issue that we've discussed N times.  Trying to sneak
> it in behind peoples backs isn't going to improve the situation in any way.

I would think that sneaking would have been to remove the check and
not mention it in the commit description. I have explicitly brought
the topic up in the commit description to bring the attention to the
issue as a way to restart the conversation about the issue.

I have several times have asked for the reason to why the check can
not be removed. I'm asking again can you please explain why.

> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15 13:07   ` Olga Kornievskaia
@ 2017-06-15 18:35     ` Andreas Dilger
  2017-06-15 19:11       ` Olga Kornievskaia
  2017-06-16 17:36       ` J. Bruce Fields
  0 siblings, 2 replies; 35+ messages in thread
From: Andreas Dilger @ 2017-06-15 18:35 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Christoph Hellwig, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, linux-nfs

[-- Attachment #1: Type: text/plain, Size: 1733 bytes --]


> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote:
> 
> On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
>>> This is a proposal to allow 64bit count to copy and return as
>>> a result of a copy_file_range. No attempt was made to share code
>>> with the 32bit function because 32bit interface should probably
>>> get depreciated.
>>> 
>>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>>> which could use 64bit count and NFS copy_file_range also supports
>>> 64bit count value.
>> 
>> Please provide a very good use case that actually matters to a large
>> portion of the Linux users.
> 
> Reason: it will not hurt any user but it will help for
> server-to-server NFS copy because for each invocation of of
> copy_file_range() it requires that the client sends COPY_NOTIFY, then
> COPY and then a copy triggers a establishment of clientid/session
> before the reading of the source file can happen. All that could be
> saved by have a 64bit value.

What is the overhead of sending a couple of RPCs vs. copying 4GB of
data on the server?

> Current copy_file_range would work for any file size, it would just
> need to be called in a loop by the application. With the given
> proposal, there wouldn't be a need for the application to loop due to
> the API size limitation. It might still loop if a partial copy was
> done.


Given that there always needs to be a loop to handle partial completion,
adding the 64-bit syscall doesn't really simplify the application at
all, but adds duplicate code to the kernel for little benefit.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15 18:35     ` Andreas Dilger
@ 2017-06-15 19:11       ` Olga Kornievskaia
  2017-06-16 17:36       ` J. Bruce Fields
  1 sibling, 0 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-15 19:11 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Christoph Hellwig, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, linux-nfs

On Thu, Jun 15, 2017 at 2:35 PM, Andreas Dilger <adilger@dilger.ca> wrote:
>
>> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote:
>>
>> On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote:
>>> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
>>>> This is a proposal to allow 64bit count to copy and return as
>>>> a result of a copy_file_range. No attempt was made to share code
>>>> with the 32bit function because 32bit interface should probably
>>>> get depreciated.
>>>>
>>>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>>>> which could use 64bit count and NFS copy_file_range also supports
>>>> 64bit count value.
>>>
>>> Please provide a very good use case that actually matters to a large
>>> portion of the Linux users.
>>
>> Reason: it will not hurt any user but it will help for
>> server-to-server NFS copy because for each invocation of of
>> copy_file_range() it requires that the client sends COPY_NOTIFY, then
>> COPY and then a copy triggers a establishment of clientid/session
>> before the reading of the source file can happen. All that could be
>> saved by have a 64bit value.
>
> What is the overhead of sending a couple of RPCs vs. copying 4GB of
> data on the server?

We get about 11% improvement having an interface that removes the need
for multiple calls. That's testing 8gb copy that normally would have
needed 2 copies.

>> Current copy_file_range would work for any file size, it would just
>> need to be called in a loop by the application. With the given
>> proposal, there wouldn't be a need for the application to loop due to
>> the API size limitation. It might still loop if a partial copy was
>> done.
>
>
> Given that there always needs to be a loop to handle partial completion,
> adding the 64-bit syscall doesn't really simplify the application at
> all, but adds duplicate code to the kernel for little benefit.

If 32bit version would be deprecated then there won't be duplicate code.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-15 18:35     ` Andreas Dilger
  2017-06-15 19:11       ` Olga Kornievskaia
@ 2017-06-16 17:36       ` J. Bruce Fields
  2017-06-17 10:03         ` Christoph Hellwig
  1 sibling, 1 reply; 35+ messages in thread
From: J. Bruce Fields @ 2017-06-16 17:36 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Olga Kornievskaia, Christoph Hellwig, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, linux-nfs

On Thu, Jun 15, 2017 at 12:35:29PM -0600, Andreas Dilger wrote:
> 
> > On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote:
> > Reason: it will not hurt any user but it will help for
> > server-to-server NFS copy because for each invocation of of
> > copy_file_range() it requires that the client sends COPY_NOTIFY, then
> > COPY and then a copy triggers a establishment of clientid/session
> > before the reading of the source file can happen. All that could be
> > saved by have a 64bit value.
> 
> What is the overhead of sending a couple of RPCs vs. copying 4GB of
> data on the server?

It makes a difference in the clone case, doesn't it?  (Though I'd think
best there would be a special "copy to end of file" value.)

--b.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-16 17:36       ` J. Bruce Fields
@ 2017-06-17 10:03         ` Christoph Hellwig
  2017-06-17 12:42           ` Olga Kornievskaia
  0 siblings, 1 reply; 35+ messages in thread
From: Christoph Hellwig @ 2017-06-17 10:03 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Andreas Dilger, Olga Kornievskaia, Christoph Hellwig,
	Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
> It makes a difference in the clone case, doesn't it?  (Though I'd think
> best there would be a special "copy to end of file" value.)

Clones uses a 0 length as "whole" file.  Both for the NFS operation
and the Linux ioctl.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-17 10:03         ` Christoph Hellwig
@ 2017-06-17 12:42           ` Olga Kornievskaia
  2017-06-17 15:09             ` Christoph Hellwig
  0 siblings, 1 reply; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-17 12:42 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: J. Bruce Fields, Andreas Dilger, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, linux-nfs


> On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
>> It makes a difference in the clone case, doesn't it?  (Though I'd think
>> best there would be a special "copy to end of file" value.)
> 
> Clones uses a 0 length as "whole" file.  Both for the NFS operation
> and the Linux ioctl.

Does that actually work? There is  check vfs_copy_file_range()

if (len == 0)
	return 0;

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-17 12:42           ` Olga Kornievskaia
@ 2017-06-17 15:09             ` Christoph Hellwig
  2017-06-17 18:24               ` J. Bruce Fields
  0 siblings, 1 reply; 35+ messages in thread
From: Christoph Hellwig @ 2017-06-17 15:09 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Christoph Hellwig, J. Bruce Fields, Andreas Dilger,
	Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote:
> 
> > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
> > 
> > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
> >> It makes a difference in the clone case, doesn't it?  (Though I'd think
> >> best there would be a special "copy to end of file" value.)
> > 
> > Clones uses a 0 length as "whole" file.  Both for the NFS operation
> > and the Linux ioctl.
> 
> Does that actually work? There is  check vfs_copy_file_range()

Please re-read the above sentence.  I'm talking about NFS clone and
IOC_CLONE_RANGE, not copy_file_range.  copy_file_range had to follow
the NFS semantics that don't have the special 0 case.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-17 15:09             ` Christoph Hellwig
@ 2017-06-17 18:24               ` J. Bruce Fields
  0 siblings, 0 replies; 35+ messages in thread
From: J. Bruce Fields @ 2017-06-17 18:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Olga Kornievskaia, Andreas Dilger, Olga Kornievskaia,
	linux-fsdevel@vger.kernel.org, linux-nfs

On Sat, Jun 17, 2017 at 08:09:38AM -0700, Christoph Hellwig wrote:
> On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote:
> > 
> > > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
> > > 
> > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
> > >> It makes a difference in the clone case, doesn't it?  (Though I'd think
> > >> best there would be a special "copy to end of file" value.)
> > > 
> > > Clones uses a 0 length as "whole" file.  Both for the NFS operation
> > > and the Linux ioctl.
> > 
> > Does that actually work? There is  check vfs_copy_file_range()
> 
> Please re-read the above sentence.  I'm talking about NFS clone and
> IOC_CLONE_RANGE, not copy_file_range.  copy_file_range had to follow
> the NFS semantics that don't have the special 0 case.

Nope, COPY has it:

	https://tools.ietf.org/html/rfc7862#section-15.2.3

	A count of 0 (zero) requests that all bytes from ca_src_offset
	through EOF be copied to the destination.

I think leaving it out of copy_file_range was just an oversight.

--b.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia
  2017-06-14 17:24 ` Anna Schumaker
  2017-06-15  8:15 ` Christoph Hellwig
@ 2017-06-24 13:23 ` Jeff Layton
  2017-06-26 13:21   ` Anna Schumaker
  2 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2017-06-24 13:23 UTC (permalink / raw)
  To: Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs

On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
> 
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
> 
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
>  include/linux/fs.h                     |   4 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/sys_ni.c                        |   1 +
>  7 files changed, 149 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac21..1f5f1e7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
>  382	i386	pkey_free		sys_pkey_free
>  383	i386	statx			sys_statx
>  384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
> +385	i386	copy_file_range64	sys_copy_file_range64
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183..e658bbe 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	64	copy_file_range64	sys_copy_file_range64
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d44..e2665c1 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	return ret;
>  }
>  
> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> +			    struct file *file_out, loff_t pos_out,
> +			    u64 len, unsigned int flags)
> +{
> +	struct inode *inode_in = file_inode(file_in);
> +	struct inode *inode_out = file_inode(file_out);
> +	u64 ret;
> +
> +	if (flags != 0)
> +		return -EINVAL;
> +
> +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> +		return -EISDIR;
> +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> +		return -EINVAL;
> +
> +	ret = rw_verify_area(READ, file_in, &pos_in, len);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	if (!(file_in->f_mode & FMODE_READ) ||
> +	    !(file_out->f_mode & FMODE_WRITE) ||
> +	    (file_out->f_flags & O_APPEND))
> +		return -EBADF;
> +
> +	if (len == 0)
> +		return 0;
> +
> +	file_start_write(file_out);
> +
> +	/*
> +	 * Try cloning first, this is supported by more file systems, and
> +	 * more efficient if both clone and copy are supported (e.g. NFS).
> +	 */
> +	if (file_in->f_op->clone_file_range) {
> +		ret = file_in->f_op->clone_file_range(file_in, pos_in,
> +				file_out, pos_out, len);
> +		if (ret == 0) {
> +			ret = len;
> +			goto done;
> +		}
> +	}
> +
> +	if (file_out->f_op->copy_file_range64) {
> +		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> +					file_out, pos_out, len, flags);
> +		if (ret != -EOPNOTSUPP)
> +			goto done;
> +	}
> +
> +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
> +	if (ret > 0) {
> +		fsnotify_access(file_in);
> +		add_rchar(current, ret);
> +		fsnotify_modify(file_out);
> +		add_wchar(current, ret);
> +	}
> +
> +	inc_syscr(current);
> +	inc_syscw(current);
> +
> +	file_end_write(file_out);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range64);
> +
> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> +		int, fd_out, loff_t __user *, off_out,
> +		u64, len, unsigned int, flags)
> +{
> +	loff_t pos_in;
> +	loff_t pos_out;
> +	struct fd f_in;
> +	struct fd f_out;
> +	u64 ret = -EBADF;
> +
> +	f_in = fdget(fd_in);
> +	if (!f_in.file)
> +		goto out2;
> +
> +	f_out = fdget(fd_out);
> +	if (!f_out.file)
> +		goto out1;
> +
> +	ret = -EFAULT;
> +	if (off_in) {
> +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_in = f_in.file->f_pos;
> +	}
> +
> +	if (off_out) {
> +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> +			goto out;
> +	} else {
> +		pos_out = f_out.file->f_pos;
> +	}
> +
> +	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> +				  flags);
> +	if (ret > 0) {
> +		pos_in += ret;
> +		pos_out += ret;
> +
> +		if (off_in) {
> +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_in.file->f_pos = pos_in;
> +		}
> +
> +		if (off_out) {
> +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> +				ret = -EFAULT;
> +		} else {
> +			f_out.file->f_pos = pos_out;
> +		}
> +	}
> +
> +out:
> +	fdput(f_out);
> +out1:
> +	fdput(f_in);
> +out2:
> +	return ret;
> +}
> +
>  static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>  {
>  	struct inode *inode = file_inode(file);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9..2727a98 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1690,6 +1690,8 @@ struct file_operations {
>  			u64);
>  	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>  			u64);
> +	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> +			loff_t, u64, unsigned int);
>  };
>  
>  struct inode_operations {
> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>  					 loff_t len, bool *is_same);
>  extern int vfs_dedupe_file_range(struct file *file,
>  				 struct file_dedupe_range *same);
> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> +			       loff_t, u64, unsigned int);
>  
>  struct super_operations {
>     	struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9..f7ea78e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> +				    int fd_out, loff_t __user *off_out,
> +				    u64 len, unsigned int flags);
>  
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..e385a76 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,11 @@
>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>  #define __NR_statx 291
>  __SYSCALL(__NR_statx,     sys_statx)
> +#define __NR_copy_file_range64 292
> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef85..8e0cfd4 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>  cond_syscall(sys_capget);
>  cond_syscall(sys_capset);
>  cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_copy_file_range64);
>  
>  /* arch-specific weak syscall entries */
>  cond_syscall(sys_pciconfig_read);

Hi Olga,

We discussed this a bit privately, but I'll note it here too.

Server-to-server copy seems like a nice thing to me as well. There are
several filesystems that can likely make use of that ability.

I don't have a real opinion on the API change either, but you're making
a very subtle change with this patch. Prior to this, the existing
copy_file_range calls could count on the superblocks being the same.
Now, it looks like you can pass them any old file pointer.

The exsiting copy_file_range file ops need to be fixed to vet the struct
files they've been handed if you want to do this.

It would also be nice to see something on copy_file_range and
clone_file_range in Documentation/filesystems/vfs.txt.

Cheers,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-24 13:23 ` Jeff Layton
@ 2017-06-26 13:21   ` Anna Schumaker
  2017-06-26 13:46     ` Jeff Layton
  0 siblings, 1 reply; 35+ messages in thread
From: Anna Schumaker @ 2017-06-26 13:21 UTC (permalink / raw)
  To: Jeff Layton, Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs



On 06/24/2017 09:23 AM, Jeff Layton wrote:
> On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> ---
>>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>>  fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
>>  include/linux/fs.h                     |   4 +
>>  include/linux/syscalls.h               |   3 +
>>  include/uapi/asm-generic/unistd.h      |   4 +-
>>  kernel/sys_ni.c                        |   1 +
>>  7 files changed, 149 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 448ac21..1f5f1e7 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -391,3 +391,4 @@
>>  382	i386	pkey_free		sys_pkey_free
>>  383	i386	statx			sys_statx
>>  384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
>> +385	i386	copy_file_range64	sys_copy_file_range64
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 5aef183..e658bbe 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -339,6 +339,7 @@
>>  330	common	pkey_alloc		sys_pkey_alloc
>>  331	common	pkey_free		sys_pkey_free
>>  332	common	statx			sys_statx
>> +333	64	copy_file_range64	sys_copy_file_range64
>>  
>>  #
>>  # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 47c1d44..e2665c1 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>  	return ret;
>>  }
>>  
>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> +			    struct file *file_out, loff_t pos_out,
>> +			    u64 len, unsigned int flags)
>> +{
>> +	struct inode *inode_in = file_inode(file_in);
>> +	struct inode *inode_out = file_inode(file_out);
>> +	u64 ret;
>> +
>> +	if (flags != 0)
>> +		return -EINVAL;
>> +
>> +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> +		return -EISDIR;
>> +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> +		return -EINVAL;
>> +
>> +	ret = rw_verify_area(READ, file_in, &pos_in, len);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	if (!(file_in->f_mode & FMODE_READ) ||
>> +	    !(file_out->f_mode & FMODE_WRITE) ||
>> +	    (file_out->f_flags & O_APPEND))
>> +		return -EBADF;
>> +
>> +	if (len == 0)
>> +		return 0;
>> +
>> +	file_start_write(file_out);
>> +
>> +	/*
>> +	 * Try cloning first, this is supported by more file systems, and
>> +	 * more efficient if both clone and copy are supported (e.g. NFS).
>> +	 */
>> +	if (file_in->f_op->clone_file_range) {
>> +		ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> +				file_out, pos_out, len);
>> +		if (ret == 0) {
>> +			ret = len;
>> +			goto done;
>> +		}
>> +	}
>> +
>> +	if (file_out->f_op->copy_file_range64) {
>> +		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> +					file_out, pos_out, len, flags);
>> +		if (ret != -EOPNOTSUPP)
>> +			goto done;
>> +	}
>> +
>> +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> +			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> +
>> +done:
>> +	if (ret > 0) {
>> +		fsnotify_access(file_in);
>> +		add_rchar(current, ret);
>> +		fsnotify_modify(file_out);
>> +		add_wchar(current, ret);
>> +	}
>> +
>> +	inc_syscr(current);
>> +	inc_syscw(current);
>> +
>> +	file_end_write(file_out);
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(vfs_copy_file_range64);
>> +
>> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> +		int, fd_out, loff_t __user *, off_out,
>> +		u64, len, unsigned int, flags)
>> +{
>> +	loff_t pos_in;
>> +	loff_t pos_out;
>> +	struct fd f_in;
>> +	struct fd f_out;
>> +	u64 ret = -EBADF;
>> +
>> +	f_in = fdget(fd_in);
>> +	if (!f_in.file)
>> +		goto out2;
>> +
>> +	f_out = fdget(fd_out);
>> +	if (!f_out.file)
>> +		goto out1;
>> +
>> +	ret = -EFAULT;
>> +	if (off_in) {
>> +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> +			goto out;
>> +	} else {
>> +		pos_in = f_in.file->f_pos;
>> +	}
>> +
>> +	if (off_out) {
>> +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> +			goto out;
>> +	} else {
>> +		pos_out = f_out.file->f_pos;
>> +	}
>> +
>> +	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> +				  flags);
>> +	if (ret > 0) {
>> +		pos_in += ret;
>> +		pos_out += ret;
>> +
>> +		if (off_in) {
>> +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> +				ret = -EFAULT;
>> +		} else {
>> +			f_in.file->f_pos = pos_in;
>> +		}
>> +
>> +		if (off_out) {
>> +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> +				ret = -EFAULT;
>> +		} else {
>> +			f_out.file->f_pos = pos_out;
>> +		}
>> +	}
>> +
>> +out:
>> +	fdput(f_out);
>> +out1:
>> +	fdput(f_in);
>> +out2:
>> +	return ret;
>> +}
>> +
>>  static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>  {
>>  	struct inode *inode = file_inode(file);
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 803e5a9..2727a98 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1690,6 +1690,8 @@ struct file_operations {
>>  			u64);
>>  	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>>  			u64);
>> +	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> +			loff_t, u64, unsigned int);
>>  };
>>  
>>  struct inode_operations {
>> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>>  					 loff_t len, bool *is_same);
>>  extern int vfs_dedupe_file_range(struct file *file,
>>  				 struct file_dedupe_range *same);
>> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> +			       loff_t, u64, unsigned int);
>>  
>>  struct super_operations {
>>     	struct inode *(*alloc_inode)(struct super_block *sb);
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index 980c3c9..f7ea78e 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>>  asmlinkage long sys_pkey_free(int pkey);
>>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>>  			  unsigned mask, struct statx __user *buffer);
>> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> +				    int fd_out, loff_t __user *off_out,
>> +				    u64 len, unsigned int flags);
>>  
>>  #endif
>> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> index 061185a..e385a76 100644
>> --- a/include/uapi/asm-generic/unistd.h
>> +++ b/include/uapi/asm-generic/unistd.h
>> @@ -731,9 +731,11 @@
>>  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>>  #define __NR_statx 291
>>  __SYSCALL(__NR_statx,     sys_statx)
>> +#define __NR_copy_file_range64 292
>> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>>  
>>  #undef __NR_syscalls
>> -#define __NR_syscalls 292
>> +#define __NR_syscalls 293
>>  
>>  /*
>>   * All syscalls below here should go away really,
>> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> index 8acef85..8e0cfd4 100644
>> --- a/kernel/sys_ni.c
>> +++ b/kernel/sys_ni.c
>> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>>  cond_syscall(sys_capget);
>>  cond_syscall(sys_capset);
>>  cond_syscall(sys_copy_file_range);
>> +cond_syscall(sys_copy_file_range64);
>>  
>>  /* arch-specific weak syscall entries */
>>  cond_syscall(sys_pciconfig_read);
> 
> Hi Olga,
> 
> We discussed this a bit privately, but I'll note it here too.
> 
> Server-to-server copy seems like a nice thing to me as well. There are
> several filesystems that can likely make use of that ability.
> 
> I don't have a real opinion on the API change either, but you're making
> a very subtle change with this patch. Prior to this, the existing
> copy_file_range calls could count on the superblocks being the same.
> Now, it looks like you can pass them any old file pointer.

What if we add a new file op for xdev copy that gets called when superblocks are different, but filesystem type is the same?  We could keep the current copy_file_range fop to fall back on if superblocks are the same.

Thoughts?
Anna

> 
> The exsiting copy_file_range file ops need to be fixed to vet the struct
> files they've been handed if you want to do this.
> 
> It would also be nice to see something on copy_file_range and
> clone_file_range in Documentation/filesystems/vfs.txt.
> 
> Cheers,
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-26 13:21   ` Anna Schumaker
@ 2017-06-26 13:46     ` Jeff Layton
  2017-06-26 14:46       ` Olga Kornievskaia
       [not found]       ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com>
  0 siblings, 2 replies; 35+ messages in thread
From: Jeff Layton @ 2017-06-26 13:46 UTC (permalink / raw)
  To: Anna Schumaker, Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs

On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote:
> 
> On 06/24/2017 09:23 AM, Jeff Layton wrote:
> > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
> > > This is a proposal to allow 64bit count to copy and return as
> > > a result of a copy_file_range. No attempt was made to share code
> > > with the 32bit function because 32bit interface should probably
> > > get depreciated.
> > > 
> > > Why use 64bit? Current uses of 32bit are by clone_file_range()
> > > which could use 64bit count and NFS copy_file_range also supports
> > > 64bit count value.
> > > 
> > > Also with this proposal off-the-bet allow the userland to copy
> > > between different mount points.
> > > 
> > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> > > ---
> > >  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
> > >  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
> > >  fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
> > >  include/linux/fs.h                     |   4 +
> > >  include/linux/syscalls.h               |   3 +
> > >  include/uapi/asm-generic/unistd.h      |   4 +-
> > >  kernel/sys_ni.c                        |   1 +
> > >  7 files changed, 149 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> > > index 448ac21..1f5f1e7 100644
> > > --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > > @@ -391,3 +391,4 @@
> > >  382	i386	pkey_free		sys_pkey_free
> > >  383	i386	statx			sys_statx
> > >  384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
> > > +385	i386	copy_file_range64	sys_copy_file_range64
> > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > > index 5aef183..e658bbe 100644
> > > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > > @@ -339,6 +339,7 @@
> > >  330	common	pkey_alloc		sys_pkey_alloc
> > >  331	common	pkey_free		sys_pkey_free
> > >  332	common	statx			sys_statx
> > > +333	64	copy_file_range64	sys_copy_file_range64
> > >  
> > >  #
> > >  # x32-specific system call numbers start at 512 to avoid cache impact
> > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > index 47c1d44..e2665c1 100644
> > > --- a/fs/read_write.c
> > > +++ b/fs/read_write.c
> > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> > >  	return ret;
> > >  }
> > >  
> > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> > > +			    struct file *file_out, loff_t pos_out,
> > > +			    u64 len, unsigned int flags)
> > > +{
> > > +	struct inode *inode_in = file_inode(file_in);
> > > +	struct inode *inode_out = file_inode(file_out);
> > > +	u64 ret;
> > > +
> > > +	if (flags != 0)
> > > +		return -EINVAL;
> > > +
> > > +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> > > +		return -EISDIR;
> > > +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> > > +		return -EINVAL;
> > > +
> > > +	ret = rw_verify_area(READ, file_in, &pos_in, len);
> > > +	if (unlikely(ret))
> > > +		return ret;
> > > +
> > > +	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> > > +	if (unlikely(ret))
> > > +		return ret;
> > > +
> > > +	if (!(file_in->f_mode & FMODE_READ) ||
> > > +	    !(file_out->f_mode & FMODE_WRITE) ||
> > > +	    (file_out->f_flags & O_APPEND))
> > > +		return -EBADF;
> > > +
> > > +	if (len == 0)
> > > +		return 0;
> > > +
> > > +	file_start_write(file_out);
> > > +
> > > +	/*
> > > +	 * Try cloning first, this is supported by more file systems, and
> > > +	 * more efficient if both clone and copy are supported (e.g. NFS).
> > > +	 */
> > > +	if (file_in->f_op->clone_file_range) {
> > > +		ret = file_in->f_op->clone_file_range(file_in, pos_in,
> > > +				file_out, pos_out, len);
> > > +		if (ret == 0) {
> > > +			ret = len;
> > > +			goto done;
> > > +		}
> > > +	}
> > > +
> > > +	if (file_out->f_op->copy_file_range64) {
> > > +		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> > > +					file_out, pos_out, len, flags);
> > > +		if (ret != -EOPNOTSUPP)
> > > +			goto done;
> > > +	}
> > > +
> > > +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > +			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > +
> > > +done:
> > > +	if (ret > 0) {
> > > +		fsnotify_access(file_in);
> > > +		add_rchar(current, ret);
> > > +		fsnotify_modify(file_out);
> > > +		add_wchar(current, ret);
> > > +	}
> > > +
> > > +	inc_syscr(current);
> > > +	inc_syscw(current);
> > > +
> > > +	file_end_write(file_out);
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(vfs_copy_file_range64);
> > > +
> > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> > > +		int, fd_out, loff_t __user *, off_out,
> > > +		u64, len, unsigned int, flags)
> > > +{
> > > +	loff_t pos_in;
> > > +	loff_t pos_out;
> > > +	struct fd f_in;
> > > +	struct fd f_out;
> > > +	u64 ret = -EBADF;
> > > +
> > > +	f_in = fdget(fd_in);
> > > +	if (!f_in.file)
> > > +		goto out2;
> > > +
> > > +	f_out = fdget(fd_out);
> > > +	if (!f_out.file)
> > > +		goto out1;
> > > +
> > > +	ret = -EFAULT;
> > > +	if (off_in) {
> > > +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> > > +			goto out;
> > > +	} else {
> > > +		pos_in = f_in.file->f_pos;
> > > +	}
> > > +
> > > +	if (off_out) {
> > > +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> > > +			goto out;
> > > +	} else {
> > > +		pos_out = f_out.file->f_pos;
> > > +	}
> > > +
> > > +	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> > > +				  flags);
> > > +	if (ret > 0) {
> > > +		pos_in += ret;
> > > +		pos_out += ret;
> > > +
> > > +		if (off_in) {
> > > +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> > > +				ret = -EFAULT;
> > > +		} else {
> > > +			f_in.file->f_pos = pos_in;
> > > +		}
> > > +
> > > +		if (off_out) {
> > > +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> > > +				ret = -EFAULT;
> > > +		} else {
> > > +			f_out.file->f_pos = pos_out;
> > > +		}
> > > +	}
> > > +
> > > +out:
> > > +	fdput(f_out);
> > > +out1:
> > > +	fdput(f_in);
> > > +out2:
> > > +	return ret;
> > > +}
> > > +
> > >  static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> > >  {
> > >  	struct inode *inode = file_inode(file);
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index 803e5a9..2727a98 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -1690,6 +1690,8 @@ struct file_operations {
> > >  			u64);
> > >  	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> > >  			u64);
> > > +	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> > > +			loff_t, u64, unsigned int);
> > >  };
> > >  
> > >  struct inode_operations {
> > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > >  					 loff_t len, bool *is_same);
> > >  extern int vfs_dedupe_file_range(struct file *file,
> > >  				 struct file_dedupe_range *same);
> > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> > > +			       loff_t, u64, unsigned int);
> > >  
> > >  struct super_operations {
> > >     	struct inode *(*alloc_inode)(struct super_block *sb);
> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > index 980c3c9..f7ea78e 100644
> > > --- a/include/linux/syscalls.h
> > > +++ b/include/linux/syscalls.h
> > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> > >  asmlinkage long sys_pkey_free(int pkey);
> > >  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> > >  			  unsigned mask, struct statx __user *buffer);
> > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> > > +				    int fd_out, loff_t __user *off_out,
> > > +				    u64 len, unsigned int flags);
> > >  
> > >  #endif
> > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > > index 061185a..e385a76 100644
> > > --- a/include/uapi/asm-generic/unistd.h
> > > +++ b/include/uapi/asm-generic/unistd.h
> > > @@ -731,9 +731,11 @@
> > >  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
> > >  #define __NR_statx 291
> > >  __SYSCALL(__NR_statx,     sys_statx)
> > > +#define __NR_copy_file_range64 292
> > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
> > >  
> > >  #undef __NR_syscalls
> > > -#define __NR_syscalls 292
> > > +#define __NR_syscalls 293
> > >  
> > >  /*
> > >   * All syscalls below here should go away really,
> > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> > > index 8acef85..8e0cfd4 100644
> > > --- a/kernel/sys_ni.c
> > > +++ b/kernel/sys_ni.c
> > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> > >  cond_syscall(sys_capget);
> > >  cond_syscall(sys_capset);
> > >  cond_syscall(sys_copy_file_range);
> > > +cond_syscall(sys_copy_file_range64);
> > >  
> > >  /* arch-specific weak syscall entries */
> > >  cond_syscall(sys_pciconfig_read);
> > 
> > Hi Olga,
> > 
> > We discussed this a bit privately, but I'll note it here too.
> > 
> > Server-to-server copy seems like a nice thing to me as well. There are
> > several filesystems that can likely make use of that ability.
> > 
> > I don't have a real opinion on the API change either, but you're making
> > a very subtle change with this patch. Prior to this, the existing
> > copy_file_range calls could count on the superblocks being the same.
> > Now, it looks like you can pass them any old file pointer.
> 
> What if we add a new file op for xdev copy that gets called when
> superblocks are different, but filesystem type is the same?  We could
> keep the current copy_file_range fop to fall back on if superblocks
> are the same.

I don't think that would really help much. There are only two
filesystems with copy_file_range operations -- nfsv4 and cifs. I don't
think we really need another special case op for this.

What I would probably suggest is to just push the check for the same
superblock down into the copy_file_range ops in one patch (with no
functional change). Then, you could go and lift that restriction in the
NFSv4 operation without regressing cifs. The CIFS folks could eventually
do the same for theirs.

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
  2017-06-26 13:46     ` Jeff Layton
@ 2017-06-26 14:46       ` Olga Kornievskaia
  2017-06-26 14:49         ` [PATCH 1/1] VFS permit cross device vfs_copy_file_range Olga Kornievskaia
       [not found]       ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com>
  1 sibling, 1 reply; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-26 14:46 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Anna Schumaker, Olga Kornievskaia, linux-fsdevel@vger.kernel.org,
	linux-nfs

On Mon, Jun 26, 2017 at 9:46 AM, Jeff Layton <jlayton@redhat.com> wrote:
> On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote:
>>
>> On 06/24/2017 09:23 AM, Jeff Layton wrote:
>> > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
>> > > This is a proposal to allow 64bit count to copy and return as
>> > > a result of a copy_file_range. No attempt was made to share code
>> > > with the 32bit function because 32bit interface should probably
>> > > get depreciated.
>> > >
>> > > Why use 64bit? Current uses of 32bit are by clone_file_range()
>> > > which could use 64bit count and NFS copy_file_range also supports
>> > > 64bit count value.
>> > >
>> > > Also with this proposal off-the-bet allow the userland to copy
>> > > between different mount points.
>> > >
>> > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> > > ---
>> > >  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>> > >  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>> > >  fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
>> > >  include/linux/fs.h                     |   4 +
>> > >  include/linux/syscalls.h               |   3 +
>> > >  include/uapi/asm-generic/unistd.h      |   4 +-
>> > >  kernel/sys_ni.c                        |   1 +
>> > >  7 files changed, 149 insertions(+), 1 deletion(-)
>> > >
>> > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> > > index 448ac21..1f5f1e7 100644
>> > > --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> > > @@ -391,3 +391,4 @@
>> > >  382      i386    pkey_free               sys_pkey_free
>> > >  383      i386    statx                   sys_statx
>> > >  384      i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
>> > > +385      i386    copy_file_range64       sys_copy_file_range64
>> > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> > > index 5aef183..e658bbe 100644
>> > > --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> > > @@ -339,6 +339,7 @@
>> > >  330      common  pkey_alloc              sys_pkey_alloc
>> > >  331      common  pkey_free               sys_pkey_free
>> > >  332      common  statx                   sys_statx
>> > > +333      64      copy_file_range64       sys_copy_file_range64
>> > >
>> > >  #
>> > >  # x32-specific system call numbers start at 512 to avoid cache impact
>> > > diff --git a/fs/read_write.c b/fs/read_write.c
>> > > index 47c1d44..e2665c1 100644
>> > > --- a/fs/read_write.c
>> > > +++ b/fs/read_write.c
>> > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> > >   return ret;
>> > >  }
>> > >
>> > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> > > +                     struct file *file_out, loff_t pos_out,
>> > > +                     u64 len, unsigned int flags)
>> > > +{
>> > > + struct inode *inode_in = file_inode(file_in);
>> > > + struct inode *inode_out = file_inode(file_out);
>> > > + u64 ret;
>> > > +
>> > > + if (flags != 0)
>> > > +         return -EINVAL;
>> > > +
>> > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> > > +         return -EISDIR;
>> > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> > > +         return -EINVAL;
>> > > +
>> > > + ret = rw_verify_area(READ, file_in, &pos_in, len);
>> > > + if (unlikely(ret))
>> > > +         return ret;
>> > > +
>> > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
>> > > + if (unlikely(ret))
>> > > +         return ret;
>> > > +
>> > > + if (!(file_in->f_mode & FMODE_READ) ||
>> > > +     !(file_out->f_mode & FMODE_WRITE) ||
>> > > +     (file_out->f_flags & O_APPEND))
>> > > +         return -EBADF;
>> > > +
>> > > + if (len == 0)
>> > > +         return 0;
>> > > +
>> > > + file_start_write(file_out);
>> > > +
>> > > + /*
>> > > +  * Try cloning first, this is supported by more file systems, and
>> > > +  * more efficient if both clone and copy are supported (e.g. NFS).
>> > > +  */
>> > > + if (file_in->f_op->clone_file_range) {
>> > > +         ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> > > +                         file_out, pos_out, len);
>> > > +         if (ret == 0) {
>> > > +                 ret = len;
>> > > +                 goto done;
>> > > +         }
>> > > + }
>> > > +
>> > > + if (file_out->f_op->copy_file_range64) {
>> > > +         ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> > > +                                 file_out, pos_out, len, flags);
>> > > +         if (ret != -EOPNOTSUPP)
>> > > +                 goto done;
>> > > + }
>> > > +
>> > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> > > +                 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> > > +
>> > > +done:
>> > > + if (ret > 0) {
>> > > +         fsnotify_access(file_in);
>> > > +         add_rchar(current, ret);
>> > > +         fsnotify_modify(file_out);
>> > > +         add_wchar(current, ret);
>> > > + }
>> > > +
>> > > + inc_syscr(current);
>> > > + inc_syscw(current);
>> > > +
>> > > + file_end_write(file_out);
>> > > +
>> > > + return ret;
>> > > +}
>> > > +EXPORT_SYMBOL(vfs_copy_file_range64);
>> > > +
>> > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> > > +         int, fd_out, loff_t __user *, off_out,
>> > > +         u64, len, unsigned int, flags)
>> > > +{
>> > > + loff_t pos_in;
>> > > + loff_t pos_out;
>> > > + struct fd f_in;
>> > > + struct fd f_out;
>> > > + u64 ret = -EBADF;
>> > > +
>> > > + f_in = fdget(fd_in);
>> > > + if (!f_in.file)
>> > > +         goto out2;
>> > > +
>> > > + f_out = fdget(fd_out);
>> > > + if (!f_out.file)
>> > > +         goto out1;
>> > > +
>> > > + ret = -EFAULT;
>> > > + if (off_in) {
>> > > +         if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> > > +                 goto out;
>> > > + } else {
>> > > +         pos_in = f_in.file->f_pos;
>> > > + }
>> > > +
>> > > + if (off_out) {
>> > > +         if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> > > +                 goto out;
>> > > + } else {
>> > > +         pos_out = f_out.file->f_pos;
>> > > + }
>> > > +
>> > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> > > +                           flags);
>> > > + if (ret > 0) {
>> > > +         pos_in += ret;
>> > > +         pos_out += ret;
>> > > +
>> > > +         if (off_in) {
>> > > +                 if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> > > +                         ret = -EFAULT;
>> > > +         } else {
>> > > +                 f_in.file->f_pos = pos_in;
>> > > +         }
>> > > +
>> > > +         if (off_out) {
>> > > +                 if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> > > +                         ret = -EFAULT;
>> > > +         } else {
>> > > +                 f_out.file->f_pos = pos_out;
>> > > +         }
>> > > + }
>> > > +
>> > > +out:
>> > > + fdput(f_out);
>> > > +out1:
>> > > + fdput(f_in);
>> > > +out2:
>> > > + return ret;
>> > > +}
>> > > +
>> > >  static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> > >  {
>> > >   struct inode *inode = file_inode(file);
>> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
>> > > index 803e5a9..2727a98 100644
>> > > --- a/include/linux/fs.h
>> > > +++ b/include/linux/fs.h
>> > > @@ -1690,6 +1690,8 @@ struct file_operations {
>> > >                   u64);
>> > >   ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> > >                   u64);
>> > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> > > +                 loff_t, u64, unsigned int);
>> > >  };
>> > >
>> > >  struct inode_operations {
>> > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>> > >                                    loff_t len, bool *is_same);
>> > >  extern int vfs_dedupe_file_range(struct file *file,
>> > >                            struct file_dedupe_range *same);
>> > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> > > +                        loff_t, u64, unsigned int);
>> > >
>> > >  struct super_operations {
>> > >           struct inode *(*alloc_inode)(struct super_block *sb);
>> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> > > index 980c3c9..f7ea78e 100644
>> > > --- a/include/linux/syscalls.h
>> > > +++ b/include/linux/syscalls.h
>> > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>> > >  asmlinkage long sys_pkey_free(int pkey);
>> > >  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>> > >                     unsigned mask, struct statx __user *buffer);
>> > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> > > +                             int fd_out, loff_t __user *off_out,
>> > > +                             u64 len, unsigned int flags);
>> > >
>> > >  #endif
>> > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> > > index 061185a..e385a76 100644
>> > > --- a/include/uapi/asm-generic/unistd.h
>> > > +++ b/include/uapi/asm-generic/unistd.h
>> > > @@ -731,9 +731,11 @@
>> > >  __SYSCALL(__NR_pkey_free,     sys_pkey_free)
>> > >  #define __NR_statx 291
>> > >  __SYSCALL(__NR_statx,     sys_statx)
>> > > +#define __NR_copy_file_range64 292
>> > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>> > >
>> > >  #undef __NR_syscalls
>> > > -#define __NR_syscalls 292
>> > > +#define __NR_syscalls 293
>> > >
>> > >  /*
>> > >   * All syscalls below here should go away really,
>> > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> > > index 8acef85..8e0cfd4 100644
>> > > --- a/kernel/sys_ni.c
>> > > +++ b/kernel/sys_ni.c
>> > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>> > >  cond_syscall(sys_capget);
>> > >  cond_syscall(sys_capset);
>> > >  cond_syscall(sys_copy_file_range);
>> > > +cond_syscall(sys_copy_file_range64);
>> > >
>> > >  /* arch-specific weak syscall entries */
>> > >  cond_syscall(sys_pciconfig_read);
>> >
>> > Hi Olga,
>> >
>> > We discussed this a bit privately, but I'll note it here too.
>> >
>> > Server-to-server copy seems like a nice thing to me as well. There are
>> > several filesystems that can likely make use of that ability.
>> >
>> > I don't have a real opinion on the API change either, but you're making
>> > a very subtle change with this patch. Prior to this, the existing
>> > copy_file_range calls could count on the superblocks being the same.
>> > Now, it looks like you can pass them any old file pointer.
>>
>> What if we add a new file op for xdev copy that gets called when
>> superblocks are different, but filesystem type is the same?  We could
>> keep the current copy_file_range fop to fall back on if superblocks
>> are the same.
>
> I don't think that would really help much. There are only two
> filesystems with copy_file_range operations -- nfsv4 and cifs. I don't
> think we really need another special case op for this.
>
> What I would probably suggest is to just push the check for the same
> superblock down into the copy_file_range ops in one patch (with no
> functional change). Then, you could go and lift that restriction in the
> NFSv4 operation without regressing cifs. The CIFS folks could eventually
> do the same for theirs.

CIFS folks already check if their target and source destination are
different then they return with EXDEV. So I believe the check that
file system ops that are the same for the fd_in and fd_out would be
sufficient for the current uses?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/1] VFS permit cross device vfs_copy_file_range
  2017-06-26 14:46       ` Olga Kornievskaia
@ 2017-06-26 14:49         ` Olga Kornievskaia
  2017-06-27 11:47           ` Jeff Layton
  0 siblings, 1 reply; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-26 14:49 UTC (permalink / raw)
  To: linux-fsdevel, linux-nfs

Allow copy_file_range to copy between different superblocks but only
of the same file system types.

This feature is needed by NFSv4.2 to perform file copy operation on
the same server or file copy between different NFSv4.2 servers.

If a file system's fileoperations copy_file_range operation prohibits
cross-device copies, fall back to do_splice_direct. This is needed for
the NFS (destination) server side implementation of the file copy.

Currently there is only 1 implementor of the copy_file_range FS
operation -- CIFS. CIFS assumes incoming file descriptors are both
CIFS but it will check if they are coming from different servers.

NFS will allow for copies between different NFS servers.

Adding to the vfs.txt documentation to explicitly warn about allowing
for different superblocks of the same file type to be passed into the
copy_file_range for the future users of copy_file_range method.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
---
 Documentation/filesystems/vfs.txt |  7 +++++++
 fs/read_write.c                   | 12 +++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f42b906..cf22424 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -845,6 +845,8 @@ struct file_operations {
 #ifndef CONFIG_MMU
 	unsigned (*mmap_capabilities)(struct file *);
 #endif
+	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
+				   loff_t, size_t, unsigned int);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -913,6 +915,11 @@ otherwise noted.
 
   fallocate: called by the VFS to preallocate blocks or punch a hole.
 
+  copy_file_range: called by copy_file_range(2) system call. This method
+		   works on two file descriptors that might reside on
+		   different superblocks of the same type of file system.
+		   
+
 Note that the file operations are implemented by the specific
 filesystem in which the inode resides. When opening a device node
 (character or block special) most filesystems will call special
diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d44..effbfb2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	    (file_out->f_flags & O_APPEND))
 		return -EBADF;
 
-	/* this could be relaxed once a method supports cross-fs copies */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
-
 	if (len == 0)
 		return 0;
 
@@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * Try cloning first, this is supported by more file systems, and
 	 * more efficient if both clone and copy are supported (e.g. NFS).
 	 */
-	if (file_in->f_op->clone_file_range) {
+	if (inode_in->i_sb == inode_out->i_sb &&
+			file_in->f_op->clone_file_range) {
 		ret = file_in->f_op->clone_file_range(file_in, pos_in,
 				file_out, pos_out, len);
 		if (ret == 0) {
@@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 		}
 	}
 
-	if (file_out->f_op->copy_file_range) {
+	if (file_out->f_op->copy_file_range &&
+			(file_in->f_op == file_out->f_op)) {
 		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
 						      pos_out, len, flags);
-		if (ret != -EOPNOTSUPP)
+		if (ret != -EOPNOTSUPP && ret != -EXDEV)
 			goto done;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] VFS permit cross device vfs_copy_file_range
  2017-06-26 14:49         ` [PATCH 1/1] VFS permit cross device vfs_copy_file_range Olga Kornievskaia
@ 2017-06-27 11:47           ` Jeff Layton
  2017-06-27 16:12             ` Olga Kornievskaia
  0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2017-06-27 11:47 UTC (permalink / raw)
  To: Olga Kornievskaia, linux-fsdevel, linux-nfs

On Mon, 2017-06-26 at 10:49 -0400, Olga Kornievskaia wrote:
> Allow copy_file_range to copy between different superblocks but only
> of the same file system types.
> 
> This feature is needed by NFSv4.2 to perform file copy operation on
> the same server or file copy between different NFSv4.2 servers.
> 
> If a file system's fileoperations copy_file_range operation prohibits
> cross-device copies, fall back to do_splice_direct. This is needed for
> the NFS (destination) server side implementation of the file copy.
> 
> Currently there is only 1 implementor of the copy_file_range FS
> operation -- CIFS. CIFS assumes incoming file descriptors are both
> CIFS but it will check if they are coming from different servers.
> 
> NFS will allow for copies between different NFS servers.
> 
> Adding to the vfs.txt documentation to explicitly warn about allowing
> for different superblocks of the same file type to be passed into the
> copy_file_range for the future users of copy_file_range method.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
>  Documentation/filesystems/vfs.txt |  7 +++++++
>  fs/read_write.c                   | 12 +++++-------
>  2 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
> index f42b906..cf22424 100644
> --- a/Documentation/filesystems/vfs.txt
> +++ b/Documentation/filesystems/vfs.txt
> @@ -845,6 +845,8 @@ struct file_operations {
>  #ifndef CONFIG_MMU
>  	unsigned (*mmap_capabilities)(struct file *);
>  #endif
> +	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
> +				   loff_t, size_t, unsigned int);
>  };
>  
>  Again, all methods are called without any locks being held, unless
> @@ -913,6 +915,11 @@ otherwise noted.
>  
>    fallocate: called by the VFS to preallocate blocks or punch a hole.
>  
> +  copy_file_range: called by copy_file_range(2) system call. This method
> +		   works on two file descriptors that might reside on
> +		   different superblocks of the same type of file system.
> +		   
> +
>  Note that the file operations are implemented by the specific
>  filesystem in which the inode resides. When opening a device node
>  (character or block special) most filesystems will call special
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d44..effbfb2 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	    (file_out->f_flags & O_APPEND))
>  		return -EBADF;
>  
> -	/* this could be relaxed once a method supports cross-fs copies */
> -	if (inode_in->i_sb != inode_out->i_sb)
> -		return -EXDEV;
> -
>  	if (len == 0)
>  		return 0;
>  
> @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	 * Try cloning first, this is supported by more file systems, and
>  	 * more efficient if both clone and copy are supported (e.g. NFS).
>  	 */
> -	if (file_in->f_op->clone_file_range) {
> +	if (inode_in->i_sb == inode_out->i_sb &&
> +			file_in->f_op->clone_file_range) {

I do wonder if we really ought to check for the same superblock here.

ISTM that it might be possible for some fs' (btrfs?) to clone blocks
across superblocks in some situations?

Oh well -- I guess we can cross that bridge when we come to it.

>  		ret = file_in->f_op->clone_file_range(file_in, pos_in,
>  				file_out, pos_out, len);
>  		if (ret == 0) {
> @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  		}
>  	}
>  
> -	if (file_out->f_op->copy_file_range) {
> +	if (file_out->f_op->copy_file_range &&
> +			(file_in->f_op == file_out->f_op)) {

It's possible that they might not have the same f_op pointer, but have
the same copy_file_range operation (e.g. cifs.ko has a bunch of
different f_op structures for different cases, so they might not be the
same even though they have the same copy_file_range op).

Maybe this should be:

    file_in->f_op->copy_file_range == file_out->f_op->copy_file_range

?

>  		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
>  						      pos_out, len, flags);
> -		if (ret != -EOPNOTSUPP)
> +		if (ret != -EOPNOTSUPP && ret != -EXDEV)
>  			goto done;
>  	}
>  

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] VFS permit cross device vfs_copy_file_range
  2017-06-27 11:47           ` Jeff Layton
@ 2017-06-27 16:12             ` Olga Kornievskaia
  0 siblings, 0 replies; 35+ messages in thread
From: Olga Kornievskaia @ 2017-06-27 16:12 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs

On Tue, Jun 27, 2017 at 7:47 AM, Jeff Layton <jlayton@redhat.com> wrote:
> On Mon, 2017-06-26 at 10:49 -0400, Olga Kornievskaia wrote:
>> Allow copy_file_range to copy between different superblocks but only
>> of the same file system types.
>>
>> This feature is needed by NFSv4.2 to perform file copy operation on
>> the same server or file copy between different NFSv4.2 servers.
>>
>> If a file system's fileoperations copy_file_range operation prohibits
>> cross-device copies, fall back to do_splice_direct. This is needed for
>> the NFS (destination) server side implementation of the file copy.
>>
>> Currently there is only 1 implementor of the copy_file_range FS
>> operation -- CIFS. CIFS assumes incoming file descriptors are both
>> CIFS but it will check if they are coming from different servers.
>>
>> NFS will allow for copies between different NFS servers.
>>
>> Adding to the vfs.txt documentation to explicitly warn about allowing
>> for different superblocks of the same file type to be passed into the
>> copy_file_range for the future users of copy_file_range method.
>>
>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> ---
>>  Documentation/filesystems/vfs.txt |  7 +++++++
>>  fs/read_write.c                   | 12 +++++-------
>>  2 files changed, 12 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
>> index f42b906..cf22424 100644
>> --- a/Documentation/filesystems/vfs.txt
>> +++ b/Documentation/filesystems/vfs.txt
>> @@ -845,6 +845,8 @@ struct file_operations {
>>  #ifndef CONFIG_MMU
>>       unsigned (*mmap_capabilities)(struct file *);
>>  #endif
>> +     ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
>> +                                loff_t, size_t, unsigned int);
>>  };
>>
>>  Again, all methods are called without any locks being held, unless
>> @@ -913,6 +915,11 @@ otherwise noted.
>>
>>    fallocate: called by the VFS to preallocate blocks or punch a hole.
>>
>> +  copy_file_range: called by copy_file_range(2) system call. This method
>> +                works on two file descriptors that might reside on
>> +                different superblocks of the same type of file system.
>> +
>> +
>>  Note that the file operations are implemented by the specific
>>  filesystem in which the inode resides. When opening a device node
>>  (character or block special) most filesystems will call special
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 47c1d44..effbfb2 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>           (file_out->f_flags & O_APPEND))
>>               return -EBADF;
>>
>> -     /* this could be relaxed once a method supports cross-fs copies */
>> -     if (inode_in->i_sb != inode_out->i_sb)
>> -             return -EXDEV;
>> -
>>       if (len == 0)
>>               return 0;
>>
>> @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>        * Try cloning first, this is supported by more file systems, and
>>        * more efficient if both clone and copy are supported (e.g. NFS).
>>        */
>> -     if (file_in->f_op->clone_file_range) {
>> +     if (inode_in->i_sb == inode_out->i_sb &&
>> +                     file_in->f_op->clone_file_range) {
>
> I do wonder if we really ought to check for the same superblock here.
>
> ISTM that it might be possible for some fs' (btrfs?) to clone blocks
> across superblocks in some situations?
>
> Oh well -- I guess we can cross that bridge when we come to it.

I was trying to keep with the expectations of the underlying function
calls. Since I'm not proposing to change clone_file_range, I added
that check.

>>               ret = file_in->f_op->clone_file_range(file_in, pos_in,
>>                               file_out, pos_out, len);
>>               if (ret == 0) {
>> @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>               }
>>       }
>>
>> -     if (file_out->f_op->copy_file_range) {
>> +     if (file_out->f_op->copy_file_range &&
>> +                     (file_in->f_op == file_out->f_op)) {
>
> It's possible that they might not have the same f_op pointer, but have
> the same copy_file_range operation (e.g. cifs.ko has a bunch of
> different f_op structures for different cases, so they might not be the
> same even though they have the same copy_file_range op).
>
> Maybe this should be:
>
>     file_in->f_op->copy_file_range == file_out->f_op->copy_file_range
>
> ?

Yeah, I was debating between those two options and decided to go with
f_op pointer but I agree the other way is the way to go. I'll wait to
hear if there are any other comments and if not make this change and
send the next version.

thank you for the comments!

>>               ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
>>                                                     pos_out, len, flags);
>> -             if (ret != -EOPNOTSUPP)
>> +             if (ret != -EOPNOTSUPP && ret != -EXDEV)
>>                       goto done;
>>       }
>>
>
> --
> Jeff Layton <jlayton@redhat.com>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

[parent not found: <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com>]

* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call
       [not found]       ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com>
@ 2017-06-26 14:51         ` Jeff Layton
  0 siblings, 0 replies; 35+ messages in thread
From: Jeff Layton @ 2017-06-26 14:51 UTC (permalink / raw)
  To: Olga Kornievskaia; +Cc: Anna Schumaker, linux-fsdevel, linux-nfs

On Mon, 2017-06-26 at 10:40 -0400, Olga Kornievskaia wrote:
> > On Jun 26, 2017, at 9:46 AM, Jeff Layton <jlayton@redhat.com> wrote:
> > 
> > On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote:
> > > On 06/24/2017 09:23 AM, Jeff Layton wrote:
> > > > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
> > > > > This is a proposal to allow 64bit count to copy and return as
> > > > > a result of a copy_file_range. No attempt was made to share code
> > > > > with the 32bit function because 32bit interface should probably
> > > > > get depreciated.
> > > > > 
> > > > > Why use 64bit? Current uses of 32bit are by clone_file_range()
> > > > > which could use 64bit count and NFS copy_file_range also supports
> > > > > 64bit count value.
> > > > > 
> > > > > Also with this proposal off-the-bet allow the userland to copy
> > > > > between different mount points.
> > > > > 
> > > > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> > > > > ---
> > > > > arch/x86/entry/syscalls/syscall_32.tbl |   1 +
> > > > > arch/x86/entry/syscalls/syscall_64.tbl |   1 +
> > > > > fs/read_write.c                        | 136 +++++++++++++++++++++++++++++++++
> > > > > include/linux/fs.h                     |   4 +
> > > > > include/linux/syscalls.h               |   3 +
> > > > > include/uapi/asm-generic/unistd.h      |   4 +-
> > > > > kernel/sys_ni.c                        |   1 +
> > > > > 7 files changed, 149 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> > > > > index 448ac21..1f5f1e7 100644
> > > > > --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > > > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > > > > @@ -391,3 +391,4 @@
> > > > > 382	i386	pkey_free		sys_pkey_free
> > > > > 383	i386	statx			sys_statx
> > > > > 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
> > > > > +385	i386	copy_file_range64	sys_copy_file_range64
> > > > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > > > > index 5aef183..e658bbe 100644
> > > > > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > > > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > > > > @@ -339,6 +339,7 @@
> > > > > 330	common	pkey_alloc		sys_pkey_alloc
> > > > > 331	common	pkey_free		sys_pkey_free
> > > > > 332	common	statx			sys_statx
> > > > > +333	64	copy_file_range64	sys_copy_file_range64
> > > > > 
> > > > > #
> > > > > # x32-specific system call numbers start at 512 to avoid cache impact
> > > > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > index 47c1d44..e2665c1 100644
> > > > > --- a/fs/read_write.c
> > > > > +++ b/fs/read_write.c
> > > > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > > 	return ret;
> > > > > }
> > > > > 
> > > > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> > > > > +			    struct file *file_out, loff_t pos_out,
> > > > > +			    u64 len, unsigned int flags)
> > > > > +{
> > > > > +	struct inode *inode_in = file_inode(file_in);
> > > > > +	struct inode *inode_out = file_inode(file_out);
> > > > > +	u64 ret;
> > > > > +
> > > > > +	if (flags != 0)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> > > > > +		return -EISDIR;
> > > > > +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	ret = rw_verify_area(READ, file_in, &pos_in, len);
> > > > > +	if (unlikely(ret))
> > > > > +		return ret;
> > > > > +
> > > > > +	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> > > > > +	if (unlikely(ret))
> > > > > +		return ret;
> > > > > +
> > > > > +	if (!(file_in->f_mode & FMODE_READ) ||
> > > > > +	    !(file_out->f_mode & FMODE_WRITE) ||
> > > > > +	    (file_out->f_flags & O_APPEND))
> > > > > +		return -EBADF;
> > > > > +
> > > > > +	if (len == 0)
> > > > > +		return 0;
> > > > > +
> > > > > +	file_start_write(file_out);
> > > > > +
> > > > > +	/*
> > > > > +	 * Try cloning first, this is supported by more file systems, and
> > > > > +	 * more efficient if both clone and copy are supported (e.g. NFS).
> > > > > +	 */
> > > > > +	if (file_in->f_op->clone_file_range) {
> > > > > +		ret = file_in->f_op->clone_file_range(file_in, pos_in,
> > > > > +				file_out, pos_out, len);
> > > > > +		if (ret == 0) {
> > > > > +			ret = len;
> > > > > +			goto done;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	if (file_out->f_op->copy_file_range64) {
> > > > > +		ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> > > > > +					file_out, pos_out, len, flags);
> > > > > +		if (ret != -EOPNOTSUPP)
> > > > > +			goto done;
> > > > > +	}
> > > > > +
> > > > > +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > > +			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > > +
> > > > > +done:
> > > > > +	if (ret > 0) {
> > > > > +		fsnotify_access(file_in);
> > > > > +		add_rchar(current, ret);
> > > > > +		fsnotify_modify(file_out);
> > > > > +		add_wchar(current, ret);
> > > > > +	}
> > > > > +
> > > > > +	inc_syscr(current);
> > > > > +	inc_syscw(current);
> > > > > +
> > > > > +	file_end_write(file_out);
> > > > > +
> > > > > +	return ret;
> > > > > +}
> > > > > +EXPORT_SYMBOL(vfs_copy_file_range64);
> > > > > +
> > > > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> > > > > +		int, fd_out, loff_t __user *, off_out,
> > > > > +		u64, len, unsigned int, flags)
> > > > > +{
> > > > > +	loff_t pos_in;
> > > > > +	loff_t pos_out;
> > > > > +	struct fd f_in;
> > > > > +	struct fd f_out;
> > > > > +	u64 ret = -EBADF;
> > > > > +
> > > > > +	f_in = fdget(fd_in);
> > > > > +	if (!f_in.file)
> > > > > +		goto out2;
> > > > > +
> > > > > +	f_out = fdget(fd_out);
> > > > > +	if (!f_out.file)
> > > > > +		goto out1;
> > > > > +
> > > > > +	ret = -EFAULT;
> > > > > +	if (off_in) {
> > > > > +		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> > > > > +			goto out;
> > > > > +	} else {
> > > > > +		pos_in = f_in.file->f_pos;
> > > > > +	}
> > > > > +
> > > > > +	if (off_out) {
> > > > > +		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> > > > > +			goto out;
> > > > > +	} else {
> > > > > +		pos_out = f_out.file->f_pos;
> > > > > +	}
> > > > > +
> > > > > +	ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> > > > > +				  flags);
> > > > > +	if (ret > 0) {
> > > > > +		pos_in += ret;
> > > > > +		pos_out += ret;
> > > > > +
> > > > > +		if (off_in) {
> > > > > +			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> > > > > +				ret = -EFAULT;
> > > > > +		} else {
> > > > > +			f_in.file->f_pos = pos_in;
> > > > > +		}
> > > > > +
> > > > > +		if (off_out) {
> > > > > +			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> > > > > +				ret = -EFAULT;
> > > > > +		} else {
> > > > > +			f_out.file->f_pos = pos_out;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +out:
> > > > > +	fdput(f_out);
> > > > > +out1:
> > > > > +	fdput(f_in);
> > > > > +out2:
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> > > > > {
> > > > > 	struct inode *inode = file_inode(file);
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index 803e5a9..2727a98 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -1690,6 +1690,8 @@ struct file_operations {
> > > > > 			u64);
> > > > > 	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> > > > > 			u64);
> > > > > +	u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> > > > > +			loff_t, u64, unsigned int);
> > > > > };
> > > > > 
> > > > > struct inode_operations {
> > > > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > > > 					 loff_t len, bool *is_same);
> > > > > extern int vfs_dedupe_file_range(struct file *file,
> > > > > 				 struct file_dedupe_range *same);
> > > > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> > > > > +			       loff_t, u64, unsigned int);
> > > > > 
> > > > > struct super_operations {
> > > > >    	struct inode *(*alloc_inode)(struct super_block *sb);
> > > > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > > > index 980c3c9..f7ea78e 100644
> > > > > --- a/include/linux/syscalls.h
> > > > > +++ b/include/linux/syscalls.h
> > > > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> > > > > asmlinkage long sys_pkey_free(int pkey);
> > > > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> > > > > 			  unsigned mask, struct statx __user *buffer);
> > > > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> > > > > +				    int fd_out, loff_t __user *off_out,
> > > > > +				    u64 len, unsigned int flags);
> > > > > 
> > > > > #endif
> > > > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > > > > index 061185a..e385a76 100644
> > > > > --- a/include/uapi/asm-generic/unistd.h
> > > > > +++ b/include/uapi/asm-generic/unistd.h
> > > > > @@ -731,9 +731,11 @@
> > > > > __SYSCALL(__NR_pkey_free,     sys_pkey_free)
> > > > > #define __NR_statx 291
> > > > > __SYSCALL(__NR_statx,     sys_statx)
> > > > > +#define __NR_copy_file_range64 292
> > > > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
> > > > > 
> > > > > #undef __NR_syscalls
> > > > > -#define __NR_syscalls 292
> > > > > +#define __NR_syscalls 293
> > > > > 
> > > > > /*
> > > > >  * All syscalls below here should go away really,
> > > > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> > > > > index 8acef85..8e0cfd4 100644
> > > > > --- a/kernel/sys_ni.c
> > > > > +++ b/kernel/sys_ni.c
> > > > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> > > > > cond_syscall(sys_capget);
> > > > > cond_syscall(sys_capset);
> > > > > cond_syscall(sys_copy_file_range);
> > > > > +cond_syscall(sys_copy_file_range64);
> > > > > 
> > > > > /* arch-specific weak syscall entries */
> > > > > cond_syscall(sys_pciconfig_read);
> > > > 
> > > > Hi Olga,
> > > > 
> > > > We discussed this a bit privately, but I'll note it here too.
> > > > 
> > > > Server-to-server copy seems like a nice thing to me as well. There are
> > > > several filesystems that can likely make use of that ability.
> > > > 
> > > > I don't have a real opinion on the API change either, but you're making
> > > > a very subtle change with this patch. Prior to this, the existing
> > > > copy_file_range calls could count on the superblocks being the same.
> > > > Now, it looks like you can pass them any old file pointer.
> > > 
> > > What if we add a new file op for xdev copy that gets called when
> > > superblocks are different, but filesystem type is the same?  We could
> > > keep the current copy_file_range fop to fall back on if superblocks
> > > are the same.
> > 
> > I don't think that would really help much. There are only two
> > filesystems with copy_file_range operations -- nfsv4 and cifs. I don't
> > think we really need another special case op for this.
> > 
> > What I would probably suggest is to just push the check for the same
> > superblock down into the copy_file_range ops in one patch (with no
> > functional change). Then, you could go and lift that restriction in the
> > NFSv4 operation without regressing cifs. The CIFS folks could eventually
> > do the same for theirs.
> 
> CIFS folks already check if their target and source destination are different then they return with EXDEV. So I believe the check that file system ops that are the same for the fd_in and fd_out would be sufficient for the current uses? 
> 

Yes that should cover it.
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2017-06-27 16:12 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia
2017-06-14 17:24 ` Anna Schumaker
2017-06-14 18:53   ` Olga Kornievskaia
2017-06-14 19:32     ` Amir Goldstein
2017-06-14 20:08       ` Olga Kornievskaia
2017-06-14 21:50     ` Randy Dunlap
2017-06-15 13:07       ` Olga Kornievskaia
2017-06-15  3:59     ` Darrick J. Wong
2017-06-15 14:39       ` Olga Kornievskaia
2017-06-15 20:53         ` Darrick J. Wong
2017-06-19 18:34           ` Olga Kornievskaia
2017-06-19 19:39             ` Christoph Hellwig
2017-06-19 19:48               ` Olga Kornievskaia
2017-06-20 12:38               ` Steve Dickson
2017-06-20 19:33               ` Chuck Lever
2017-06-20 19:37                 ` Olga Kornievskaia
2017-06-24 11:29               ` Mkrtchyan, Tigran
2017-06-20 12:44           ` Steve Dickson
2017-06-15  8:15 ` Christoph Hellwig
2017-06-15 13:07   ` Olga Kornievskaia
2017-06-15 18:35     ` Andreas Dilger
2017-06-15 19:11       ` Olga Kornievskaia
2017-06-16 17:36       ` J. Bruce Fields
2017-06-17 10:03         ` Christoph Hellwig
2017-06-17 12:42           ` Olga Kornievskaia
2017-06-17 15:09             ` Christoph Hellwig
2017-06-17 18:24               ` J. Bruce Fields
2017-06-24 13:23 ` Jeff Layton
2017-06-26 13:21   ` Anna Schumaker
2017-06-26 13:46     ` Jeff Layton
2017-06-26 14:46       ` Olga Kornievskaia
2017-06-26 14:49         ` [PATCH 1/1] VFS permit cross device vfs_copy_file_range Olga Kornievskaia
2017-06-27 11:47           ` Jeff Layton
2017-06-27 16:12             ` Olga Kornievskaia
     [not found]       ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com>
2017-06-26 14:51         ` [PATCH 1/1] [RFC] 64bit copy_file_range system call Jeff Layton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).