* [PATCH 1/1] [RFC] 64bit copy_file_range system call @ 2017-06-14 17:06 Olga Kornievskaia 2017-06-14 17:24 ` Anna Schumaker ` (2 more replies) 0 siblings, 3 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-14 17:06 UTC (permalink / raw) To: linux-fsdevel; +Cc: linux-nfs This is a proposal to allow 64bit count to copy and return as a result of a copy_file_range. No attempt was made to share code with the 32bit function because 32bit interface should probably get depreciated. Why use 64bit? Current uses of 32bit are by clone_file_range() which could use 64bit count and NFS copy_file_range also supports 64bit count value. Also with this proposal off-the-bet allow the userland to copy between different mount points. Signed-off-by: Olga Kornievskaia <kolga@netapp.com> --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 4 + include/linux/syscalls.h | 3 + include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 1 + 7 files changed, 149 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac21..1f5f1e7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -391,3 +391,4 @@ 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 copy_file_range64 sys_copy_file_range64 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183..e658bbe 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free 332 common statx sys_statx +333 64 copy_file_range64 sys_copy_file_range64 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/read_write.c b/fs/read_write.c index 47c1d44..e2665c1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, return ret; } +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 ret; + + if (flags != 0) + return -EINVAL; + + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + ret = rw_verify_area(READ, file_in, &pos_in, len); + if (unlikely(ret)) + return ret; + + ret = rw_verify_area(WRITE, file_out, &pos_out, len); + if (unlikely(ret)) + return ret; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + if (len == 0) + return 0; + + file_start_write(file_out); + + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range64) { + ret = file_out->f_op->copy_file_range64(file_in, pos_in, + file_out, pos_out, len, flags); + if (ret != -EOPNOTSUPP) + goto done; + } + + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + +done: + if (ret > 0) { + fsnotify_access(file_in); + add_rchar(current, ret); + fsnotify_modify(file_out); + add_wchar(current, ret); + } + + inc_syscr(current); + inc_syscw(current); + + file_end_write(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_copy_file_range64); + +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + u64, len, unsigned int, flags) +{ + loff_t pos_in; + loff_t pos_out; + struct fd f_in; + struct fd f_out; + u64 ret = -EBADF; + + f_in = fdget(fd_in); + if (!f_in.file) + goto out2; + + f_out = fdget(fd_out); + if (!f_out.file) + goto out1; + + ret = -EFAULT; + if (off_in) { + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) + goto out; + } else { + pos_in = f_in.file->f_pos; + } + + if (off_out) { + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) + goto out; + } else { + pos_out = f_out.file->f_pos; + } + + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, + flags); + if (ret > 0) { + pos_in += ret; + pos_out += ret; + + if (off_in) { + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_in.file->f_pos = pos_in; + } + + if (off_out) { + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_out.file->f_pos = pos_out; + } + } + +out: + fdput(f_out); +out1: + fdput(f_in); +out2: + return ret; +} + static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) { struct inode *inode = file_inode(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9..2727a98 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1690,6 +1690,8 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, + loff_t, u64, unsigned int); }; struct inode_operations { @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, + loff_t, u64, unsigned int); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9..f7ea78e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, asmlinkage long sys_pkey_free(int pkey); asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + u64 len, unsigned int flags); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 061185a..e385a76 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -731,9 +731,11 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) +#define __NR_copy_file_range64 292 +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) #undef __NR_syscalls -#define __NR_syscalls 292 +#define __NR_syscalls 293 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8acef85..8e0cfd4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) cond_syscall(sys_capget); cond_syscall(sys_capset); cond_syscall(sys_copy_file_range); +cond_syscall(sys_copy_file_range64); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia @ 2017-06-14 17:24 ` Anna Schumaker 2017-06-14 18:53 ` Olga Kornievskaia 2017-06-15 8:15 ` Christoph Hellwig 2017-06-24 13:23 ` Jeff Layton 2 siblings, 1 reply; 35+ messages in thread From: Anna Schumaker @ 2017-06-14 17:24 UTC (permalink / raw) To: Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs Hi Olga, On 06/14/2017 01:06 PM, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > include/linux/fs.h | 4 + > include/linux/syscalls.h | 3 + > include/uapi/asm-generic/unistd.h | 4 +- > kernel/sys_ni.c | 1 + > 7 files changed, 149 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 448ac21..1f5f1e7 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -391,3 +391,4 @@ > 382 i386 pkey_free sys_pkey_free > 383 i386 statx sys_statx > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > +385 i386 copy_file_range64 sys_copy_file_range64 > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 5aef183..e658bbe 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -339,6 +339,7 @@ > 330 common pkey_alloc sys_pkey_alloc > 331 common pkey_free sys_pkey_free > 332 common statx sys_statx > +333 64 copy_file_range64 sys_copy_file_range64 > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/fs/read_write.c b/fs/read_write.c > index 47c1d44..e2665c1 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > return ret; > } > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 len, unsigned int flags) > +{ > + struct inode *inode_in = file_inode(file_in); > + struct inode *inode_out = file_inode(file_out); > + u64 ret; > + > + if (flags != 0) > + return -EINVAL; > + > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > + return -EISDIR; > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > + return -EINVAL; > + > + ret = rw_verify_area(READ, file_in, &pos_in, len); > + if (unlikely(ret)) > + return ret; Doesn't rw_verify_area() only accept 32-bit ranges? You might have to use clone_verify_area() instead (maybe consider renaming that function to something generic?) Thanks, Anna > + > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > + if (unlikely(ret)) > + return ret; > + > + if (!(file_in->f_mode & FMODE_READ) || > + !(file_out->f_mode & FMODE_WRITE) || > + (file_out->f_flags & O_APPEND)) > + return -EBADF; > + > + if (len == 0) > + return 0; > + > + file_start_write(file_out); > + > + /* > + * Try cloning first, this is supported by more file systems, and > + * more efficient if both clone and copy are supported (e.g. NFS). > + */ > + if (file_in->f_op->clone_file_range) { > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > + file_out, pos_out, len); > + if (ret == 0) { > + ret = len; > + goto done; > + } > + } > + > + if (file_out->f_op->copy_file_range64) { > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > + file_out, pos_out, len, flags); > + if (ret != -EOPNOTSUPP) > + goto done; > + } > + > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > + > +done: > + if (ret > 0) { > + fsnotify_access(file_in); > + add_rchar(current, ret); > + fsnotify_modify(file_out); > + add_wchar(current, ret); > + } > + > + inc_syscr(current); > + inc_syscw(current); > + > + file_end_write(file_out); > + > + return ret; > +} > +EXPORT_SYMBOL(vfs_copy_file_range64); > + > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > + int, fd_out, loff_t __user *, off_out, > + u64, len, unsigned int, flags) > +{ > + loff_t pos_in; > + loff_t pos_out; > + struct fd f_in; > + struct fd f_out; > + u64 ret = -EBADF; > + > + f_in = fdget(fd_in); > + if (!f_in.file) > + goto out2; > + > + f_out = fdget(fd_out); > + if (!f_out.file) > + goto out1; > + > + ret = -EFAULT; > + if (off_in) { > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > + goto out; > + } else { > + pos_in = f_in.file->f_pos; > + } > + > + if (off_out) { > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > + goto out; > + } else { > + pos_out = f_out.file->f_pos; > + } > + > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > + flags); > + if (ret > 0) { > + pos_in += ret; > + pos_out += ret; > + > + if (off_in) { > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_in.file->f_pos = pos_in; > + } > + > + if (off_out) { > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_out.file->f_pos = pos_out; > + } > + } > + > +out: > + fdput(f_out); > +out1: > + fdput(f_in); > +out2: > + return ret; > +} > + > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > { > struct inode *inode = file_inode(file); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 803e5a9..2727a98 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1690,6 +1690,8 @@ struct file_operations { > u64); > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > u64); > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > }; > > struct inode_operations { > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > loff_t len, bool *is_same); > extern int vfs_dedupe_file_range(struct file *file, > struct file_dedupe_range *same); > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > > struct super_operations { > struct inode *(*alloc_inode)(struct super_block *sb); > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9..f7ea78e 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > asmlinkage long sys_pkey_free(int pkey); > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > unsigned mask, struct statx __user *buffer); > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > + int fd_out, loff_t __user *off_out, > + u64 len, unsigned int flags); > > #endif > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 061185a..e385a76 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -731,9 +731,11 @@ > __SYSCALL(__NR_pkey_free, sys_pkey_free) > #define __NR_statx 291 > __SYSCALL(__NR_statx, sys_statx) > +#define __NR_copy_file_range64 292 > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > #undef __NR_syscalls > -#define __NR_syscalls 292 > +#define __NR_syscalls 293 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 8acef85..8e0cfd4 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > cond_syscall(sys_capget); > cond_syscall(sys_capset); > cond_syscall(sys_copy_file_range); > +cond_syscall(sys_copy_file_range64); > > /* arch-specific weak syscall entries */ > cond_syscall(sys_pciconfig_read); > ^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 17:24 ` Anna Schumaker @ 2017-06-14 18:53 ` Olga Kornievskaia 2017-06-14 19:32 ` Amir Goldstein ` (2 more replies) 0 siblings, 3 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-14 18:53 UTC (permalink / raw) To: linux-fsdevel; +Cc: linux-nfs This is a proposal to allow 64bit count to copy and return as a result of a copy_file_range. No attempt was made to share code with the 32bit function because 32bit interface should probably get depreciated. Why use 64bit? Current uses of 32bit are by clone_file_range() which could use 64bit count and NFS copy_file_range also supports 64bit count value. Also with this proposal off-the-bet allow the userland to copy between different mount points. Signed-off-by: Olga Kornievskaia <kolga@netapp.com> --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- include/linux/fs.h | 4 + include/linux/syscalls.h | 3 + include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 1 + 7 files changed, 154 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac21..1f5f1e7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -391,3 +391,4 @@ 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 copy_file_range64 sys_copy_file_range64 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183..e658bbe 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free 332 common statx sys_statx +333 64 copy_file_range64 sys_copy_file_range64 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/read_write.c b/fs/read_write.c index e3ee985..c9c072b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, return ret; } -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write) { struct inode *inode = file_inode(file); @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 ret; + + if (flags != 0) + return -EINVAL; + + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + ret = rw64_verify_area(file_in, pos_in, len, false); + if (unlikely(ret)) + return ret; + + ret = rw64_verify_area(file_out, pos_out, len, true); + if (unlikely(ret)) + return ret; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + if (len == 0) + return 0; + + file_start_write(file_out); + + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range64) { + ret = file_out->f_op->copy_file_range64(file_in, pos_in, + file_out, pos_out, len, flags); + if (ret != -EOPNOTSUPP) + goto done; + } + + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + +done: + if (ret > 0) { + fsnotify_access(file_in); + add_rchar(current, ret); + fsnotify_modify(file_out); + add_wchar(current, ret); + } + + inc_syscr(current); + inc_syscw(current); + + file_end_write(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_copy_file_range64); + +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + u64, len, unsigned int, flags) +{ + loff_t pos_in; + loff_t pos_out; + struct fd f_in; + struct fd f_out; + u64 ret = -EBADF; + + f_in = fdget(fd_in); + if (!f_in.file) + goto out2; + + f_out = fdget(fd_out); + if (!f_out.file) + goto out1; + + ret = -EFAULT; + if (off_in) { + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) + goto out; + } else { + pos_in = f_in.file->f_pos; + } + + if (off_out) { + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) + goto out; + } else { + pos_out = f_out.file->f_pos; + } + + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, + flags); + if (ret > 0) { + pos_in += ret; + pos_out += ret; + + if (off_in) { + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_in.file->f_pos = pos_in; + } + + if (off_out) { + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_out.file->f_pos = pos_out; + } + } + +out: + fdput(f_out); +out1: + fdput(f_in); +out2: + return ret; +} + /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->clone_file_range) return -EOPNOTSUPP; - ret = clone_verify_area(file_in, pos_in, len, false); + ret = rw64_verify_area(file_in, pos_in, len, false); if (ret) return ret; - ret = clone_verify_area(file_out, pos_out, len, true); + ret = rw64_verify_area(file_out, pos_out, len, true); if (ret) return ret; @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (!S_ISREG(src->i_mode)) goto out; - ret = clone_verify_area(file, off, len, false); + ret = rw64_verify_area(file, off, len, false); if (ret < 0) goto out; ret = 0; @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } dst_off = info->dest_offset; - ret = clone_verify_area(dst_file, dst_off, len, true); + ret = rw64_verify_area(dst_file, dst_off, len, true); if (ret < 0) { info->status = ret; goto next_file; diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9..41fce43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1690,6 +1690,8 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, + loff_t, u64, unsigned int); }; struct inode_operations { @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, + loff_t, u64, unsigned int); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9..f7ea78e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, asmlinkage long sys_pkey_free(int pkey); asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + u64 len, unsigned int flags); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 061185a..e385a76 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -731,9 +731,11 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) +#define __NR_copy_file_range64 292 +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) #undef __NR_syscalls -#define __NR_syscalls 292 +#define __NR_syscalls 293 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8acef85..8e0cfd4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) cond_syscall(sys_capget); cond_syscall(sys_capset); cond_syscall(sys_copy_file_range); +cond_syscall(sys_copy_file_range64); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 18:53 ` Olga Kornievskaia @ 2017-06-14 19:32 ` Amir Goldstein 2017-06-14 20:08 ` Olga Kornievskaia 2017-06-14 21:50 ` Randy Dunlap 2017-06-15 3:59 ` Darrick J. Wong 2 siblings, 1 reply; 35+ messages in thread From: Amir Goldstein @ 2017-06-14 19:32 UTC (permalink / raw) To: Olga Kornievskaia Cc: linux-fsdevel, Linux NFS Mailing List, Christoph Hellwig On Wed, Jun 14, 2017 at 9:53 PM, Olga Kornievskaia <kolga@netapp.com> wrote: > > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- ... > + > + /* > + * Try cloning first, this is supported by more file systems, and > + * more efficient if both clone and copy are supported (e.g. NFS). > + */ > + if (file_in->f_op->clone_file_range) { > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > + file_out, pos_out, len); > + if (ret == 0) { > + ret = len; > + goto done; > + } > + } > + > + if (file_out->f_op->copy_file_range64) { > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > + file_out, pos_out, len, flags); > + if (ret != -EOPNOTSUPP) > + goto done; > + } > + > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > + Olga, I know this is copied from vfs_copy_file_range() as is, so my comment applies to the origin function as well, but it is easier to change the new function without changing userspace behavior, so.. A while back, either Dave Chinner or Christoph suggested that I use vfs_copy_file_range() from ovl_copy_up_data() instead of calling vfs_clone_file_range() and falling back to do_splice_direct() loop. Then Christoph added the clone_file_range attempt into vfs_copy_file_range(), which was one part of the work. However, ovl_copy_up_data() uses a killable loop of do_splice_direct() calls with smaller chunks, so it is not the same as vfs_copy_file_range(). I wonder if it makes sense to use a similar killable loop in the new function? I wonder, for reference, if NFS implementation of copy_file_range64() is killable? Amir. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 19:32 ` Amir Goldstein @ 2017-06-14 20:08 ` Olga Kornievskaia 0 siblings, 0 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-14 20:08 UTC (permalink / raw) To: Amir Goldstein Cc: Olga Kornievskaia, linux-fsdevel, Linux NFS Mailing List, Christoph Hellwig On Wed, Jun 14, 2017 at 3:32 PM, Amir Goldstein <amir73il@gmail.com> wrote: > On Wed, Jun 14, 2017 at 9:53 PM, Olga Kornievskaia <kolga@netapp.com> wrote: >> >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. >> >> Also with this proposal off-the-bet allow the userland to copy >> between different mount points. >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> --- > ... >> + >> + /* >> + * Try cloning first, this is supported by more file systems, and >> + * more efficient if both clone and copy are supported (e.g. NFS). >> + */ >> + if (file_in->f_op->clone_file_range) { >> + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> + file_out, pos_out, len); >> + if (ret == 0) { >> + ret = len; >> + goto done; >> + } >> + } >> + >> + if (file_out->f_op->copy_file_range64) { >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> + file_out, pos_out, len, flags); >> + if (ret != -EOPNOTSUPP) >> + goto done; >> + } >> + >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); >> + > > Olga, Hi Amir, Thanks for the comments. > I know this is copied from vfs_copy_file_range() as is, so my comment > applies to the origin function as well, but it is easier to change the new > function without changing userspace behavior, so.. > > A while back, either Dave Chinner or Christoph suggested that I use > vfs_copy_file_range() from ovl_copy_up_data() instead of calling > vfs_clone_file_range() and falling back to do_splice_direct() loop. > Then Christoph added the clone_file_range attempt into > vfs_copy_file_range(), which was one part of the work. > > However, ovl_copy_up_data() uses a killable loop of do_splice_direct() > calls with smaller chunks, so it is not the same as vfs_copy_file_range(). > > I wonder if it makes sense to use a similar killable loop in the new > function? I'm currently not proposing to change the semantics of copy_file_range() call to in some cases return a partial copy and thus making the application responsible for looping to finish the copy. Basically copy_file_range() follows the same semantics as read/write where it is possible that not all requested bytes are completed. However, if it seems more desirable to shift the responsibility to completing the copy into the kernel, then I can definitely add the same looping as does ovl_copy_up_data(). > I wonder, for reference, if NFS implementation of copy_file_range64() > is killable? I believe the current implementation of "intra" (same server) copy is not killable. The outstanding proposal to add asynchronous copy as well as "inter" (server to server) copy is killable. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 18:53 ` Olga Kornievskaia 2017-06-14 19:32 ` Amir Goldstein @ 2017-06-14 21:50 ` Randy Dunlap 2017-06-15 13:07 ` Olga Kornievskaia 2017-06-15 3:59 ` Darrick J. Wong 2 siblings, 1 reply; 35+ messages in thread From: Randy Dunlap @ 2017-06-14 21:50 UTC (permalink / raw) To: Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs On 06/14/17 11:53, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. deprecated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy what does "off-the-bet" mean? > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- > include/linux/fs.h | 4 + > include/linux/syscalls.h | 3 + > include/uapi/asm-generic/unistd.h | 4 +- > kernel/sys_ni.c | 1 + > 7 files changed, 154 insertions(+), 6 deletions(-) -- ~Randy ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 21:50 ` Randy Dunlap @ 2017-06-15 13:07 ` Olga Kornievskaia 0 siblings, 0 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-15 13:07 UTC (permalink / raw) To: Randy Dunlap; +Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Wed, Jun 14, 2017 at 5:50 PM, Randy Dunlap <rdunlap@infradead.org> wrote: > On 06/14/17 11:53, Olga Kornievskaia wrote: >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. > > deprecated. > >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. >> >> Also with this proposal off-the-bet allow the userland to copy > > what does "off-the-bet" mean? from the beginning. > >> between different mount points. >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> --- >> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- >> include/linux/fs.h | 4 + >> include/linux/syscalls.h | 3 + >> include/uapi/asm-generic/unistd.h | 4 +- >> kernel/sys_ni.c | 1 + >> 7 files changed, 154 insertions(+), 6 deletions(-) > > > -- > ~Randy > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 18:53 ` Olga Kornievskaia 2017-06-14 19:32 ` Amir Goldstein 2017-06-14 21:50 ` Randy Dunlap @ 2017-06-15 3:59 ` Darrick J. Wong 2017-06-15 14:39 ` Olga Kornievskaia 2 siblings, 1 reply; 35+ messages in thread From: Darrick J. Wong @ 2017-06-15 3:59 UTC (permalink / raw) To: Olga Kornievskaia; +Cc: linux-fsdevel, linux-nfs On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- > include/linux/fs.h | 4 + > include/linux/syscalls.h | 3 + > include/uapi/asm-generic/unistd.h | 4 +- > kernel/sys_ni.c | 1 + > 7 files changed, 154 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 448ac21..1f5f1e7 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -391,3 +391,4 @@ > 382 i386 pkey_free sys_pkey_free > 383 i386 statx sys_statx > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > +385 i386 copy_file_range64 sys_copy_file_range64 > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 5aef183..e658bbe 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -339,6 +339,7 @@ > 330 common pkey_alloc sys_pkey_alloc > 331 common pkey_free sys_pkey_free > 332 common statx sys_statx > +333 64 copy_file_range64 sys_copy_file_range64 > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/fs/read_write.c b/fs/read_write.c > index e3ee985..c9c072b 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > return ret; > } > > -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write) > { > struct inode *inode = file_inode(file); > > @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > return security_file_permission(file, write ? MAY_WRITE : MAY_READ); > } > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 len, unsigned int flags) AFAICT the difference between vfs_copy_file_range() and vfs_copy_file_range64() is that you've replaced 'size_t len' with 'u64 len', correct? sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as the userspace interface is concerned, this only makes a difference on 32-bit userlands, right? So, uh, how many 32-bit user programs need to c_f_r more than 2GB at a time? Follow up question -- does c_f_r not work for > 2GB of data when userspace is 64-bit? --D > +{ > + struct inode *inode_in = file_inode(file_in); > + struct inode *inode_out = file_inode(file_out); > + u64 ret; > + > + if (flags != 0) > + return -EINVAL; > + > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > + return -EISDIR; > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > + return -EINVAL; > + > + ret = rw64_verify_area(file_in, pos_in, len, false); > + if (unlikely(ret)) > + return ret; > + > + ret = rw64_verify_area(file_out, pos_out, len, true); > + if (unlikely(ret)) > + return ret; > + > + if (!(file_in->f_mode & FMODE_READ) || > + !(file_out->f_mode & FMODE_WRITE) || > + (file_out->f_flags & O_APPEND)) > + return -EBADF; > + > + if (len == 0) > + return 0; > + > + file_start_write(file_out); > + > + /* > + * Try cloning first, this is supported by more file systems, and > + * more efficient if both clone and copy are supported (e.g. NFS). > + */ > + if (file_in->f_op->clone_file_range) { > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > + file_out, pos_out, len); > + if (ret == 0) { > + ret = len; > + goto done; > + } > + } > + > + if (file_out->f_op->copy_file_range64) { > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > + file_out, pos_out, len, flags); > + if (ret != -EOPNOTSUPP) > + goto done; > + } > + > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > + > +done: > + if (ret > 0) { > + fsnotify_access(file_in); > + add_rchar(current, ret); > + fsnotify_modify(file_out); > + add_wchar(current, ret); > + } > + > + inc_syscr(current); > + inc_syscw(current); > + > + file_end_write(file_out); > + > + return ret; > +} > +EXPORT_SYMBOL(vfs_copy_file_range64); > + > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > + int, fd_out, loff_t __user *, off_out, > + u64, len, unsigned int, flags) > +{ > + loff_t pos_in; > + loff_t pos_out; > + struct fd f_in; > + struct fd f_out; > + u64 ret = -EBADF; > + > + f_in = fdget(fd_in); > + if (!f_in.file) > + goto out2; > + > + f_out = fdget(fd_out); > + if (!f_out.file) > + goto out1; > + > + ret = -EFAULT; > + if (off_in) { > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > + goto out; > + } else { > + pos_in = f_in.file->f_pos; > + } > + > + if (off_out) { > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > + goto out; > + } else { > + pos_out = f_out.file->f_pos; > + } > + > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > + flags); > + if (ret > 0) { > + pos_in += ret; > + pos_out += ret; > + > + if (off_in) { > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_in.file->f_pos = pos_in; > + } > + > + if (off_out) { > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_out.file->f_pos = pos_out; > + } > + } > + > +out: > + fdput(f_out); > +out1: > + fdput(f_in); > +out2: > + return ret; > +} > + > /* > * Check that the two inodes are eligible for cloning, the ranges make > * sense, and then flush all dirty data. Caller must ensure that the > @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, > if (!file_in->f_op->clone_file_range) > return -EOPNOTSUPP; > > - ret = clone_verify_area(file_in, pos_in, len, false); > + ret = rw64_verify_area(file_in, pos_in, len, false); > if (ret) > return ret; > > - ret = clone_verify_area(file_out, pos_out, len, true); > + ret = rw64_verify_area(file_out, pos_out, len, true); > if (ret) > return ret; > > @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) > if (!S_ISREG(src->i_mode)) > goto out; > > - ret = clone_verify_area(file, off, len, false); > + ret = rw64_verify_area(file, off, len, false); > if (ret < 0) > goto out; > ret = 0; > @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) > } > > dst_off = info->dest_offset; > - ret = clone_verify_area(dst_file, dst_off, len, true); > + ret = rw64_verify_area(dst_file, dst_off, len, true); > if (ret < 0) { > info->status = ret; > goto next_file; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 803e5a9..41fce43 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1690,6 +1690,8 @@ struct file_operations { > u64); > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > u64); > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > }; > > struct inode_operations { > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > loff_t len, bool *is_same); > extern int vfs_dedupe_file_range(struct file *file, > struct file_dedupe_range *same); > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > > struct super_operations { > struct inode *(*alloc_inode)(struct super_block *sb); > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9..f7ea78e 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > asmlinkage long sys_pkey_free(int pkey); > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > unsigned mask, struct statx __user *buffer); > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > + int fd_out, loff_t __user *off_out, > + u64 len, unsigned int flags); > > #endif > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 061185a..e385a76 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -731,9 +731,11 @@ > __SYSCALL(__NR_pkey_free, sys_pkey_free) > #define __NR_statx 291 > __SYSCALL(__NR_statx, sys_statx) > +#define __NR_copy_file_range64 292 > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > #undef __NR_syscalls > -#define __NR_syscalls 292 > +#define __NR_syscalls 293 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 8acef85..8e0cfd4 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > cond_syscall(sys_capget); > cond_syscall(sys_capset); > cond_syscall(sys_copy_file_range); > +cond_syscall(sys_copy_file_range64); > > /* arch-specific weak syscall entries */ > cond_syscall(sys_pciconfig_read); > -- > 1.8.3.1 > ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 3:59 ` Darrick J. Wong @ 2017-06-15 14:39 ` Olga Kornievskaia 2017-06-15 20:53 ` Darrick J. Wong 0 siblings, 1 reply; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-15 14:39 UTC (permalink / raw) To: Darrick J. Wong Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote: > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote: >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. >> >> Also with this proposal off-the-bet allow the userland to copy >> between different mount points. >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> --- >> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- >> include/linux/fs.h | 4 + >> include/linux/syscalls.h | 3 + >> include/uapi/asm-generic/unistd.h | 4 +- >> kernel/sys_ni.c | 1 + >> 7 files changed, 154 insertions(+), 6 deletions(-) >> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >> index 448ac21..1f5f1e7 100644 >> --- a/arch/x86/entry/syscalls/syscall_32.tbl >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl >> @@ -391,3 +391,4 @@ >> 382 i386 pkey_free sys_pkey_free >> 383 i386 statx sys_statx >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >> +385 i386 copy_file_range64 sys_copy_file_range64 >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> index 5aef183..e658bbe 100644 >> --- a/arch/x86/entry/syscalls/syscall_64.tbl >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> @@ -339,6 +339,7 @@ >> 330 common pkey_alloc sys_pkey_alloc >> 331 common pkey_free sys_pkey_free >> 332 common statx sys_statx >> +333 64 copy_file_range64 sys_copy_file_range64 >> >> # >> # x32-specific system call numbers start at 512 to avoid cache impact >> diff --git a/fs/read_write.c b/fs/read_write.c >> index e3ee985..c9c072b 100644 >> --- a/fs/read_write.c >> +++ b/fs/read_write.c >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> return ret; >> } >> >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> { >> struct inode *inode = file_inode(file); >> >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> return security_file_permission(file, write ? MAY_WRITE : MAY_READ); >> } >> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >> + struct file *file_out, loff_t pos_out, >> + u64 len, unsigned int flags) > > AFAICT the difference between vfs_copy_file_range() and > vfs_copy_file_range64() is that you've replaced 'size_t len' with > 'u64 len', correct? That and I'm asking to allow for cross-device copy to be allowed by the system call. > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as > the userspace interface is concerned, this only makes a difference on > 32-bit userlands, right? So, uh, how many 32-bit user programs need to > c_f_r more than 2GB at a time? Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit machine. If I'm wrong, then this patch is indeed not needed. > Follow up question -- does c_f_r not work for > 2GB of data when > userspace is 64-bit? Current copy_file_range would work for any file size, it would just need to be called in a loop by the application. With the given proposal, there wouldn't be a need for the application to loop due to the API size limitation. It might still loop if a partial copy was done. > > --D > >> +{ >> + struct inode *inode_in = file_inode(file_in); >> + struct inode *inode_out = file_inode(file_out); >> + u64 ret; >> + >> + if (flags != 0) >> + return -EINVAL; >> + >> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) >> + return -EISDIR; >> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) >> + return -EINVAL; >> + >> + ret = rw64_verify_area(file_in, pos_in, len, false); >> + if (unlikely(ret)) >> + return ret; >> + >> + ret = rw64_verify_area(file_out, pos_out, len, true); >> + if (unlikely(ret)) >> + return ret; >> + >> + if (!(file_in->f_mode & FMODE_READ) || >> + !(file_out->f_mode & FMODE_WRITE) || >> + (file_out->f_flags & O_APPEND)) >> + return -EBADF; >> + >> + if (len == 0) >> + return 0; >> + >> + file_start_write(file_out); >> + >> + /* >> + * Try cloning first, this is supported by more file systems, and >> + * more efficient if both clone and copy are supported (e.g. NFS). >> + */ >> + if (file_in->f_op->clone_file_range) { >> + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> + file_out, pos_out, len); >> + if (ret == 0) { >> + ret = len; >> + goto done; >> + } >> + } >> + >> + if (file_out->f_op->copy_file_range64) { >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> + file_out, pos_out, len, flags); >> + if (ret != -EOPNOTSUPP) >> + goto done; >> + } >> + >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); >> + >> +done: >> + if (ret > 0) { >> + fsnotify_access(file_in); >> + add_rchar(current, ret); >> + fsnotify_modify(file_out); >> + add_wchar(current, ret); >> + } >> + >> + inc_syscr(current); >> + inc_syscw(current); >> + >> + file_end_write(file_out); >> + >> + return ret; >> +} >> +EXPORT_SYMBOL(vfs_copy_file_range64); >> + >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, >> + int, fd_out, loff_t __user *, off_out, >> + u64, len, unsigned int, flags) >> +{ >> + loff_t pos_in; >> + loff_t pos_out; >> + struct fd f_in; >> + struct fd f_out; >> + u64 ret = -EBADF; >> + >> + f_in = fdget(fd_in); >> + if (!f_in.file) >> + goto out2; >> + >> + f_out = fdget(fd_out); >> + if (!f_out.file) >> + goto out1; >> + >> + ret = -EFAULT; >> + if (off_in) { >> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) >> + goto out; >> + } else { >> + pos_in = f_in.file->f_pos; >> + } >> + >> + if (off_out) { >> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) >> + goto out; >> + } else { >> + pos_out = f_out.file->f_pos; >> + } >> + >> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, >> + flags); >> + if (ret > 0) { >> + pos_in += ret; >> + pos_out += ret; >> + >> + if (off_in) { >> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) >> + ret = -EFAULT; >> + } else { >> + f_in.file->f_pos = pos_in; >> + } >> + >> + if (off_out) { >> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) >> + ret = -EFAULT; >> + } else { >> + f_out.file->f_pos = pos_out; >> + } >> + } >> + >> +out: >> + fdput(f_out); >> +out1: >> + fdput(f_in); >> +out2: >> + return ret; >> +} >> + >> /* >> * Check that the two inodes are eligible for cloning, the ranges make >> * sense, and then flush all dirty data. Caller must ensure that the >> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, >> if (!file_in->f_op->clone_file_range) >> return -EOPNOTSUPP; >> >> - ret = clone_verify_area(file_in, pos_in, len, false); >> + ret = rw64_verify_area(file_in, pos_in, len, false); >> if (ret) >> return ret; >> >> - ret = clone_verify_area(file_out, pos_out, len, true); >> + ret = rw64_verify_area(file_out, pos_out, len, true); >> if (ret) >> return ret; >> >> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) >> if (!S_ISREG(src->i_mode)) >> goto out; >> >> - ret = clone_verify_area(file, off, len, false); >> + ret = rw64_verify_area(file, off, len, false); >> if (ret < 0) >> goto out; >> ret = 0; >> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) >> } >> >> dst_off = info->dest_offset; >> - ret = clone_verify_area(dst_file, dst_off, len, true); >> + ret = rw64_verify_area(dst_file, dst_off, len, true); >> if (ret < 0) { >> info->status = ret; >> goto next_file; >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index 803e5a9..41fce43 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -1690,6 +1690,8 @@ struct file_operations { >> u64); >> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, >> u64); >> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, >> + loff_t, u64, unsigned int); >> }; >> >> struct inode_operations { >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, >> loff_t len, bool *is_same); >> extern int vfs_dedupe_file_range(struct file *file, >> struct file_dedupe_range *same); >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, >> + loff_t, u64, unsigned int); >> >> struct super_operations { >> struct inode *(*alloc_inode)(struct super_block *sb); >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> index 980c3c9..f7ea78e 100644 >> --- a/include/linux/syscalls.h >> +++ b/include/linux/syscalls.h >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, >> asmlinkage long sys_pkey_free(int pkey); >> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, >> unsigned mask, struct statx __user *buffer); >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, >> + int fd_out, loff_t __user *off_out, >> + u64 len, unsigned int flags); >> >> #endif >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h >> index 061185a..e385a76 100644 >> --- a/include/uapi/asm-generic/unistd.h >> +++ b/include/uapi/asm-generic/unistd.h >> @@ -731,9 +731,11 @@ >> __SYSCALL(__NR_pkey_free, sys_pkey_free) >> #define __NR_statx 291 >> __SYSCALL(__NR_statx, sys_statx) >> +#define __NR_copy_file_range64 292 >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) >> >> #undef __NR_syscalls >> -#define __NR_syscalls 292 >> +#define __NR_syscalls 293 >> >> /* >> * All syscalls below here should go away really, >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> index 8acef85..8e0cfd4 100644 >> --- a/kernel/sys_ni.c >> +++ b/kernel/sys_ni.c >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) >> cond_syscall(sys_capget); >> cond_syscall(sys_capset); >> cond_syscall(sys_copy_file_range); >> +cond_syscall(sys_copy_file_range64); >> >> /* arch-specific weak syscall entries */ >> cond_syscall(sys_pciconfig_read); >> -- >> 1.8.3.1 >> > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 14:39 ` Olga Kornievskaia @ 2017-06-15 20:53 ` Darrick J. Wong 2017-06-19 18:34 ` Olga Kornievskaia 2017-06-20 12:44 ` Steve Dickson 0 siblings, 2 replies; 35+ messages in thread From: Darrick J. Wong @ 2017-06-15 20:53 UTC (permalink / raw) To: Olga Kornievskaia Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote: > On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong > <darrick.wong@oracle.com> wrote: > > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote: > >> This is a proposal to allow 64bit count to copy and return as > >> a result of a copy_file_range. No attempt was made to share code > >> with the 32bit function because 32bit interface should probably > >> get depreciated. > >> > >> Why use 64bit? Current uses of 32bit are by clone_file_range() > >> which could use 64bit count and NFS copy_file_range also supports > >> 64bit count value. > >> > >> Also with this proposal off-the-bet allow the userland to copy > >> between different mount points. > >> > >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > >> --- > >> arch/x86/entry/syscalls/syscall_32.tbl | 1 + > >> arch/x86/entry/syscalls/syscall_64.tbl | 1 + > >> fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- > >> include/linux/fs.h | 4 + > >> include/linux/syscalls.h | 3 + > >> include/uapi/asm-generic/unistd.h | 4 +- > >> kernel/sys_ni.c | 1 + > >> 7 files changed, 154 insertions(+), 6 deletions(-) > >> > >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > >> index 448ac21..1f5f1e7 100644 > >> --- a/arch/x86/entry/syscalls/syscall_32.tbl > >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl > >> @@ -391,3 +391,4 @@ > >> 382 i386 pkey_free sys_pkey_free > >> 383 i386 statx sys_statx > >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > >> +385 i386 copy_file_range64 sys_copy_file_range64 > >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > >> index 5aef183..e658bbe 100644 > >> --- a/arch/x86/entry/syscalls/syscall_64.tbl > >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl > >> @@ -339,6 +339,7 @@ > >> 330 common pkey_alloc sys_pkey_alloc > >> 331 common pkey_free sys_pkey_free > >> 332 common statx sys_statx > >> +333 64 copy_file_range64 sys_copy_file_range64 > >> > >> # > >> # x32-specific system call numbers start at 512 to avoid cache impact > >> diff --git a/fs/read_write.c b/fs/read_write.c > >> index e3ee985..c9c072b 100644 > >> --- a/fs/read_write.c > >> +++ b/fs/read_write.c > >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > >> return ret; > >> } > >> > >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write) > >> { > >> struct inode *inode = file_inode(file); > >> > >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > >> return security_file_permission(file, write ? MAY_WRITE : MAY_READ); > >> } > >> > >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > >> + struct file *file_out, loff_t pos_out, > >> + u64 len, unsigned int flags) > > > > AFAICT the difference between vfs_copy_file_range() and > > vfs_copy_file_range64() is that you've replaced 'size_t len' with > > 'u64 len', correct? > > That and I'm asking to allow for cross-device copy to be allowed by > the system call. I was fine letting each fs decide if it was going to allow cross-device copy or not, but Christoph argued that since we don't generally allow operations across mountpoints we shouldn't allow cross-mountpoint {clone,copy}_file_range either. Roughly translated, XFS has no cross-device copy abilities, there aren't any plans to add it, so I don't feel motivated to argue for it... but if the nfs community is so motivated (it sounds like it) then y'all could try one more time. > > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as > > the userspace interface is concerned, this only makes a difference on > > 32-bit userlands, right? So, uh, how many 32-bit user programs need to > > c_f_r more than 2GB at a time? > > Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm > under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit > machine. If I'm wrong, then this patch is indeed not needed. >From include/uapi/asm-generic/posix_types.h: /* * Most 32 bit architectures use "unsigned int" size_t, * and all 64 bit architectures use "unsigned long" size_t. */ #ifndef __kernel_size_t #if __BITS_PER_LONG != 64 typedef unsigned int __kernel_size_t; typedef int __kernel_ssize_t; typedef int __kernel_ptrdiff_t; #else typedef __kernel_ulong_t __kernel_size_t; typedef __kernel_long_t __kernel_ssize_t; typedef __kernel_long_t __kernel_ptrdiff_t; #endif #endif Note that 'long' is 64-bit, at least on x64. > > Follow up question -- does c_f_r not work for > 2GB of data when > > userspace is 64-bit? > > Current copy_file_range would work for any file size, it would just > need to be called in a loop by the application. With the given > proposal, there wouldn't be a need for the application to loop due to > the API size limitation. It might still loop if a partial copy was > done. ...which is what happens if we fall back to do_splice_direct, since a single call won't pagecache-copy more than INT_MAX bytes from one file to another. Therefore, any serious user of this syscall has to check the arguments and loop if it wants the whole range done, similar to how one is (supposed) to call write(). --D > > > > > --D > > > >> +{ > >> + struct inode *inode_in = file_inode(file_in); > >> + struct inode *inode_out = file_inode(file_out); > >> + u64 ret; > >> + > >> + if (flags != 0) > >> + return -EINVAL; > >> + > >> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > >> + return -EISDIR; > >> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > >> + return -EINVAL; > >> + > >> + ret = rw64_verify_area(file_in, pos_in, len, false); > >> + if (unlikely(ret)) > >> + return ret; > >> + > >> + ret = rw64_verify_area(file_out, pos_out, len, true); > >> + if (unlikely(ret)) > >> + return ret; > >> + > >> + if (!(file_in->f_mode & FMODE_READ) || > >> + !(file_out->f_mode & FMODE_WRITE) || > >> + (file_out->f_flags & O_APPEND)) > >> + return -EBADF; > >> + > >> + if (len == 0) > >> + return 0; > >> + > >> + file_start_write(file_out); > >> + > >> + /* > >> + * Try cloning first, this is supported by more file systems, and > >> + * more efficient if both clone and copy are supported (e.g. NFS). > >> + */ > >> + if (file_in->f_op->clone_file_range) { > >> + ret = file_in->f_op->clone_file_range(file_in, pos_in, > >> + file_out, pos_out, len); > >> + if (ret == 0) { > >> + ret = len; > >> + goto done; > >> + } > >> + } > >> + > >> + if (file_out->f_op->copy_file_range64) { > >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > >> + file_out, pos_out, len, flags); > >> + if (ret != -EOPNOTSUPP) > >> + goto done; > >> + } > >> + > >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > >> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > >> + > >> +done: > >> + if (ret > 0) { > >> + fsnotify_access(file_in); > >> + add_rchar(current, ret); > >> + fsnotify_modify(file_out); > >> + add_wchar(current, ret); > >> + } > >> + > >> + inc_syscr(current); > >> + inc_syscw(current); > >> + > >> + file_end_write(file_out); > >> + > >> + return ret; > >> +} > >> +EXPORT_SYMBOL(vfs_copy_file_range64); > >> + > >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > >> + int, fd_out, loff_t __user *, off_out, > >> + u64, len, unsigned int, flags) > >> +{ > >> + loff_t pos_in; > >> + loff_t pos_out; > >> + struct fd f_in; > >> + struct fd f_out; > >> + u64 ret = -EBADF; > >> + > >> + f_in = fdget(fd_in); > >> + if (!f_in.file) > >> + goto out2; > >> + > >> + f_out = fdget(fd_out); > >> + if (!f_out.file) > >> + goto out1; > >> + > >> + ret = -EFAULT; > >> + if (off_in) { > >> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > >> + goto out; > >> + } else { > >> + pos_in = f_in.file->f_pos; > >> + } > >> + > >> + if (off_out) { > >> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > >> + goto out; > >> + } else { > >> + pos_out = f_out.file->f_pos; > >> + } > >> + > >> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > >> + flags); > >> + if (ret > 0) { > >> + pos_in += ret; > >> + pos_out += ret; > >> + > >> + if (off_in) { > >> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > >> + ret = -EFAULT; > >> + } else { > >> + f_in.file->f_pos = pos_in; > >> + } > >> + > >> + if (off_out) { > >> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > >> + ret = -EFAULT; > >> + } else { > >> + f_out.file->f_pos = pos_out; > >> + } > >> + } > >> + > >> +out: > >> + fdput(f_out); > >> +out1: > >> + fdput(f_in); > >> +out2: > >> + return ret; > >> +} > >> + > >> /* > >> * Check that the two inodes are eligible for cloning, the ranges make > >> * sense, and then flush all dirty data. Caller must ensure that the > >> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, > >> if (!file_in->f_op->clone_file_range) > >> return -EOPNOTSUPP; > >> > >> - ret = clone_verify_area(file_in, pos_in, len, false); > >> + ret = rw64_verify_area(file_in, pos_in, len, false); > >> if (ret) > >> return ret; > >> > >> - ret = clone_verify_area(file_out, pos_out, len, true); > >> + ret = rw64_verify_area(file_out, pos_out, len, true); > >> if (ret) > >> return ret; > >> > >> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) > >> if (!S_ISREG(src->i_mode)) > >> goto out; > >> > >> - ret = clone_verify_area(file, off, len, false); > >> + ret = rw64_verify_area(file, off, len, false); > >> if (ret < 0) > >> goto out; > >> ret = 0; > >> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) > >> } > >> > >> dst_off = info->dest_offset; > >> - ret = clone_verify_area(dst_file, dst_off, len, true); > >> + ret = rw64_verify_area(dst_file, dst_off, len, true); > >> if (ret < 0) { > >> info->status = ret; > >> goto next_file; > >> diff --git a/include/linux/fs.h b/include/linux/fs.h > >> index 803e5a9..41fce43 100644 > >> --- a/include/linux/fs.h > >> +++ b/include/linux/fs.h > >> @@ -1690,6 +1690,8 @@ struct file_operations { > >> u64); > >> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > >> u64); > >> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > >> + loff_t, u64, unsigned int); > >> }; > >> > >> struct inode_operations { > >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > >> loff_t len, bool *is_same); > >> extern int vfs_dedupe_file_range(struct file *file, > >> struct file_dedupe_range *same); > >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > >> + loff_t, u64, unsigned int); > >> > >> struct super_operations { > >> struct inode *(*alloc_inode)(struct super_block *sb); > >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > >> index 980c3c9..f7ea78e 100644 > >> --- a/include/linux/syscalls.h > >> +++ b/include/linux/syscalls.h > >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > >> asmlinkage long sys_pkey_free(int pkey); > >> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > >> unsigned mask, struct statx __user *buffer); > >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > >> + int fd_out, loff_t __user *off_out, > >> + u64 len, unsigned int flags); > >> > >> #endif > >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > >> index 061185a..e385a76 100644 > >> --- a/include/uapi/asm-generic/unistd.h > >> +++ b/include/uapi/asm-generic/unistd.h > >> @@ -731,9 +731,11 @@ > >> __SYSCALL(__NR_pkey_free, sys_pkey_free) > >> #define __NR_statx 291 > >> __SYSCALL(__NR_statx, sys_statx) > >> +#define __NR_copy_file_range64 292 > >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > >> > >> #undef __NR_syscalls > >> -#define __NR_syscalls 292 > >> +#define __NR_syscalls 293 > >> > >> /* > >> * All syscalls below here should go away really, > >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > >> index 8acef85..8e0cfd4 100644 > >> --- a/kernel/sys_ni.c > >> +++ b/kernel/sys_ni.c > >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > >> cond_syscall(sys_capget); > >> cond_syscall(sys_capset); > >> cond_syscall(sys_copy_file_range); > >> +cond_syscall(sys_copy_file_range64); > >> > >> /* arch-specific weak syscall entries */ > >> cond_syscall(sys_pciconfig_read); > >> -- > >> 1.8.3.1 > >> > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 20:53 ` Darrick J. Wong @ 2017-06-19 18:34 ` Olga Kornievskaia 2017-06-19 19:39 ` Christoph Hellwig 2017-06-20 12:44 ` Steve Dickson 1 sibling, 1 reply; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-19 18:34 UTC (permalink / raw) To: Darrick J. Wong Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Thu, Jun 15, 2017 at 4:53 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote: > On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote: >> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong >> <darrick.wong@oracle.com> wrote: >> > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote: >> >> This is a proposal to allow 64bit count to copy and return as >> >> a result of a copy_file_range. No attempt was made to share code >> >> with the 32bit function because 32bit interface should probably >> >> get depreciated. >> >> >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> >> which could use 64bit count and NFS copy_file_range also supports >> >> 64bit count value. >> >> >> >> Also with this proposal off-the-bet allow the userland to copy >> >> between different mount points. >> >> >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> >> --- >> >> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> >> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> >> fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- >> >> include/linux/fs.h | 4 + >> >> include/linux/syscalls.h | 3 + >> >> include/uapi/asm-generic/unistd.h | 4 +- >> >> kernel/sys_ni.c | 1 + >> >> 7 files changed, 154 insertions(+), 6 deletions(-) >> >> >> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >> >> index 448ac21..1f5f1e7 100644 >> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl >> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl >> >> @@ -391,3 +391,4 @@ >> >> 382 i386 pkey_free sys_pkey_free >> >> 383 i386 statx sys_statx >> >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >> >> +385 i386 copy_file_range64 sys_copy_file_range64 >> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> >> index 5aef183..e658bbe 100644 >> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl >> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> >> @@ -339,6 +339,7 @@ >> >> 330 common pkey_alloc sys_pkey_alloc >> >> 331 common pkey_free sys_pkey_free >> >> 332 common statx sys_statx >> >> +333 64 copy_file_range64 sys_copy_file_range64 >> >> >> >> # >> >> # x32-specific system call numbers start at 512 to avoid cache impact >> >> diff --git a/fs/read_write.c b/fs/read_write.c >> >> index e3ee985..c9c072b 100644 >> >> --- a/fs/read_write.c >> >> +++ b/fs/read_write.c >> >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> >> return ret; >> >> } >> >> >> >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> >> { >> >> struct inode *inode = file_inode(file); >> >> >> >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> >> return security_file_permission(file, write ? MAY_WRITE : MAY_READ); >> >> } >> >> >> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >> >> + struct file *file_out, loff_t pos_out, >> >> + u64 len, unsigned int flags) >> > >> > AFAICT the difference between vfs_copy_file_range() and >> > vfs_copy_file_range64() is that you've replaced 'size_t len' with >> > 'u64 len', correct? >> >> That and I'm asking to allow for cross-device copy to be allowed by >> the system call. > > I was fine letting each fs decide if it was going to allow cross-device > copy or not, but Christoph argued that since we don't generally allow > operations across mountpoints we shouldn't allow cross-mountpoint > {clone,copy}_file_range either. > > Roughly translated, XFS has no cross-device copy abilities, there aren't > any plans to add it, so I don't feel motivated to argue for it... but if > the nfs community is so motivated (it sounds like it) then y'all could > try one more time. Yes NFS community does need one for doing a server-to-server copy (performance) feature. There was an argument that none of the VFS operations allow for cross-device operations. What about splice, doesn't it allow to cross mount points? So why not allow copy_file_range to do it too? >> > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as >> > the userspace interface is concerned, this only makes a difference on >> > 32-bit userlands, right? So, uh, how many 32-bit user programs need to >> > c_f_r more than 2GB at a time? >> >> Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm >> under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit >> machine. If I'm wrong, then this patch is indeed not needed. > > From include/uapi/asm-generic/posix_types.h: > > /* > * Most 32 bit architectures use "unsigned int" size_t, > * and all 64 bit architectures use "unsigned long" size_t. > */ > #ifndef __kernel_size_t > #if __BITS_PER_LONG != 64 > typedef unsigned int __kernel_size_t; > typedef int __kernel_ssize_t; > typedef int __kernel_ptrdiff_t; > #else > typedef __kernel_ulong_t __kernel_size_t; > typedef __kernel_long_t __kernel_ssize_t; > typedef __kernel_long_t __kernel_ptrdiff_t; > #endif > #endif > > Note that 'long' is 64-bit, at least on x64. Ok, thank you for pointing this out. I see now that it's not important to change the size_t to allow for larger than 4GB copies. > >> > Follow up question -- does c_f_r not work for > 2GB of data when >> > userspace is 64-bit? >> >> Current copy_file_range would work for any file size, it would just >> need to be called in a loop by the application. With the given >> proposal, there wouldn't be a need for the application to loop due to >> the API size limitation. It might still loop if a partial copy was >> done. > > ...which is what happens if we fall back to do_splice_direct, since a > single call won't pagecache-copy more than INT_MAX bytes from one file > to another. Therefore, any serious user of this syscall has to check > the arguments and loop if it wants the whole range done, similar to how > one is (supposed) to call write(). > > --D > >> >> > >> > --D >> > >> >> +{ >> >> + struct inode *inode_in = file_inode(file_in); >> >> + struct inode *inode_out = file_inode(file_out); >> >> + u64 ret; >> >> + >> >> + if (flags != 0) >> >> + return -EINVAL; >> >> + >> >> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) >> >> + return -EISDIR; >> >> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) >> >> + return -EINVAL; >> >> + >> >> + ret = rw64_verify_area(file_in, pos_in, len, false); >> >> + if (unlikely(ret)) >> >> + return ret; >> >> + >> >> + ret = rw64_verify_area(file_out, pos_out, len, true); >> >> + if (unlikely(ret)) >> >> + return ret; >> >> + >> >> + if (!(file_in->f_mode & FMODE_READ) || >> >> + !(file_out->f_mode & FMODE_WRITE) || >> >> + (file_out->f_flags & O_APPEND)) >> >> + return -EBADF; >> >> + >> >> + if (len == 0) >> >> + return 0; >> >> + >> >> + file_start_write(file_out); >> >> + >> >> + /* >> >> + * Try cloning first, this is supported by more file systems, and >> >> + * more efficient if both clone and copy are supported (e.g. NFS). >> >> + */ >> >> + if (file_in->f_op->clone_file_range) { >> >> + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> >> + file_out, pos_out, len); >> >> + if (ret == 0) { >> >> + ret = len; >> >> + goto done; >> >> + } >> >> + } >> >> + >> >> + if (file_out->f_op->copy_file_range64) { >> >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> >> + file_out, pos_out, len, flags); >> >> + if (ret != -EOPNOTSUPP) >> >> + goto done; >> >> + } >> >> + >> >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> >> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); >> >> + >> >> +done: >> >> + if (ret > 0) { >> >> + fsnotify_access(file_in); >> >> + add_rchar(current, ret); >> >> + fsnotify_modify(file_out); >> >> + add_wchar(current, ret); >> >> + } >> >> + >> >> + inc_syscr(current); >> >> + inc_syscw(current); >> >> + >> >> + file_end_write(file_out); >> >> + >> >> + return ret; >> >> +} >> >> +EXPORT_SYMBOL(vfs_copy_file_range64); >> >> + >> >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, >> >> + int, fd_out, loff_t __user *, off_out, >> >> + u64, len, unsigned int, flags) >> >> +{ >> >> + loff_t pos_in; >> >> + loff_t pos_out; >> >> + struct fd f_in; >> >> + struct fd f_out; >> >> + u64 ret = -EBADF; >> >> + >> >> + f_in = fdget(fd_in); >> >> + if (!f_in.file) >> >> + goto out2; >> >> + >> >> + f_out = fdget(fd_out); >> >> + if (!f_out.file) >> >> + goto out1; >> >> + >> >> + ret = -EFAULT; >> >> + if (off_in) { >> >> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) >> >> + goto out; >> >> + } else { >> >> + pos_in = f_in.file->f_pos; >> >> + } >> >> + >> >> + if (off_out) { >> >> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) >> >> + goto out; >> >> + } else { >> >> + pos_out = f_out.file->f_pos; >> >> + } >> >> + >> >> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, >> >> + flags); >> >> + if (ret > 0) { >> >> + pos_in += ret; >> >> + pos_out += ret; >> >> + >> >> + if (off_in) { >> >> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) >> >> + ret = -EFAULT; >> >> + } else { >> >> + f_in.file->f_pos = pos_in; >> >> + } >> >> + >> >> + if (off_out) { >> >> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) >> >> + ret = -EFAULT; >> >> + } else { >> >> + f_out.file->f_pos = pos_out; >> >> + } >> >> + } >> >> + >> >> +out: >> >> + fdput(f_out); >> >> +out1: >> >> + fdput(f_in); >> >> +out2: >> >> + return ret; >> >> +} >> >> + >> >> /* >> >> * Check that the two inodes are eligible for cloning, the ranges make >> >> * sense, and then flush all dirty data. Caller must ensure that the >> >> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, >> >> if (!file_in->f_op->clone_file_range) >> >> return -EOPNOTSUPP; >> >> >> >> - ret = clone_verify_area(file_in, pos_in, len, false); >> >> + ret = rw64_verify_area(file_in, pos_in, len, false); >> >> if (ret) >> >> return ret; >> >> >> >> - ret = clone_verify_area(file_out, pos_out, len, true); >> >> + ret = rw64_verify_area(file_out, pos_out, len, true); >> >> if (ret) >> >> return ret; >> >> >> >> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) >> >> if (!S_ISREG(src->i_mode)) >> >> goto out; >> >> >> >> - ret = clone_verify_area(file, off, len, false); >> >> + ret = rw64_verify_area(file, off, len, false); >> >> if (ret < 0) >> >> goto out; >> >> ret = 0; >> >> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) >> >> } >> >> >> >> dst_off = info->dest_offset; >> >> - ret = clone_verify_area(dst_file, dst_off, len, true); >> >> + ret = rw64_verify_area(dst_file, dst_off, len, true); >> >> if (ret < 0) { >> >> info->status = ret; >> >> goto next_file; >> >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> >> index 803e5a9..41fce43 100644 >> >> --- a/include/linux/fs.h >> >> +++ b/include/linux/fs.h >> >> @@ -1690,6 +1690,8 @@ struct file_operations { >> >> u64); >> >> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, >> >> u64); >> >> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, >> >> + loff_t, u64, unsigned int); >> >> }; >> >> >> >> struct inode_operations { >> >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, >> >> loff_t len, bool *is_same); >> >> extern int vfs_dedupe_file_range(struct file *file, >> >> struct file_dedupe_range *same); >> >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, >> >> + loff_t, u64, unsigned int); >> >> >> >> struct super_operations { >> >> struct inode *(*alloc_inode)(struct super_block *sb); >> >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> >> index 980c3c9..f7ea78e 100644 >> >> --- a/include/linux/syscalls.h >> >> +++ b/include/linux/syscalls.h >> >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, >> >> asmlinkage long sys_pkey_free(int pkey); >> >> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, >> >> unsigned mask, struct statx __user *buffer); >> >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, >> >> + int fd_out, loff_t __user *off_out, >> >> + u64 len, unsigned int flags); >> >> >> >> #endif >> >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h >> >> index 061185a..e385a76 100644 >> >> --- a/include/uapi/asm-generic/unistd.h >> >> +++ b/include/uapi/asm-generic/unistd.h >> >> @@ -731,9 +731,11 @@ >> >> __SYSCALL(__NR_pkey_free, sys_pkey_free) >> >> #define __NR_statx 291 >> >> __SYSCALL(__NR_statx, sys_statx) >> >> +#define __NR_copy_file_range64 292 >> >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) >> >> >> >> #undef __NR_syscalls >> >> -#define __NR_syscalls 292 >> >> +#define __NR_syscalls 293 >> >> >> >> /* >> >> * All syscalls below here should go away really, >> >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> >> index 8acef85..8e0cfd4 100644 >> >> --- a/kernel/sys_ni.c >> >> +++ b/kernel/sys_ni.c >> >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) >> >> cond_syscall(sys_capget); >> >> cond_syscall(sys_capset); >> >> cond_syscall(sys_copy_file_range); >> >> +cond_syscall(sys_copy_file_range64); >> >> >> >> /* arch-specific weak syscall entries */ >> >> cond_syscall(sys_pciconfig_read); >> >> -- >> >> 1.8.3.1 >> >> >> > -- >> > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in >> > the body of a message to majordomo@vger.kernel.org >> > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-19 18:34 ` Olga Kornievskaia @ 2017-06-19 19:39 ` Christoph Hellwig 2017-06-19 19:48 ` Olga Kornievskaia ` (3 more replies) 0 siblings, 4 replies; 35+ messages in thread From: Christoph Hellwig @ 2017-06-19 19:39 UTC (permalink / raw) To: Olga Kornievskaia Cc: Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote: > Yes NFS community does need one for doing a server-to-server copy > (performance) feature. s/NFS community/NetApp/ ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-19 19:39 ` Christoph Hellwig @ 2017-06-19 19:48 ` Olga Kornievskaia 2017-06-20 12:38 ` Steve Dickson ` (2 subsequent siblings) 3 siblings, 0 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-19 19:48 UTC (permalink / raw) To: Christoph Hellwig Cc: Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Mon, Jun 19, 2017 at 3:39 PM, Christoph Hellwig <hch@infradead.org> wrote: > On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote: >> Yes NFS community does need one for doing a server-to-server copy >> (performance) feature. > > s/NFS community/NetApp/ NetApp = users. Is there another reason for having a feature in the kernel if not for the users? ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-19 19:39 ` Christoph Hellwig 2017-06-19 19:48 ` Olga Kornievskaia @ 2017-06-20 12:38 ` Steve Dickson 2017-06-20 19:33 ` Chuck Lever 2017-06-24 11:29 ` Mkrtchyan, Tigran 3 siblings, 0 replies; 35+ messages in thread From: Steve Dickson @ 2017-06-20 12:38 UTC (permalink / raw) To: Christoph Hellwig, Olga Kornievskaia Cc: Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs Hello, On 06/19/2017 03:39 PM, Christoph Hellwig wrote: > On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote: >> Yes NFS community does need one for doing a server-to-server copy >> (performance) feature. > > s/NFS community/NetApp/ Wrong... Red Hat is also very interested in this work being that this support is in both Fedora and upcoming RHEL releases... I must admin I just don't understand what there is *any* push back on a feature that would dramatically increase the I/O speed of NFS. That would be good for the *entire* Linux community... IMHO. I'm coming to the party late... So is the only sticking point is to have this vfs call write to a different filesystem, correct? Why? (please point to a email that already explains it) Does that break some type of boundary or is it because it was never needed before?? Again, looking at the big picture... Allowing NFS to copy files by only sending meta-data is HUGE! It is such a win for the entire community so I'm so surprise this community is not trying to help the process instead of push it back. my two cents, steved. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-19 19:39 ` Christoph Hellwig 2017-06-19 19:48 ` Olga Kornievskaia 2017-06-20 12:38 ` Steve Dickson @ 2017-06-20 19:33 ` Chuck Lever 2017-06-20 19:37 ` Olga Kornievskaia 2017-06-24 11:29 ` Mkrtchyan, Tigran 3 siblings, 1 reply; 35+ messages in thread From: Chuck Lever @ 2017-06-20 19:33 UTC (permalink / raw) To: Christoph Hellwig Cc: Olga Kornievskaia, Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, Linux NFS Mailing List > On Jun 19, 2017, at 3:39 PM, Christoph Hellwig <hch@infradead.org> wrote: > > On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote: >> Yes NFS community does need one for doing a server-to-server copy >> (performance) feature. > > s/NFS community/NetApp/ I don't have an opinion on the API details, but Oracle is interested in server-to-server copy, especially if there is an implementation of it in the Linux NFS server. -- Chuck Lever ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-20 19:33 ` Chuck Lever @ 2017-06-20 19:37 ` Olga Kornievskaia 0 siblings, 0 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-20 19:37 UTC (permalink / raw) To: Chuck Lever Cc: Christoph Hellwig, Darrick J. Wong, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, Linux NFS Mailing List On Tue, Jun 20, 2017 at 3:33 PM, Chuck Lever <chuck.lever@oracle.com> wrote: > >> On Jun 19, 2017, at 3:39 PM, Christoph Hellwig <hch@infradead.org> wrote: >> >> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote: >>> Yes NFS community does need one for doing a server-to-server copy >>> (performance) feature. >> >> s/NFS community/NetApp/ > > I don't have an opinion on the API details, but Oracle is interested > in server-to-server copy, especially if there is an implementation > of it in the Linux NFS server. The proposed patch set is the linux client AND linux server. We need removal of the VFS cross-device check on both side (client and destination server) to make the server-to-server copy work. On the server side, we also call vfs_copy_file_range() that (when check is removed) ends up reading from the NFS source file and writing to the local file system. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-19 19:39 ` Christoph Hellwig ` (2 preceding siblings ...) 2017-06-20 19:33 ` Chuck Lever @ 2017-06-24 11:29 ` Mkrtchyan, Tigran 3 siblings, 0 replies; 35+ messages in thread From: Mkrtchyan, Tigran @ 2017-06-24 11:29 UTC (permalink / raw) To: Christoph Hellwig Cc: Olga Kornievskaia, Darrick J. Wong, Olga Kornievskaia, linux-fsdevel, linux-nfs ----- Original Message ----- > From: "Christoph Hellwig" <hch@infradead.org> > To: "Olga Kornievskaia" <aglo@umich.edu> > Cc: "Darrick J. Wong" <darrick.wong@oracle.com>, "Olga Kornievskaia" <kolga@netapp.com>, linux-fsdevel@vger.kernel.org, > "linux-nfs" <linux-nfs@vger.kernel.org> > Sent: Monday, June 19, 2017 9:39:03 PM > Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call > On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote: >> Yes NFS community does need one for doing a server-to-server copy >> (performance) feature. > > s/NFS community/NetApp/ As being probably the first person who have raised the need in server-to-server copy in NFS (see slide 15 http://nfsv4bat.org/Documents/ConnectAThon/2008/managedstorage.pdf), let me disagree with you. Having end-user hat on me I can say, that over a decade we are moving scientific data around the world with server-side-copy, or 3rd-party copy as we call it. While we more-and-more use NFS (pNFS) for data access, we always need a second protocol, like FTP or WebDAV, to perform efficient data movement. I am not deep in kernel internals, but there must be a very good reason to block such functionality in NFS. Such functionality will drastically simplify our workflow. And I am sure, we are not the only users of it. Tigran. > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 20:53 ` Darrick J. Wong 2017-06-19 18:34 ` Olga Kornievskaia @ 2017-06-20 12:44 ` Steve Dickson 1 sibling, 0 replies; 35+ messages in thread From: Steve Dickson @ 2017-06-20 12:44 UTC (permalink / raw) To: Darrick J. Wong, Olga Kornievskaia Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On 06/15/2017 04:53 PM, Darrick J. Wong wrote: > On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote: >> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong >> <darrick.wong@oracle.com> wrote: >>> On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote: >>>> This is a proposal to allow 64bit count to copy and return as >>>> a result of a copy_file_range. No attempt was made to share code >>>> with the 32bit function because 32bit interface should probably >>>> get depreciated. >>>> >>>> Why use 64bit? Current uses of 32bit are by clone_file_range() >>>> which could use 64bit count and NFS copy_file_range also supports >>>> 64bit count value. >>>> >>>> Also with this proposal off-the-bet allow the userland to copy >>>> between different mount points. >>>> >>>> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >>>> --- >>>> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >>>> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >>>> fs/read_write.c | 146 +++++++++++++++++++++++++++++++-- >>>> include/linux/fs.h | 4 + >>>> include/linux/syscalls.h | 3 + >>>> include/uapi/asm-generic/unistd.h | 4 +- >>>> kernel/sys_ni.c | 1 + >>>> 7 files changed, 154 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >>>> index 448ac21..1f5f1e7 100644 >>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl >>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl >>>> @@ -391,3 +391,4 @@ >>>> 382 i386 pkey_free sys_pkey_free >>>> 383 i386 statx sys_statx >>>> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >>>> +385 i386 copy_file_range64 sys_copy_file_range64 >>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >>>> index 5aef183..e658bbe 100644 >>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl >>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >>>> @@ -339,6 +339,7 @@ >>>> 330 common pkey_alloc sys_pkey_alloc >>>> 331 common pkey_free sys_pkey_free >>>> 332 common statx sys_statx >>>> +333 64 copy_file_range64 sys_copy_file_range64 >>>> >>>> # >>>> # x32-specific system call numbers start at 512 to avoid cache impact >>>> diff --git a/fs/read_write.c b/fs/read_write.c >>>> index e3ee985..c9c072b 100644 >>>> --- a/fs/read_write.c >>>> +++ b/fs/read_write.c >>>> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >>>> return ret; >>>> } >>>> >>>> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >>>> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write) >>>> { >>>> struct inode *inode = file_inode(file); >>>> >>>> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >>>> return security_file_permission(file, write ? MAY_WRITE : MAY_READ); >>>> } >>>> >>>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >>>> + struct file *file_out, loff_t pos_out, >>>> + u64 len, unsigned int flags) >>> >>> AFAICT the difference between vfs_copy_file_range() and >>> vfs_copy_file_range64() is that you've replaced 'size_t len' with >>> 'u64 len', correct? >> >> That and I'm asking to allow for cross-device copy to be allowed by >> the system call. > > I was fine letting each fs decide if it was going to allow cross-device > copy or not, but Christoph argued that since we don't generally allow > operations across mountpoints we shouldn't allow cross-mountpoint > {clone,copy}_file_range either. > > Roughly translated, XFS has no cross-device copy abilities, there aren't > any plans to add it, so I don't feel motivated to argue for it... but if > the nfs community is so motivated (it sounds like it) then y'all could > try one more time. > So since XFS (or any other local fs) does not have this ability, NFS does not needed it?? That can't be the reason for the push back that is denying NFS this ability.. steved. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia 2017-06-14 17:24 ` Anna Schumaker @ 2017-06-15 8:15 ` Christoph Hellwig 2017-06-15 13:07 ` Olga Kornievskaia 2017-06-24 13:23 ` Jeff Layton 2 siblings, 1 reply; 35+ messages in thread From: Christoph Hellwig @ 2017-06-15 8:15 UTC (permalink / raw) To: Olga Kornievskaia; +Cc: linux-fsdevel, linux-nfs On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. Please provide a very good use case that actually matters to a large portion of the Linux users. > Also with this proposal off-the-bet allow the userland to copy > between different mount points. Totally different issue that we've discussed N times. Trying to sneak it in behind peoples backs isn't going to improve the situation in any way. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 8:15 ` Christoph Hellwig @ 2017-06-15 13:07 ` Olga Kornievskaia 2017-06-15 18:35 ` Andreas Dilger 0 siblings, 1 reply; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-15 13:07 UTC (permalink / raw) To: Christoph Hellwig Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. > > Please provide a very good use case that actually matters to a large > portion of the Linux users. Reason: it will not hurt any user but it will help for server-to-server NFS copy because for each invocation of of copy_file_range() it requires that the client sends COPY_NOTIFY, then COPY and then a copy triggers a establishment of clientid/session before the reading of the source file can happen. All that could be saved by have a 64bit value. >> Also with this proposal off-the-bet allow the userland to copy >> between different mount points. > > Totally different issue that we've discussed N times. Trying to sneak > it in behind peoples backs isn't going to improve the situation in any way. I would think that sneaking would have been to remove the check and not mention it in the commit description. I have explicitly brought the topic up in the commit description to bring the attention to the issue as a way to restart the conversation about the issue. I have several times have asked for the reason to why the check can not be removed. I'm asking again can you please explain why. > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 13:07 ` Olga Kornievskaia @ 2017-06-15 18:35 ` Andreas Dilger 2017-06-15 19:11 ` Olga Kornievskaia 2017-06-16 17:36 ` J. Bruce Fields 0 siblings, 2 replies; 35+ messages in thread From: Andreas Dilger @ 2017-06-15 18:35 UTC (permalink / raw) To: Olga Kornievskaia Cc: Christoph Hellwig, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs [-- Attachment #1: Type: text/plain, Size: 1733 bytes --] > On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote: > > On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote: >> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: >>> This is a proposal to allow 64bit count to copy and return as >>> a result of a copy_file_range. No attempt was made to share code >>> with the 32bit function because 32bit interface should probably >>> get depreciated. >>> >>> Why use 64bit? Current uses of 32bit are by clone_file_range() >>> which could use 64bit count and NFS copy_file_range also supports >>> 64bit count value. >> >> Please provide a very good use case that actually matters to a large >> portion of the Linux users. > > Reason: it will not hurt any user but it will help for > server-to-server NFS copy because for each invocation of of > copy_file_range() it requires that the client sends COPY_NOTIFY, then > COPY and then a copy triggers a establishment of clientid/session > before the reading of the source file can happen. All that could be > saved by have a 64bit value. What is the overhead of sending a couple of RPCs vs. copying 4GB of data on the server? > Current copy_file_range would work for any file size, it would just > need to be called in a loop by the application. With the given > proposal, there wouldn't be a need for the application to loop due to > the API size limitation. It might still loop if a partial copy was > done. Given that there always needs to be a loop to handle partial completion, adding the 64-bit syscall doesn't really simplify the application at all, but adds duplicate code to the kernel for little benefit. Cheers, Andreas [-- Attachment #2: Message signed with OpenPGP --] [-- Type: application/pgp-signature, Size: 195 bytes --] ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 18:35 ` Andreas Dilger @ 2017-06-15 19:11 ` Olga Kornievskaia 2017-06-16 17:36 ` J. Bruce Fields 1 sibling, 0 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-15 19:11 UTC (permalink / raw) To: Andreas Dilger Cc: Christoph Hellwig, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Thu, Jun 15, 2017 at 2:35 PM, Andreas Dilger <adilger@dilger.ca> wrote: > >> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote: >> >> On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <hch@infradead.org> wrote: >>> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote: >>>> This is a proposal to allow 64bit count to copy and return as >>>> a result of a copy_file_range. No attempt was made to share code >>>> with the 32bit function because 32bit interface should probably >>>> get depreciated. >>>> >>>> Why use 64bit? Current uses of 32bit are by clone_file_range() >>>> which could use 64bit count and NFS copy_file_range also supports >>>> 64bit count value. >>> >>> Please provide a very good use case that actually matters to a large >>> portion of the Linux users. >> >> Reason: it will not hurt any user but it will help for >> server-to-server NFS copy because for each invocation of of >> copy_file_range() it requires that the client sends COPY_NOTIFY, then >> COPY and then a copy triggers a establishment of clientid/session >> before the reading of the source file can happen. All that could be >> saved by have a 64bit value. > > What is the overhead of sending a couple of RPCs vs. copying 4GB of > data on the server? We get about 11% improvement having an interface that removes the need for multiple calls. That's testing 8gb copy that normally would have needed 2 copies. >> Current copy_file_range would work for any file size, it would just >> need to be called in a loop by the application. With the given >> proposal, there wouldn't be a need for the application to loop due to >> the API size limitation. It might still loop if a partial copy was >> done. > > > Given that there always needs to be a loop to handle partial completion, > adding the 64-bit syscall doesn't really simplify the application at > all, but adds duplicate code to the kernel for little benefit. If 32bit version would be deprecated then there won't be duplicate code. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-15 18:35 ` Andreas Dilger 2017-06-15 19:11 ` Olga Kornievskaia @ 2017-06-16 17:36 ` J. Bruce Fields 2017-06-17 10:03 ` Christoph Hellwig 1 sibling, 1 reply; 35+ messages in thread From: J. Bruce Fields @ 2017-06-16 17:36 UTC (permalink / raw) To: Andreas Dilger Cc: Olga Kornievskaia, Christoph Hellwig, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Thu, Jun 15, 2017 at 12:35:29PM -0600, Andreas Dilger wrote: > > > On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <aglo@umich.edu> wrote: > > Reason: it will not hurt any user but it will help for > > server-to-server NFS copy because for each invocation of of > > copy_file_range() it requires that the client sends COPY_NOTIFY, then > > COPY and then a copy triggers a establishment of clientid/session > > before the reading of the source file can happen. All that could be > > saved by have a 64bit value. > > What is the overhead of sending a couple of RPCs vs. copying 4GB of > data on the server? It makes a difference in the clone case, doesn't it? (Though I'd think best there would be a special "copy to end of file" value.) --b. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-16 17:36 ` J. Bruce Fields @ 2017-06-17 10:03 ` Christoph Hellwig 2017-06-17 12:42 ` Olga Kornievskaia 0 siblings, 1 reply; 35+ messages in thread From: Christoph Hellwig @ 2017-06-17 10:03 UTC (permalink / raw) To: J. Bruce Fields Cc: Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: > It makes a difference in the clone case, doesn't it? (Though I'd think > best there would be a special "copy to end of file" value.) Clones uses a 0 length as "whole" file. Both for the NFS operation and the Linux ioctl. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-17 10:03 ` Christoph Hellwig @ 2017-06-17 12:42 ` Olga Kornievskaia 2017-06-17 15:09 ` Christoph Hellwig 0 siblings, 1 reply; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-17 12:42 UTC (permalink / raw) To: Christoph Hellwig Cc: J. Bruce Fields, Andreas Dilger, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: >> It makes a difference in the clone case, doesn't it? (Though I'd think >> best there would be a special "copy to end of file" value.) > > Clones uses a 0 length as "whole" file. Both for the NFS operation > and the Linux ioctl. Does that actually work? There is check vfs_copy_file_range() if (len == 0) return 0; ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-17 12:42 ` Olga Kornievskaia @ 2017-06-17 15:09 ` Christoph Hellwig 2017-06-17 18:24 ` J. Bruce Fields 0 siblings, 1 reply; 35+ messages in thread From: Christoph Hellwig @ 2017-06-17 15:09 UTC (permalink / raw) To: Olga Kornievskaia Cc: Christoph Hellwig, J. Bruce Fields, Andreas Dilger, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote: > > > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > > > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: > >> It makes a difference in the clone case, doesn't it? (Though I'd think > >> best there would be a special "copy to end of file" value.) > > > > Clones uses a 0 length as "whole" file. Both for the NFS operation > > and the Linux ioctl. > > Does that actually work? There is check vfs_copy_file_range() Please re-read the above sentence. I'm talking about NFS clone and IOC_CLONE_RANGE, not copy_file_range. copy_file_range had to follow the NFS semantics that don't have the special 0 case. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-17 15:09 ` Christoph Hellwig @ 2017-06-17 18:24 ` J. Bruce Fields 0 siblings, 0 replies; 35+ messages in thread From: J. Bruce Fields @ 2017-06-17 18:24 UTC (permalink / raw) To: Christoph Hellwig Cc: Olga Kornievskaia, Andreas Dilger, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Sat, Jun 17, 2017 at 08:09:38AM -0700, Christoph Hellwig wrote: > On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote: > > > > > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > > > > > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote: > > >> It makes a difference in the clone case, doesn't it? (Though I'd think > > >> best there would be a special "copy to end of file" value.) > > > > > > Clones uses a 0 length as "whole" file. Both for the NFS operation > > > and the Linux ioctl. > > > > Does that actually work? There is check vfs_copy_file_range() > > Please re-read the above sentence. I'm talking about NFS clone and > IOC_CLONE_RANGE, not copy_file_range. copy_file_range had to follow > the NFS semantics that don't have the special 0 case. Nope, COPY has it: https://tools.ietf.org/html/rfc7862#section-15.2.3 A count of 0 (zero) requests that all bytes from ca_src_offset through EOF be copied to the destination. I think leaving it out of copy_file_range was just an oversight. --b. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia 2017-06-14 17:24 ` Anna Schumaker 2017-06-15 8:15 ` Christoph Hellwig @ 2017-06-24 13:23 ` Jeff Layton 2017-06-26 13:21 ` Anna Schumaker 2 siblings, 1 reply; 35+ messages in thread From: Jeff Layton @ 2017-06-24 13:23 UTC (permalink / raw) To: Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: > This is a proposal to allow 64bit count to copy and return as > a result of a copy_file_range. No attempt was made to share code > with the 32bit function because 32bit interface should probably > get depreciated. > > Why use 64bit? Current uses of 32bit are by clone_file_range() > which could use 64bit count and NFS copy_file_range also supports > 64bit count value. > > Also with this proposal off-the-bet allow the userland to copy > between different mount points. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > include/linux/fs.h | 4 + > include/linux/syscalls.h | 3 + > include/uapi/asm-generic/unistd.h | 4 +- > kernel/sys_ni.c | 1 + > 7 files changed, 149 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 448ac21..1f5f1e7 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -391,3 +391,4 @@ > 382 i386 pkey_free sys_pkey_free > 383 i386 statx sys_statx > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > +385 i386 copy_file_range64 sys_copy_file_range64 > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 5aef183..e658bbe 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -339,6 +339,7 @@ > 330 common pkey_alloc sys_pkey_alloc > 331 common pkey_free sys_pkey_free > 332 common statx sys_statx > +333 64 copy_file_range64 sys_copy_file_range64 > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/fs/read_write.c b/fs/read_write.c > index 47c1d44..e2665c1 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > return ret; > } > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > + struct file *file_out, loff_t pos_out, > + u64 len, unsigned int flags) > +{ > + struct inode *inode_in = file_inode(file_in); > + struct inode *inode_out = file_inode(file_out); > + u64 ret; > + > + if (flags != 0) > + return -EINVAL; > + > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > + return -EISDIR; > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > + return -EINVAL; > + > + ret = rw_verify_area(READ, file_in, &pos_in, len); > + if (unlikely(ret)) > + return ret; > + > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > + if (unlikely(ret)) > + return ret; > + > + if (!(file_in->f_mode & FMODE_READ) || > + !(file_out->f_mode & FMODE_WRITE) || > + (file_out->f_flags & O_APPEND)) > + return -EBADF; > + > + if (len == 0) > + return 0; > + > + file_start_write(file_out); > + > + /* > + * Try cloning first, this is supported by more file systems, and > + * more efficient if both clone and copy are supported (e.g. NFS). > + */ > + if (file_in->f_op->clone_file_range) { > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > + file_out, pos_out, len); > + if (ret == 0) { > + ret = len; > + goto done; > + } > + } > + > + if (file_out->f_op->copy_file_range64) { > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > + file_out, pos_out, len, flags); > + if (ret != -EOPNOTSUPP) > + goto done; > + } > + > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > + > +done: > + if (ret > 0) { > + fsnotify_access(file_in); > + add_rchar(current, ret); > + fsnotify_modify(file_out); > + add_wchar(current, ret); > + } > + > + inc_syscr(current); > + inc_syscw(current); > + > + file_end_write(file_out); > + > + return ret; > +} > +EXPORT_SYMBOL(vfs_copy_file_range64); > + > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > + int, fd_out, loff_t __user *, off_out, > + u64, len, unsigned int, flags) > +{ > + loff_t pos_in; > + loff_t pos_out; > + struct fd f_in; > + struct fd f_out; > + u64 ret = -EBADF; > + > + f_in = fdget(fd_in); > + if (!f_in.file) > + goto out2; > + > + f_out = fdget(fd_out); > + if (!f_out.file) > + goto out1; > + > + ret = -EFAULT; > + if (off_in) { > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > + goto out; > + } else { > + pos_in = f_in.file->f_pos; > + } > + > + if (off_out) { > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > + goto out; > + } else { > + pos_out = f_out.file->f_pos; > + } > + > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > + flags); > + if (ret > 0) { > + pos_in += ret; > + pos_out += ret; > + > + if (off_in) { > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_in.file->f_pos = pos_in; > + } > + > + if (off_out) { > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > + ret = -EFAULT; > + } else { > + f_out.file->f_pos = pos_out; > + } > + } > + > +out: > + fdput(f_out); > +out1: > + fdput(f_in); > +out2: > + return ret; > +} > + > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > { > struct inode *inode = file_inode(file); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 803e5a9..2727a98 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1690,6 +1690,8 @@ struct file_operations { > u64); > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > u64); > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > }; > > struct inode_operations { > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > loff_t len, bool *is_same); > extern int vfs_dedupe_file_range(struct file *file, > struct file_dedupe_range *same); > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > + loff_t, u64, unsigned int); > > struct super_operations { > struct inode *(*alloc_inode)(struct super_block *sb); > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9..f7ea78e 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > asmlinkage long sys_pkey_free(int pkey); > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > unsigned mask, struct statx __user *buffer); > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > + int fd_out, loff_t __user *off_out, > + u64 len, unsigned int flags); > > #endif > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 061185a..e385a76 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -731,9 +731,11 @@ > __SYSCALL(__NR_pkey_free, sys_pkey_free) > #define __NR_statx 291 > __SYSCALL(__NR_statx, sys_statx) > +#define __NR_copy_file_range64 292 > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > #undef __NR_syscalls > -#define __NR_syscalls 292 > +#define __NR_syscalls 293 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 8acef85..8e0cfd4 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > cond_syscall(sys_capget); > cond_syscall(sys_capset); > cond_syscall(sys_copy_file_range); > +cond_syscall(sys_copy_file_range64); > > /* arch-specific weak syscall entries */ > cond_syscall(sys_pciconfig_read); Hi Olga, We discussed this a bit privately, but I'll note it here too. Server-to-server copy seems like a nice thing to me as well. There are several filesystems that can likely make use of that ability. I don't have a real opinion on the API change either, but you're making a very subtle change with this patch. Prior to this, the existing copy_file_range calls could count on the superblocks being the same. Now, it looks like you can pass them any old file pointer. The exsiting copy_file_range file ops need to be fixed to vet the struct files they've been handed if you want to do this. It would also be nice to see something on copy_file_range and clone_file_range in Documentation/filesystems/vfs.txt. Cheers, -- Jeff Layton <jlayton@redhat.com> ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-24 13:23 ` Jeff Layton @ 2017-06-26 13:21 ` Anna Schumaker 2017-06-26 13:46 ` Jeff Layton 0 siblings, 1 reply; 35+ messages in thread From: Anna Schumaker @ 2017-06-26 13:21 UTC (permalink / raw) To: Jeff Layton, Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs On 06/24/2017 09:23 AM, Jeff Layton wrote: > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: >> This is a proposal to allow 64bit count to copy and return as >> a result of a copy_file_range. No attempt was made to share code >> with the 32bit function because 32bit interface should probably >> get depreciated. >> >> Why use 64bit? Current uses of 32bit are by clone_file_range() >> which could use 64bit count and NFS copy_file_range also supports >> 64bit count value. >> >> Also with this proposal off-the-bet allow the userland to copy >> between different mount points. >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> --- >> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ >> include/linux/fs.h | 4 + >> include/linux/syscalls.h | 3 + >> include/uapi/asm-generic/unistd.h | 4 +- >> kernel/sys_ni.c | 1 + >> 7 files changed, 149 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >> index 448ac21..1f5f1e7 100644 >> --- a/arch/x86/entry/syscalls/syscall_32.tbl >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl >> @@ -391,3 +391,4 @@ >> 382 i386 pkey_free sys_pkey_free >> 383 i386 statx sys_statx >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >> +385 i386 copy_file_range64 sys_copy_file_range64 >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> index 5aef183..e658bbe 100644 >> --- a/arch/x86/entry/syscalls/syscall_64.tbl >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> @@ -339,6 +339,7 @@ >> 330 common pkey_alloc sys_pkey_alloc >> 331 common pkey_free sys_pkey_free >> 332 common statx sys_statx >> +333 64 copy_file_range64 sys_copy_file_range64 >> >> # >> # x32-specific system call numbers start at 512 to avoid cache impact >> diff --git a/fs/read_write.c b/fs/read_write.c >> index 47c1d44..e2665c1 100644 >> --- a/fs/read_write.c >> +++ b/fs/read_write.c >> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> return ret; >> } >> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >> + struct file *file_out, loff_t pos_out, >> + u64 len, unsigned int flags) >> +{ >> + struct inode *inode_in = file_inode(file_in); >> + struct inode *inode_out = file_inode(file_out); >> + u64 ret; >> + >> + if (flags != 0) >> + return -EINVAL; >> + >> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) >> + return -EISDIR; >> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) >> + return -EINVAL; >> + >> + ret = rw_verify_area(READ, file_in, &pos_in, len); >> + if (unlikely(ret)) >> + return ret; >> + >> + ret = rw_verify_area(WRITE, file_out, &pos_out, len); >> + if (unlikely(ret)) >> + return ret; >> + >> + if (!(file_in->f_mode & FMODE_READ) || >> + !(file_out->f_mode & FMODE_WRITE) || >> + (file_out->f_flags & O_APPEND)) >> + return -EBADF; >> + >> + if (len == 0) >> + return 0; >> + >> + file_start_write(file_out); >> + >> + /* >> + * Try cloning first, this is supported by more file systems, and >> + * more efficient if both clone and copy are supported (e.g. NFS). >> + */ >> + if (file_in->f_op->clone_file_range) { >> + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> + file_out, pos_out, len); >> + if (ret == 0) { >> + ret = len; >> + goto done; >> + } >> + } >> + >> + if (file_out->f_op->copy_file_range64) { >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> + file_out, pos_out, len, flags); >> + if (ret != -EOPNOTSUPP) >> + goto done; >> + } >> + >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); >> + >> +done: >> + if (ret > 0) { >> + fsnotify_access(file_in); >> + add_rchar(current, ret); >> + fsnotify_modify(file_out); >> + add_wchar(current, ret); >> + } >> + >> + inc_syscr(current); >> + inc_syscw(current); >> + >> + file_end_write(file_out); >> + >> + return ret; >> +} >> +EXPORT_SYMBOL(vfs_copy_file_range64); >> + >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, >> + int, fd_out, loff_t __user *, off_out, >> + u64, len, unsigned int, flags) >> +{ >> + loff_t pos_in; >> + loff_t pos_out; >> + struct fd f_in; >> + struct fd f_out; >> + u64 ret = -EBADF; >> + >> + f_in = fdget(fd_in); >> + if (!f_in.file) >> + goto out2; >> + >> + f_out = fdget(fd_out); >> + if (!f_out.file) >> + goto out1; >> + >> + ret = -EFAULT; >> + if (off_in) { >> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) >> + goto out; >> + } else { >> + pos_in = f_in.file->f_pos; >> + } >> + >> + if (off_out) { >> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) >> + goto out; >> + } else { >> + pos_out = f_out.file->f_pos; >> + } >> + >> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, >> + flags); >> + if (ret > 0) { >> + pos_in += ret; >> + pos_out += ret; >> + >> + if (off_in) { >> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) >> + ret = -EFAULT; >> + } else { >> + f_in.file->f_pos = pos_in; >> + } >> + >> + if (off_out) { >> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) >> + ret = -EFAULT; >> + } else { >> + f_out.file->f_pos = pos_out; >> + } >> + } >> + >> +out: >> + fdput(f_out); >> +out1: >> + fdput(f_in); >> +out2: >> + return ret; >> +} >> + >> static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> { >> struct inode *inode = file_inode(file); >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index 803e5a9..2727a98 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -1690,6 +1690,8 @@ struct file_operations { >> u64); >> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, >> u64); >> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, >> + loff_t, u64, unsigned int); >> }; >> >> struct inode_operations { >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, >> loff_t len, bool *is_same); >> extern int vfs_dedupe_file_range(struct file *file, >> struct file_dedupe_range *same); >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, >> + loff_t, u64, unsigned int); >> >> struct super_operations { >> struct inode *(*alloc_inode)(struct super_block *sb); >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> index 980c3c9..f7ea78e 100644 >> --- a/include/linux/syscalls.h >> +++ b/include/linux/syscalls.h >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, >> asmlinkage long sys_pkey_free(int pkey); >> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, >> unsigned mask, struct statx __user *buffer); >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, >> + int fd_out, loff_t __user *off_out, >> + u64 len, unsigned int flags); >> >> #endif >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h >> index 061185a..e385a76 100644 >> --- a/include/uapi/asm-generic/unistd.h >> +++ b/include/uapi/asm-generic/unistd.h >> @@ -731,9 +731,11 @@ >> __SYSCALL(__NR_pkey_free, sys_pkey_free) >> #define __NR_statx 291 >> __SYSCALL(__NR_statx, sys_statx) >> +#define __NR_copy_file_range64 292 >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) >> >> #undef __NR_syscalls >> -#define __NR_syscalls 292 >> +#define __NR_syscalls 293 >> >> /* >> * All syscalls below here should go away really, >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> index 8acef85..8e0cfd4 100644 >> --- a/kernel/sys_ni.c >> +++ b/kernel/sys_ni.c >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) >> cond_syscall(sys_capget); >> cond_syscall(sys_capset); >> cond_syscall(sys_copy_file_range); >> +cond_syscall(sys_copy_file_range64); >> >> /* arch-specific weak syscall entries */ >> cond_syscall(sys_pciconfig_read); > > Hi Olga, > > We discussed this a bit privately, but I'll note it here too. > > Server-to-server copy seems like a nice thing to me as well. There are > several filesystems that can likely make use of that ability. > > I don't have a real opinion on the API change either, but you're making > a very subtle change with this patch. Prior to this, the existing > copy_file_range calls could count on the superblocks being the same. > Now, it looks like you can pass them any old file pointer. What if we add a new file op for xdev copy that gets called when superblocks are different, but filesystem type is the same? We could keep the current copy_file_range fop to fall back on if superblocks are the same. Thoughts? Anna > > The exsiting copy_file_range file ops need to be fixed to vet the struct > files they've been handed if you want to do this. > > It would also be nice to see something on copy_file_range and > clone_file_range in Documentation/filesystems/vfs.txt. > > Cheers, > ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-26 13:21 ` Anna Schumaker @ 2017-06-26 13:46 ` Jeff Layton 2017-06-26 14:46 ` Olga Kornievskaia [not found] ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com> 0 siblings, 2 replies; 35+ messages in thread From: Jeff Layton @ 2017-06-26 13:46 UTC (permalink / raw) To: Anna Schumaker, Olga Kornievskaia, linux-fsdevel; +Cc: linux-nfs On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote: > > On 06/24/2017 09:23 AM, Jeff Layton wrote: > > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: > > > This is a proposal to allow 64bit count to copy and return as > > > a result of a copy_file_range. No attempt was made to share code > > > with the 32bit function because 32bit interface should probably > > > get depreciated. > > > > > > Why use 64bit? Current uses of 32bit are by clone_file_range() > > > which could use 64bit count and NFS copy_file_range also supports > > > 64bit count value. > > > > > > Also with this proposal off-the-bet allow the userland to copy > > > between different mount points. > > > > > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > > > --- > > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > > > include/linux/fs.h | 4 + > > > include/linux/syscalls.h | 3 + > > > include/uapi/asm-generic/unistd.h | 4 +- > > > kernel/sys_ni.c | 1 + > > > 7 files changed, 149 insertions(+), 1 deletion(-) > > > > > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > > > index 448ac21..1f5f1e7 100644 > > > --- a/arch/x86/entry/syscalls/syscall_32.tbl > > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > > > @@ -391,3 +391,4 @@ > > > 382 i386 pkey_free sys_pkey_free > > > 383 i386 statx sys_statx > > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > > > +385 i386 copy_file_range64 sys_copy_file_range64 > > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > > > index 5aef183..e658bbe 100644 > > > --- a/arch/x86/entry/syscalls/syscall_64.tbl > > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > > > @@ -339,6 +339,7 @@ > > > 330 common pkey_alloc sys_pkey_alloc > > > 331 common pkey_free sys_pkey_free > > > 332 common statx sys_statx > > > +333 64 copy_file_range64 sys_copy_file_range64 > > > > > > # > > > # x32-specific system call numbers start at 512 to avoid cache impact > > > diff --git a/fs/read_write.c b/fs/read_write.c > > > index 47c1d44..e2665c1 100644 > > > --- a/fs/read_write.c > > > +++ b/fs/read_write.c > > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > > > return ret; > > > } > > > > > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > > > + struct file *file_out, loff_t pos_out, > > > + u64 len, unsigned int flags) > > > +{ > > > + struct inode *inode_in = file_inode(file_in); > > > + struct inode *inode_out = file_inode(file_out); > > > + u64 ret; > > > + > > > + if (flags != 0) > > > + return -EINVAL; > > > + > > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > > > + return -EISDIR; > > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > > > + return -EINVAL; > > > + > > > + ret = rw_verify_area(READ, file_in, &pos_in, len); > > > + if (unlikely(ret)) > > > + return ret; > > > + > > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > > > + if (unlikely(ret)) > > > + return ret; > > > + > > > + if (!(file_in->f_mode & FMODE_READ) || > > > + !(file_out->f_mode & FMODE_WRITE) || > > > + (file_out->f_flags & O_APPEND)) > > > + return -EBADF; > > > + > > > + if (len == 0) > > > + return 0; > > > + > > > + file_start_write(file_out); > > > + > > > + /* > > > + * Try cloning first, this is supported by more file systems, and > > > + * more efficient if both clone and copy are supported (e.g. NFS). > > > + */ > > > + if (file_in->f_op->clone_file_range) { > > > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > > > + file_out, pos_out, len); > > > + if (ret == 0) { > > > + ret = len; > > > + goto done; > > > + } > > > + } > > > + > > > + if (file_out->f_op->copy_file_range64) { > > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > > > + file_out, pos_out, len, flags); > > > + if (ret != -EOPNOTSUPP) > > > + goto done; > > > + } > > > + > > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > > > + > > > +done: > > > + if (ret > 0) { > > > + fsnotify_access(file_in); > > > + add_rchar(current, ret); > > > + fsnotify_modify(file_out); > > > + add_wchar(current, ret); > > > + } > > > + > > > + inc_syscr(current); > > > + inc_syscw(current); > > > + > > > + file_end_write(file_out); > > > + > > > + return ret; > > > +} > > > +EXPORT_SYMBOL(vfs_copy_file_range64); > > > + > > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > > > + int, fd_out, loff_t __user *, off_out, > > > + u64, len, unsigned int, flags) > > > +{ > > > + loff_t pos_in; > > > + loff_t pos_out; > > > + struct fd f_in; > > > + struct fd f_out; > > > + u64 ret = -EBADF; > > > + > > > + f_in = fdget(fd_in); > > > + if (!f_in.file) > > > + goto out2; > > > + > > > + f_out = fdget(fd_out); > > > + if (!f_out.file) > > > + goto out1; > > > + > > > + ret = -EFAULT; > > > + if (off_in) { > > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > > > + goto out; > > > + } else { > > > + pos_in = f_in.file->f_pos; > > > + } > > > + > > > + if (off_out) { > > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > > > + goto out; > > > + } else { > > > + pos_out = f_out.file->f_pos; > > > + } > > > + > > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > > > + flags); > > > + if (ret > 0) { > > > + pos_in += ret; > > > + pos_out += ret; > > > + > > > + if (off_in) { > > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > > > + ret = -EFAULT; > > > + } else { > > > + f_in.file->f_pos = pos_in; > > > + } > > > + > > > + if (off_out) { > > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > > > + ret = -EFAULT; > > > + } else { > > > + f_out.file->f_pos = pos_out; > > > + } > > > + } > > > + > > > +out: > > > + fdput(f_out); > > > +out1: > > > + fdput(f_in); > > > +out2: > > > + return ret; > > > +} > > > + > > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > > > { > > > struct inode *inode = file_inode(file); > > > diff --git a/include/linux/fs.h b/include/linux/fs.h > > > index 803e5a9..2727a98 100644 > > > --- a/include/linux/fs.h > > > +++ b/include/linux/fs.h > > > @@ -1690,6 +1690,8 @@ struct file_operations { > > > u64); > > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > > > u64); > > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > > > + loff_t, u64, unsigned int); > > > }; > > > > > > struct inode_operations { > > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > > > loff_t len, bool *is_same); > > > extern int vfs_dedupe_file_range(struct file *file, > > > struct file_dedupe_range *same); > > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > > > + loff_t, u64, unsigned int); > > > > > > struct super_operations { > > > struct inode *(*alloc_inode)(struct super_block *sb); > > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > > > index 980c3c9..f7ea78e 100644 > > > --- a/include/linux/syscalls.h > > > +++ b/include/linux/syscalls.h > > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > > > asmlinkage long sys_pkey_free(int pkey); > > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > > > unsigned mask, struct statx __user *buffer); > > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > > > + int fd_out, loff_t __user *off_out, > > > + u64 len, unsigned int flags); > > > > > > #endif > > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > > > index 061185a..e385a76 100644 > > > --- a/include/uapi/asm-generic/unistd.h > > > +++ b/include/uapi/asm-generic/unistd.h > > > @@ -731,9 +731,11 @@ > > > __SYSCALL(__NR_pkey_free, sys_pkey_free) > > > #define __NR_statx 291 > > > __SYSCALL(__NR_statx, sys_statx) > > > +#define __NR_copy_file_range64 292 > > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > > > > > #undef __NR_syscalls > > > -#define __NR_syscalls 292 > > > +#define __NR_syscalls 293 > > > > > > /* > > > * All syscalls below here should go away really, > > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > > > index 8acef85..8e0cfd4 100644 > > > --- a/kernel/sys_ni.c > > > +++ b/kernel/sys_ni.c > > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > > > cond_syscall(sys_capget); > > > cond_syscall(sys_capset); > > > cond_syscall(sys_copy_file_range); > > > +cond_syscall(sys_copy_file_range64); > > > > > > /* arch-specific weak syscall entries */ > > > cond_syscall(sys_pciconfig_read); > > > > Hi Olga, > > > > We discussed this a bit privately, but I'll note it here too. > > > > Server-to-server copy seems like a nice thing to me as well. There are > > several filesystems that can likely make use of that ability. > > > > I don't have a real opinion on the API change either, but you're making > > a very subtle change with this patch. Prior to this, the existing > > copy_file_range calls could count on the superblocks being the same. > > Now, it looks like you can pass them any old file pointer. > > What if we add a new file op for xdev copy that gets called when > superblocks are different, but filesystem type is the same? We could > keep the current copy_file_range fop to fall back on if superblocks > are the same. I don't think that would really help much. There are only two filesystems with copy_file_range operations -- nfsv4 and cifs. I don't think we really need another special case op for this. What I would probably suggest is to just push the check for the same superblock down into the copy_file_range ops in one patch (with no functional change). Then, you could go and lift that restriction in the NFSv4 operation without regressing cifs. The CIFS folks could eventually do the same for theirs. -- Jeff Layton <jlayton@redhat.com> ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call 2017-06-26 13:46 ` Jeff Layton @ 2017-06-26 14:46 ` Olga Kornievskaia 2017-06-26 14:49 ` [PATCH 1/1] VFS permit cross device vfs_copy_file_range Olga Kornievskaia [not found] ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com> 1 sibling, 1 reply; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-26 14:46 UTC (permalink / raw) To: Jeff Layton Cc: Anna Schumaker, Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Mon, Jun 26, 2017 at 9:46 AM, Jeff Layton <jlayton@redhat.com> wrote: > On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote: >> >> On 06/24/2017 09:23 AM, Jeff Layton wrote: >> > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: >> > > This is a proposal to allow 64bit count to copy and return as >> > > a result of a copy_file_range. No attempt was made to share code >> > > with the 32bit function because 32bit interface should probably >> > > get depreciated. >> > > >> > > Why use 64bit? Current uses of 32bit are by clone_file_range() >> > > which could use 64bit count and NFS copy_file_range also supports >> > > 64bit count value. >> > > >> > > Also with this proposal off-the-bet allow the userland to copy >> > > between different mount points. >> > > >> > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> > > --- >> > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + >> > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + >> > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ >> > > include/linux/fs.h | 4 + >> > > include/linux/syscalls.h | 3 + >> > > include/uapi/asm-generic/unistd.h | 4 +- >> > > kernel/sys_ni.c | 1 + >> > > 7 files changed, 149 insertions(+), 1 deletion(-) >> > > >> > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl >> > > index 448ac21..1f5f1e7 100644 >> > > --- a/arch/x86/entry/syscalls/syscall_32.tbl >> > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl >> > > @@ -391,3 +391,4 @@ >> > > 382 i386 pkey_free sys_pkey_free >> > > 383 i386 statx sys_statx >> > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl >> > > +385 i386 copy_file_range64 sys_copy_file_range64 >> > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> > > index 5aef183..e658bbe 100644 >> > > --- a/arch/x86/entry/syscalls/syscall_64.tbl >> > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> > > @@ -339,6 +339,7 @@ >> > > 330 common pkey_alloc sys_pkey_alloc >> > > 331 common pkey_free sys_pkey_free >> > > 332 common statx sys_statx >> > > +333 64 copy_file_range64 sys_copy_file_range64 >> > > >> > > # >> > > # x32-specific system call numbers start at 512 to avoid cache impact >> > > diff --git a/fs/read_write.c b/fs/read_write.c >> > > index 47c1d44..e2665c1 100644 >> > > --- a/fs/read_write.c >> > > +++ b/fs/read_write.c >> > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> > > return ret; >> > > } >> > > >> > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, >> > > + struct file *file_out, loff_t pos_out, >> > > + u64 len, unsigned int flags) >> > > +{ >> > > + struct inode *inode_in = file_inode(file_in); >> > > + struct inode *inode_out = file_inode(file_out); >> > > + u64 ret; >> > > + >> > > + if (flags != 0) >> > > + return -EINVAL; >> > > + >> > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) >> > > + return -EISDIR; >> > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) >> > > + return -EINVAL; >> > > + >> > > + ret = rw_verify_area(READ, file_in, &pos_in, len); >> > > + if (unlikely(ret)) >> > > + return ret; >> > > + >> > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); >> > > + if (unlikely(ret)) >> > > + return ret; >> > > + >> > > + if (!(file_in->f_mode & FMODE_READ) || >> > > + !(file_out->f_mode & FMODE_WRITE) || >> > > + (file_out->f_flags & O_APPEND)) >> > > + return -EBADF; >> > > + >> > > + if (len == 0) >> > > + return 0; >> > > + >> > > + file_start_write(file_out); >> > > + >> > > + /* >> > > + * Try cloning first, this is supported by more file systems, and >> > > + * more efficient if both clone and copy are supported (e.g. NFS). >> > > + */ >> > > + if (file_in->f_op->clone_file_range) { >> > > + ret = file_in->f_op->clone_file_range(file_in, pos_in, >> > > + file_out, pos_out, len); >> > > + if (ret == 0) { >> > > + ret = len; >> > > + goto done; >> > > + } >> > > + } >> > > + >> > > + if (file_out->f_op->copy_file_range64) { >> > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, >> > > + file_out, pos_out, len, flags); >> > > + if (ret != -EOPNOTSUPP) >> > > + goto done; >> > > + } >> > > + >> > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, >> > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); >> > > + >> > > +done: >> > > + if (ret > 0) { >> > > + fsnotify_access(file_in); >> > > + add_rchar(current, ret); >> > > + fsnotify_modify(file_out); >> > > + add_wchar(current, ret); >> > > + } >> > > + >> > > + inc_syscr(current); >> > > + inc_syscw(current); >> > > + >> > > + file_end_write(file_out); >> > > + >> > > + return ret; >> > > +} >> > > +EXPORT_SYMBOL(vfs_copy_file_range64); >> > > + >> > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, >> > > + int, fd_out, loff_t __user *, off_out, >> > > + u64, len, unsigned int, flags) >> > > +{ >> > > + loff_t pos_in; >> > > + loff_t pos_out; >> > > + struct fd f_in; >> > > + struct fd f_out; >> > > + u64 ret = -EBADF; >> > > + >> > > + f_in = fdget(fd_in); >> > > + if (!f_in.file) >> > > + goto out2; >> > > + >> > > + f_out = fdget(fd_out); >> > > + if (!f_out.file) >> > > + goto out1; >> > > + >> > > + ret = -EFAULT; >> > > + if (off_in) { >> > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) >> > > + goto out; >> > > + } else { >> > > + pos_in = f_in.file->f_pos; >> > > + } >> > > + >> > > + if (off_out) { >> > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) >> > > + goto out; >> > > + } else { >> > > + pos_out = f_out.file->f_pos; >> > > + } >> > > + >> > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, >> > > + flags); >> > > + if (ret > 0) { >> > > + pos_in += ret; >> > > + pos_out += ret; >> > > + >> > > + if (off_in) { >> > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) >> > > + ret = -EFAULT; >> > > + } else { >> > > + f_in.file->f_pos = pos_in; >> > > + } >> > > + >> > > + if (off_out) { >> > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) >> > > + ret = -EFAULT; >> > > + } else { >> > > + f_out.file->f_pos = pos_out; >> > > + } >> > > + } >> > > + >> > > +out: >> > > + fdput(f_out); >> > > +out1: >> > > + fdput(f_in); >> > > +out2: >> > > + return ret; >> > > +} >> > > + >> > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) >> > > { >> > > struct inode *inode = file_inode(file); >> > > diff --git a/include/linux/fs.h b/include/linux/fs.h >> > > index 803e5a9..2727a98 100644 >> > > --- a/include/linux/fs.h >> > > +++ b/include/linux/fs.h >> > > @@ -1690,6 +1690,8 @@ struct file_operations { >> > > u64); >> > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, >> > > u64); >> > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, >> > > + loff_t, u64, unsigned int); >> > > }; >> > > >> > > struct inode_operations { >> > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, >> > > loff_t len, bool *is_same); >> > > extern int vfs_dedupe_file_range(struct file *file, >> > > struct file_dedupe_range *same); >> > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, >> > > + loff_t, u64, unsigned int); >> > > >> > > struct super_operations { >> > > struct inode *(*alloc_inode)(struct super_block *sb); >> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> > > index 980c3c9..f7ea78e 100644 >> > > --- a/include/linux/syscalls.h >> > > +++ b/include/linux/syscalls.h >> > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, >> > > asmlinkage long sys_pkey_free(int pkey); >> > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, >> > > unsigned mask, struct statx __user *buffer); >> > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, >> > > + int fd_out, loff_t __user *off_out, >> > > + u64 len, unsigned int flags); >> > > >> > > #endif >> > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h >> > > index 061185a..e385a76 100644 >> > > --- a/include/uapi/asm-generic/unistd.h >> > > +++ b/include/uapi/asm-generic/unistd.h >> > > @@ -731,9 +731,11 @@ >> > > __SYSCALL(__NR_pkey_free, sys_pkey_free) >> > > #define __NR_statx 291 >> > > __SYSCALL(__NR_statx, sys_statx) >> > > +#define __NR_copy_file_range64 292 >> > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) >> > > >> > > #undef __NR_syscalls >> > > -#define __NR_syscalls 292 >> > > +#define __NR_syscalls 293 >> > > >> > > /* >> > > * All syscalls below here should go away really, >> > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> > > index 8acef85..8e0cfd4 100644 >> > > --- a/kernel/sys_ni.c >> > > +++ b/kernel/sys_ni.c >> > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) >> > > cond_syscall(sys_capget); >> > > cond_syscall(sys_capset); >> > > cond_syscall(sys_copy_file_range); >> > > +cond_syscall(sys_copy_file_range64); >> > > >> > > /* arch-specific weak syscall entries */ >> > > cond_syscall(sys_pciconfig_read); >> > >> > Hi Olga, >> > >> > We discussed this a bit privately, but I'll note it here too. >> > >> > Server-to-server copy seems like a nice thing to me as well. There are >> > several filesystems that can likely make use of that ability. >> > >> > I don't have a real opinion on the API change either, but you're making >> > a very subtle change with this patch. Prior to this, the existing >> > copy_file_range calls could count on the superblocks being the same. >> > Now, it looks like you can pass them any old file pointer. >> >> What if we add a new file op for xdev copy that gets called when >> superblocks are different, but filesystem type is the same? We could >> keep the current copy_file_range fop to fall back on if superblocks >> are the same. > > I don't think that would really help much. There are only two > filesystems with copy_file_range operations -- nfsv4 and cifs. I don't > think we really need another special case op for this. > > What I would probably suggest is to just push the check for the same > superblock down into the copy_file_range ops in one patch (with no > functional change). Then, you could go and lift that restriction in the > NFSv4 operation without regressing cifs. The CIFS folks could eventually > do the same for theirs. CIFS folks already check if their target and source destination are different then they return with EXDEV. So I believe the check that file system ops that are the same for the fd_in and fd_out would be sufficient for the current uses? ^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH 1/1] VFS permit cross device vfs_copy_file_range 2017-06-26 14:46 ` Olga Kornievskaia @ 2017-06-26 14:49 ` Olga Kornievskaia 2017-06-27 11:47 ` Jeff Layton 0 siblings, 1 reply; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-26 14:49 UTC (permalink / raw) To: linux-fsdevel, linux-nfs Allow copy_file_range to copy between different superblocks but only of the same file system types. This feature is needed by NFSv4.2 to perform file copy operation on the same server or file copy between different NFSv4.2 servers. If a file system's fileoperations copy_file_range operation prohibits cross-device copies, fall back to do_splice_direct. This is needed for the NFS (destination) server side implementation of the file copy. Currently there is only 1 implementor of the copy_file_range FS operation -- CIFS. CIFS assumes incoming file descriptors are both CIFS but it will check if they are coming from different servers. NFS will allow for copies between different NFS servers. Adding to the vfs.txt documentation to explicitly warn about allowing for different superblocks of the same file type to be passed into the copy_file_range for the future users of copy_file_range method. Signed-off-by: Olga Kornievskaia <kolga@netapp.com> --- Documentation/filesystems/vfs.txt | 7 +++++++ fs/read_write.c | 12 +++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index f42b906..cf22424 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -845,6 +845,8 @@ struct file_operations { #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, + loff_t, size_t, unsigned int); }; Again, all methods are called without any locks being held, unless @@ -913,6 +915,11 @@ otherwise noted. fallocate: called by the VFS to preallocate blocks or punch a hole. + copy_file_range: called by copy_file_range(2) system call. This method + works on two file descriptors that might reside on + different superblocks of the same type of file system. + + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/fs/read_write.c b/fs/read_write.c index 47c1d44..effbfb2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, (file_out->f_flags & O_APPEND)) return -EBADF; - /* this could be relaxed once a method supports cross-fs copies */ - if (inode_in->i_sb != inode_out->i_sb) - return -EXDEV; - if (len == 0) return 0; @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * Try cloning first, this is supported by more file systems, and * more efficient if both clone and copy are supported (e.g. NFS). */ - if (file_in->f_op->clone_file_range) { + if (inode_in->i_sb == inode_out->i_sb && + file_in->f_op->clone_file_range) { ret = file_in->f_op->clone_file_range(file_in, pos_in, file_out, pos_out, len); if (ret == 0) { @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, } } - if (file_out->f_op->copy_file_range) { + if (file_out->f_op->copy_file_range && + (file_in->f_op == file_out->f_op)) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - if (ret != -EOPNOTSUPP) + if (ret != -EOPNOTSUPP && ret != -EXDEV) goto done; } -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] VFS permit cross device vfs_copy_file_range 2017-06-26 14:49 ` [PATCH 1/1] VFS permit cross device vfs_copy_file_range Olga Kornievskaia @ 2017-06-27 11:47 ` Jeff Layton 2017-06-27 16:12 ` Olga Kornievskaia 0 siblings, 1 reply; 35+ messages in thread From: Jeff Layton @ 2017-06-27 11:47 UTC (permalink / raw) To: Olga Kornievskaia, linux-fsdevel, linux-nfs On Mon, 2017-06-26 at 10:49 -0400, Olga Kornievskaia wrote: > Allow copy_file_range to copy between different superblocks but only > of the same file system types. > > This feature is needed by NFSv4.2 to perform file copy operation on > the same server or file copy between different NFSv4.2 servers. > > If a file system's fileoperations copy_file_range operation prohibits > cross-device copies, fall back to do_splice_direct. This is needed for > the NFS (destination) server side implementation of the file copy. > > Currently there is only 1 implementor of the copy_file_range FS > operation -- CIFS. CIFS assumes incoming file descriptors are both > CIFS but it will check if they are coming from different servers. > > NFS will allow for copies between different NFS servers. > > Adding to the vfs.txt documentation to explicitly warn about allowing > for different superblocks of the same file type to be passed into the > copy_file_range for the future users of copy_file_range method. > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > --- > Documentation/filesystems/vfs.txt | 7 +++++++ > fs/read_write.c | 12 +++++------- > 2 files changed, 12 insertions(+), 7 deletions(-) > > diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt > index f42b906..cf22424 100644 > --- a/Documentation/filesystems/vfs.txt > +++ b/Documentation/filesystems/vfs.txt > @@ -845,6 +845,8 @@ struct file_operations { > #ifndef CONFIG_MMU > unsigned (*mmap_capabilities)(struct file *); > #endif > + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, > + loff_t, size_t, unsigned int); > }; > > Again, all methods are called without any locks being held, unless > @@ -913,6 +915,11 @@ otherwise noted. > > fallocate: called by the VFS to preallocate blocks or punch a hole. > > + copy_file_range: called by copy_file_range(2) system call. This method > + works on two file descriptors that might reside on > + different superblocks of the same type of file system. > + > + > Note that the file operations are implemented by the specific > filesystem in which the inode resides. When opening a device node > (character or block special) most filesystems will call special > diff --git a/fs/read_write.c b/fs/read_write.c > index 47c1d44..effbfb2 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > (file_out->f_flags & O_APPEND)) > return -EBADF; > > - /* this could be relaxed once a method supports cross-fs copies */ > - if (inode_in->i_sb != inode_out->i_sb) > - return -EXDEV; > - > if (len == 0) > return 0; > > @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > * Try cloning first, this is supported by more file systems, and > * more efficient if both clone and copy are supported (e.g. NFS). > */ > - if (file_in->f_op->clone_file_range) { > + if (inode_in->i_sb == inode_out->i_sb && > + file_in->f_op->clone_file_range) { I do wonder if we really ought to check for the same superblock here. ISTM that it might be possible for some fs' (btrfs?) to clone blocks across superblocks in some situations? Oh well -- I guess we can cross that bridge when we come to it. > ret = file_in->f_op->clone_file_range(file_in, pos_in, > file_out, pos_out, len); > if (ret == 0) { > @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > } > } > > - if (file_out->f_op->copy_file_range) { > + if (file_out->f_op->copy_file_range && > + (file_in->f_op == file_out->f_op)) { It's possible that they might not have the same f_op pointer, but have the same copy_file_range operation (e.g. cifs.ko has a bunch of different f_op structures for different cases, so they might not be the same even though they have the same copy_file_range op). Maybe this should be: file_in->f_op->copy_file_range == file_out->f_op->copy_file_range ? > ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, > pos_out, len, flags); > - if (ret != -EOPNOTSUPP) > + if (ret != -EOPNOTSUPP && ret != -EXDEV) > goto done; > } > -- Jeff Layton <jlayton@redhat.com> ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/1] VFS permit cross device vfs_copy_file_range 2017-06-27 11:47 ` Jeff Layton @ 2017-06-27 16:12 ` Olga Kornievskaia 0 siblings, 0 replies; 35+ messages in thread From: Olga Kornievskaia @ 2017-06-27 16:12 UTC (permalink / raw) To: Jeff Layton; +Cc: Olga Kornievskaia, linux-fsdevel@vger.kernel.org, linux-nfs On Tue, Jun 27, 2017 at 7:47 AM, Jeff Layton <jlayton@redhat.com> wrote: > On Mon, 2017-06-26 at 10:49 -0400, Olga Kornievskaia wrote: >> Allow copy_file_range to copy between different superblocks but only >> of the same file system types. >> >> This feature is needed by NFSv4.2 to perform file copy operation on >> the same server or file copy between different NFSv4.2 servers. >> >> If a file system's fileoperations copy_file_range operation prohibits >> cross-device copies, fall back to do_splice_direct. This is needed for >> the NFS (destination) server side implementation of the file copy. >> >> Currently there is only 1 implementor of the copy_file_range FS >> operation -- CIFS. CIFS assumes incoming file descriptors are both >> CIFS but it will check if they are coming from different servers. >> >> NFS will allow for copies between different NFS servers. >> >> Adding to the vfs.txt documentation to explicitly warn about allowing >> for different superblocks of the same file type to be passed into the >> copy_file_range for the future users of copy_file_range method. >> >> Signed-off-by: Olga Kornievskaia <kolga@netapp.com> >> --- >> Documentation/filesystems/vfs.txt | 7 +++++++ >> fs/read_write.c | 12 +++++------- >> 2 files changed, 12 insertions(+), 7 deletions(-) >> >> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt >> index f42b906..cf22424 100644 >> --- a/Documentation/filesystems/vfs.txt >> +++ b/Documentation/filesystems/vfs.txt >> @@ -845,6 +845,8 @@ struct file_operations { >> #ifndef CONFIG_MMU >> unsigned (*mmap_capabilities)(struct file *); >> #endif >> + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, >> + loff_t, size_t, unsigned int); >> }; >> >> Again, all methods are called without any locks being held, unless >> @@ -913,6 +915,11 @@ otherwise noted. >> >> fallocate: called by the VFS to preallocate blocks or punch a hole. >> >> + copy_file_range: called by copy_file_range(2) system call. This method >> + works on two file descriptors that might reside on >> + different superblocks of the same type of file system. >> + >> + >> Note that the file operations are implemented by the specific >> filesystem in which the inode resides. When opening a device node >> (character or block special) most filesystems will call special >> diff --git a/fs/read_write.c b/fs/read_write.c >> index 47c1d44..effbfb2 100644 >> --- a/fs/read_write.c >> +++ b/fs/read_write.c >> @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> (file_out->f_flags & O_APPEND)) >> return -EBADF; >> >> - /* this could be relaxed once a method supports cross-fs copies */ >> - if (inode_in->i_sb != inode_out->i_sb) >> - return -EXDEV; >> - >> if (len == 0) >> return 0; >> >> @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> * Try cloning first, this is supported by more file systems, and >> * more efficient if both clone and copy are supported (e.g. NFS). >> */ >> - if (file_in->f_op->clone_file_range) { >> + if (inode_in->i_sb == inode_out->i_sb && >> + file_in->f_op->clone_file_range) { > > I do wonder if we really ought to check for the same superblock here. > > ISTM that it might be possible for some fs' (btrfs?) to clone blocks > across superblocks in some situations? > > Oh well -- I guess we can cross that bridge when we come to it. I was trying to keep with the expectations of the underlying function calls. Since I'm not proposing to change clone_file_range, I added that check. >> ret = file_in->f_op->clone_file_range(file_in, pos_in, >> file_out, pos_out, len); >> if (ret == 0) { >> @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, >> } >> } >> >> - if (file_out->f_op->copy_file_range) { >> + if (file_out->f_op->copy_file_range && >> + (file_in->f_op == file_out->f_op)) { > > It's possible that they might not have the same f_op pointer, but have > the same copy_file_range operation (e.g. cifs.ko has a bunch of > different f_op structures for different cases, so they might not be the > same even though they have the same copy_file_range op). > > Maybe this should be: > > file_in->f_op->copy_file_range == file_out->f_op->copy_file_range > > ? Yeah, I was debating between those two options and decided to go with f_op pointer but I agree the other way is the way to go. I'll wait to hear if there are any other comments and if not make this change and send the next version. thank you for the comments! >> ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, >> pos_out, len, flags); >> - if (ret != -EOPNOTSUPP) >> + if (ret != -EOPNOTSUPP && ret != -EXDEV) >> goto done; >> } >> > > -- > Jeff Layton <jlayton@redhat.com> > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 35+ messages in thread
[parent not found: <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com>]
* Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call [not found] ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com> @ 2017-06-26 14:51 ` Jeff Layton 0 siblings, 0 replies; 35+ messages in thread From: Jeff Layton @ 2017-06-26 14:51 UTC (permalink / raw) To: Olga Kornievskaia; +Cc: Anna Schumaker, linux-fsdevel, linux-nfs On Mon, 2017-06-26 at 10:40 -0400, Olga Kornievskaia wrote: > > On Jun 26, 2017, at 9:46 AM, Jeff Layton <jlayton@redhat.com> wrote: > > > > On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote: > > > On 06/24/2017 09:23 AM, Jeff Layton wrote: > > > > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote: > > > > > This is a proposal to allow 64bit count to copy and return as > > > > > a result of a copy_file_range. No attempt was made to share code > > > > > with the 32bit function because 32bit interface should probably > > > > > get depreciated. > > > > > > > > > > Why use 64bit? Current uses of 32bit are by clone_file_range() > > > > > which could use 64bit count and NFS copy_file_range also supports > > > > > 64bit count value. > > > > > > > > > > Also with this proposal off-the-bet allow the userland to copy > > > > > between different mount points. > > > > > > > > > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com> > > > > > --- > > > > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > > > > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > > > > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++ > > > > > include/linux/fs.h | 4 + > > > > > include/linux/syscalls.h | 3 + > > > > > include/uapi/asm-generic/unistd.h | 4 +- > > > > > kernel/sys_ni.c | 1 + > > > > > 7 files changed, 149 insertions(+), 1 deletion(-) > > > > > > > > > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > > > > > index 448ac21..1f5f1e7 100644 > > > > > --- a/arch/x86/entry/syscalls/syscall_32.tbl > > > > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > > > > > @@ -391,3 +391,4 @@ > > > > > 382 i386 pkey_free sys_pkey_free > > > > > 383 i386 statx sys_statx > > > > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > > > > > +385 i386 copy_file_range64 sys_copy_file_range64 > > > > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > > > > > index 5aef183..e658bbe 100644 > > > > > --- a/arch/x86/entry/syscalls/syscall_64.tbl > > > > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > > > > > @@ -339,6 +339,7 @@ > > > > > 330 common pkey_alloc sys_pkey_alloc > > > > > 331 common pkey_free sys_pkey_free > > > > > 332 common statx sys_statx > > > > > +333 64 copy_file_range64 sys_copy_file_range64 > > > > > > > > > > # > > > > > # x32-specific system call numbers start at 512 to avoid cache impact > > > > > diff --git a/fs/read_write.c b/fs/read_write.c > > > > > index 47c1d44..e2665c1 100644 > > > > > --- a/fs/read_write.c > > > > > +++ b/fs/read_write.c > > > > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, > > > > > return ret; > > > > > } > > > > > > > > > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in, > > > > > + struct file *file_out, loff_t pos_out, > > > > > + u64 len, unsigned int flags) > > > > > +{ > > > > > + struct inode *inode_in = file_inode(file_in); > > > > > + struct inode *inode_out = file_inode(file_out); > > > > > + u64 ret; > > > > > + > > > > > + if (flags != 0) > > > > > + return -EINVAL; > > > > > + > > > > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) > > > > > + return -EISDIR; > > > > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) > > > > > + return -EINVAL; > > > > > + > > > > > + ret = rw_verify_area(READ, file_in, &pos_in, len); > > > > > + if (unlikely(ret)) > > > > > + return ret; > > > > > + > > > > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len); > > > > > + if (unlikely(ret)) > > > > > + return ret; > > > > > + > > > > > + if (!(file_in->f_mode & FMODE_READ) || > > > > > + !(file_out->f_mode & FMODE_WRITE) || > > > > > + (file_out->f_flags & O_APPEND)) > > > > > + return -EBADF; > > > > > + > > > > > + if (len == 0) > > > > > + return 0; > > > > > + > > > > > + file_start_write(file_out); > > > > > + > > > > > + /* > > > > > + * Try cloning first, this is supported by more file systems, and > > > > > + * more efficient if both clone and copy are supported (e.g. NFS). > > > > > + */ > > > > > + if (file_in->f_op->clone_file_range) { > > > > > + ret = file_in->f_op->clone_file_range(file_in, pos_in, > > > > > + file_out, pos_out, len); > > > > > + if (ret == 0) { > > > > > + ret = len; > > > > > + goto done; > > > > > + } > > > > > + } > > > > > + > > > > > + if (file_out->f_op->copy_file_range64) { > > > > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in, > > > > > + file_out, pos_out, len, flags); > > > > > + if (ret != -EOPNOTSUPP) > > > > > + goto done; > > > > > + } > > > > > + > > > > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, > > > > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); > > > > > + > > > > > +done: > > > > > + if (ret > 0) { > > > > > + fsnotify_access(file_in); > > > > > + add_rchar(current, ret); > > > > > + fsnotify_modify(file_out); > > > > > + add_wchar(current, ret); > > > > > + } > > > > > + > > > > > + inc_syscr(current); > > > > > + inc_syscw(current); > > > > > + > > > > > + file_end_write(file_out); > > > > > + > > > > > + return ret; > > > > > +} > > > > > +EXPORT_SYMBOL(vfs_copy_file_range64); > > > > > + > > > > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in, > > > > > + int, fd_out, loff_t __user *, off_out, > > > > > + u64, len, unsigned int, flags) > > > > > +{ > > > > > + loff_t pos_in; > > > > > + loff_t pos_out; > > > > > + struct fd f_in; > > > > > + struct fd f_out; > > > > > + u64 ret = -EBADF; > > > > > + > > > > > + f_in = fdget(fd_in); > > > > > + if (!f_in.file) > > > > > + goto out2; > > > > > + > > > > > + f_out = fdget(fd_out); > > > > > + if (!f_out.file) > > > > > + goto out1; > > > > > + > > > > > + ret = -EFAULT; > > > > > + if (off_in) { > > > > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) > > > > > + goto out; > > > > > + } else { > > > > > + pos_in = f_in.file->f_pos; > > > > > + } > > > > > + > > > > > + if (off_out) { > > > > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) > > > > > + goto out; > > > > > + } else { > > > > > + pos_out = f_out.file->f_pos; > > > > > + } > > > > > + > > > > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len, > > > > > + flags); > > > > > + if (ret > 0) { > > > > > + pos_in += ret; > > > > > + pos_out += ret; > > > > > + > > > > > + if (off_in) { > > > > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) > > > > > + ret = -EFAULT; > > > > > + } else { > > > > > + f_in.file->f_pos = pos_in; > > > > > + } > > > > > + > > > > > + if (off_out) { > > > > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) > > > > > + ret = -EFAULT; > > > > > + } else { > > > > > + f_out.file->f_pos = pos_out; > > > > > + } > > > > > + } > > > > > + > > > > > +out: > > > > > + fdput(f_out); > > > > > +out1: > > > > > + fdput(f_in); > > > > > +out2: > > > > > + return ret; > > > > > +} > > > > > + > > > > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) > > > > > { > > > > > struct inode *inode = file_inode(file); > > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h > > > > > index 803e5a9..2727a98 100644 > > > > > --- a/include/linux/fs.h > > > > > +++ b/include/linux/fs.h > > > > > @@ -1690,6 +1690,8 @@ struct file_operations { > > > > > u64); > > > > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, > > > > > u64); > > > > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *, > > > > > + loff_t, u64, unsigned int); > > > > > }; > > > > > > > > > > struct inode_operations { > > > > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, > > > > > loff_t len, bool *is_same); > > > > > extern int vfs_dedupe_file_range(struct file *file, > > > > > struct file_dedupe_range *same); > > > > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *, > > > > > + loff_t, u64, unsigned int); > > > > > > > > > > struct super_operations { > > > > > struct inode *(*alloc_inode)(struct super_block *sb); > > > > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > > > > > index 980c3c9..f7ea78e 100644 > > > > > --- a/include/linux/syscalls.h > > > > > +++ b/include/linux/syscalls.h > > > > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, > > > > > asmlinkage long sys_pkey_free(int pkey); > > > > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, > > > > > unsigned mask, struct statx __user *buffer); > > > > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in, > > > > > + int fd_out, loff_t __user *off_out, > > > > > + u64 len, unsigned int flags); > > > > > > > > > > #endif > > > > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > > > > > index 061185a..e385a76 100644 > > > > > --- a/include/uapi/asm-generic/unistd.h > > > > > +++ b/include/uapi/asm-generic/unistd.h > > > > > @@ -731,9 +731,11 @@ > > > > > __SYSCALL(__NR_pkey_free, sys_pkey_free) > > > > > #define __NR_statx 291 > > > > > __SYSCALL(__NR_statx, sys_statx) > > > > > +#define __NR_copy_file_range64 292 > > > > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64) > > > > > > > > > > #undef __NR_syscalls > > > > > -#define __NR_syscalls 292 > > > > > +#define __NR_syscalls 293 > > > > > > > > > > /* > > > > > * All syscalls below here should go away really, > > > > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > > > > > index 8acef85..8e0cfd4 100644 > > > > > --- a/kernel/sys_ni.c > > > > > +++ b/kernel/sys_ni.c > > > > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void) > > > > > cond_syscall(sys_capget); > > > > > cond_syscall(sys_capset); > > > > > cond_syscall(sys_copy_file_range); > > > > > +cond_syscall(sys_copy_file_range64); > > > > > > > > > > /* arch-specific weak syscall entries */ > > > > > cond_syscall(sys_pciconfig_read); > > > > > > > > Hi Olga, > > > > > > > > We discussed this a bit privately, but I'll note it here too. > > > > > > > > Server-to-server copy seems like a nice thing to me as well. There are > > > > several filesystems that can likely make use of that ability. > > > > > > > > I don't have a real opinion on the API change either, but you're making > > > > a very subtle change with this patch. Prior to this, the existing > > > > copy_file_range calls could count on the superblocks being the same. > > > > Now, it looks like you can pass them any old file pointer. > > > > > > What if we add a new file op for xdev copy that gets called when > > > superblocks are different, but filesystem type is the same? We could > > > keep the current copy_file_range fop to fall back on if superblocks > > > are the same. > > > > I don't think that would really help much. There are only two > > filesystems with copy_file_range operations -- nfsv4 and cifs. I don't > > think we really need another special case op for this. > > > > What I would probably suggest is to just push the check for the same > > superblock down into the copy_file_range ops in one patch (with no > > functional change). Then, you could go and lift that restriction in the > > NFSv4 operation without regressing cifs. The CIFS folks could eventually > > do the same for theirs. > > CIFS folks already check if their target and source destination are different then they return with EXDEV. So I believe the check that file system ops that are the same for the fd_in and fd_out would be sufficient for the current uses? > Yes that should cover it. -- Jeff Layton <jlayton@redhat.com> ^ permalink raw reply [flat|nested] 35+ messages in thread
end of thread, other threads:[~2017-06-27 16:12 UTC | newest] Thread overview: 35+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2017-06-14 17:06 [PATCH 1/1] [RFC] 64bit copy_file_range system call Olga Kornievskaia 2017-06-14 17:24 ` Anna Schumaker 2017-06-14 18:53 ` Olga Kornievskaia 2017-06-14 19:32 ` Amir Goldstein 2017-06-14 20:08 ` Olga Kornievskaia 2017-06-14 21:50 ` Randy Dunlap 2017-06-15 13:07 ` Olga Kornievskaia 2017-06-15 3:59 ` Darrick J. Wong 2017-06-15 14:39 ` Olga Kornievskaia 2017-06-15 20:53 ` Darrick J. Wong 2017-06-19 18:34 ` Olga Kornievskaia 2017-06-19 19:39 ` Christoph Hellwig 2017-06-19 19:48 ` Olga Kornievskaia 2017-06-20 12:38 ` Steve Dickson 2017-06-20 19:33 ` Chuck Lever 2017-06-20 19:37 ` Olga Kornievskaia 2017-06-24 11:29 ` Mkrtchyan, Tigran 2017-06-20 12:44 ` Steve Dickson 2017-06-15 8:15 ` Christoph Hellwig 2017-06-15 13:07 ` Olga Kornievskaia 2017-06-15 18:35 ` Andreas Dilger 2017-06-15 19:11 ` Olga Kornievskaia 2017-06-16 17:36 ` J. Bruce Fields 2017-06-17 10:03 ` Christoph Hellwig 2017-06-17 12:42 ` Olga Kornievskaia 2017-06-17 15:09 ` Christoph Hellwig 2017-06-17 18:24 ` J. Bruce Fields 2017-06-24 13:23 ` Jeff Layton 2017-06-26 13:21 ` Anna Schumaker 2017-06-26 13:46 ` Jeff Layton 2017-06-26 14:46 ` Olga Kornievskaia 2017-06-26 14:49 ` [PATCH 1/1] VFS permit cross device vfs_copy_file_range Olga Kornievskaia 2017-06-27 11:47 ` Jeff Layton 2017-06-27 16:12 ` Olga Kornievskaia [not found] ` <5E53733A-8136-4FB5-A7E9-F4846BF98507@netapp.com> 2017-06-26 14:51 ` [PATCH 1/1] [RFC] 64bit copy_file_range system call Jeff Layton
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).