* [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-14 21:15 [RFC v0 0/4] sys_copy_range() rough draft Zach Brown
@ 2013-05-14 21:15 ` Zach Brown
2013-05-15 19:44 ` Eric Wong
0 siblings, 1 reply; 14+ messages in thread
From: Zach Brown @ 2013-05-14 21:15 UTC (permalink / raw)
To: Martin K. Petersen, Trond Myklebust, linux-kernel, linux-fsdevel,
linux-btrfs, linux-nfs
This adds a syscall and vfs entry point for clone_range which offloads
data copying between existing files.
The syscall is a thin wrapper around the vfs entry point. Its arguments
are inspired by sys_splice().
The behaviour of the vfs helper is derived from the current btrfs
CLONE_RANGE ioctl.
---
fs/Makefile | 2 +-
fs/copy_range.c | 127 ++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
5 files changed, 135 insertions(+), 2 deletions(-)
create mode 100644 fs/copy_range.c
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3..1be83b3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o
+ stack.o fs_struct.o statfs.o copy_range.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/copy_range.c b/fs/copy_range.c
new file mode 100644
index 0000000..3000b9f
--- /dev/null
+++ b/fs/copy_range.c
@@ -0,0 +1,127 @@
+/*
+ * "copy_range": offload data copying between existing files
+ *
+ * Copyright (C) 2013 Zach Brown <zab@redhat.com>
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/syscalls.h>
+#include <linux/export.h>
+#include <linux/fsnotify.h>
+
+/**
+ * vfs_copy_range - copy range of bytes from source file to existing file
+ * @file_in: source regular file
+ * @pos_in: starting byte offset to copy from the source file
+ * @file_out: destination regular file
+ * @pos_out: starting byte offset to copy to in the destination file
+ * @count: number of bytes to copy
+ *
+ * Returns number of bytes successfully copied from the start of the range or
+ * a negative errno error value.
+ *
+ * The number of bytes successfully written can be less than the input
+ * count if an error is encountered. In this partial success case the
+ * contents of the destination range after the copied bytes can be a mix
+ * of pre-existing bytes, bytes from the source range, or zeros,
+ * depending on the implementation.
+ *
+ * The source range must be entirely within i_size in the source file.
+ * A destination range outside of the size of the destination file will
+ * extend its size.
+ */
+ssize_t vfs_copy_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count)
+{
+ struct inode *inode_in;
+ struct inode *inode_out;
+ ssize_t ret;
+
+ if (count == 0)
+ return 0;
+
+ /* copy_range allows full ssize_t count, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, count);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, count);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op || !file_in->f_op->copy_range)
+ return -EINVAL;
+
+ inode_in = file_inode(file_in);
+ inode_out = file_inode(file_out);
+
+ /* make sure offsets don't wrap and the input is inside i_size */
+ if (pos_in + count < pos_in || pos_out + count < pos_out ||
+ pos_in + count > i_size_read(inode_in))
+ return -EINVAL;
+
+ /* XXX do we want this test? btrfs_ioctl_clone_range() */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ /* forbid ranges in the same file for now */
+ if (inode_in == inode_out)
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->copy_range(file_in, pos_in, file_out, pos_out,
+ count);
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_range);
+
+SYSCALL_DEFINE5(copy_range, int, fd_in, loff_t __user *, upos_in,
+ int, fd_out, loff_t __user *, upos_out, size_t, count)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret;
+
+ if (get_user(pos_in, upos_in) || get_user(pos_out, upos_out))
+ return -EFAULT;
+
+ f_in = fdget(fd_in);
+ f_out = fdget(fd_out);
+
+ if (f_in.file && f_out.file)
+ ret = vfs_copy_range(f_in.file, pos_in, f_out.file, pos_out,
+ count);
+ else
+ ret = -EBADF;
+
+ fdput(f_in);
+ fdput(f_out);
+
+ return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 43db02e..6214893 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1543,6 +1543,7 @@ struct file_operations {
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
int (*show_fdinfo)(struct seq_file *m, struct file *f);
+ ssize_t (*copy_range)(struct file *, loff_t, struct file *, loff_t, size_t);
};
struct inode_operations {
@@ -1588,6 +1589,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern ssize_t vfs_copy_range(struct file *, loff_t , struct file *, loff_t,
+ size_t);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 0cc74c4..3935d1c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -692,9 +692,11 @@ __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
__SYSCALL(__NR_kcmp, sys_kcmp)
#define __NR_finit_module 273
__SYSCALL(__NR_finit_module, sys_finit_module)
+#define __NR_copy_range 274
+__SYSCALL(__NR_copy_range, sys_copy_range)
#undef __NR_syscalls
-#define __NR_syscalls 274
+#define __NR_syscalls 275
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052..af7808a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -151,6 +151,7 @@ cond_syscall(sys_process_vm_readv);
cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
+cond_syscall(sys_copy_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
1.7.11.7
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
@ 2013-05-15 17:50 Steve French
2013-05-15 18:54 ` J. Bruce Fields
[not found] ` <CAH2r5ms0P8Hgv1mUpyHA32Er38iiaC1HHC4fhxvz2SBFy6Sucw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 2 replies; 14+ messages in thread
From: Steve French @ 2013-05-15 17:50 UTC (permalink / raw)
To: linux-fsdevel, zab-H+wXaHxf7aLQT0dZR+AlfA
Cc: linux-cifs-u79uwXL29TY76Z2rM5mHXA
Doesn't the new syscall have to invalidate the page cache pages that
the server is about to overwrite as btrfs does with the following line
in fs/btrfs/ioctl.c
truncate_inode_pages_range(&inode->i_data, destoff,
PAGE_CACHE_ALIGN(destoff + len) - 1);
(and doesn't truncate_inode_pages_range handle page cache alignment
anyway - and also why did btrfs use truncate_inode_pages_range instead
of invalidate?)
Does nfs client ever have the case where two different superblocks map
to the same nfs export (and thus the check below is restricting the
ability to do server side copy)?
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
I am working on cifs client patches for the ioctl, and the new syscall
also looks pretty easy. Some popular cifs servers (like Windows)
have supported smb/cifs copy offload for many, many years - and now
Samba with the support that David Disseldorp added for the clone range
ioctl has been supporting copychunk (server side copy from Windows to
Samba) so about time to finish the cifs client equivalent.
--
Thanks,
Steve
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-15 17:50 [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point Steve French
@ 2013-05-15 18:54 ` J. Bruce Fields
[not found] ` <20130515185429.GA25994-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
[not found] ` <CAH2r5ms0P8Hgv1mUpyHA32Er38iiaC1HHC4fhxvz2SBFy6Sucw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
1 sibling, 1 reply; 14+ messages in thread
From: J. Bruce Fields @ 2013-05-15 18:54 UTC (permalink / raw)
To: Steve French; +Cc: linux-fsdevel, zab, linux-cifs
On Wed, May 15, 2013 at 12:50:13PM -0500, Steve French wrote:
> Doesn't the new syscall have to invalidate the page cache pages that
> the server is about to overwrite as btrfs does with the following line
> in fs/btrfs/ioctl.c
>
> truncate_inode_pages_range(&inode->i_data, destoff,
> PAGE_CACHE_ALIGN(destoff + len) - 1);
>
> (and doesn't truncate_inode_pages_range handle page cache alignment
> anyway - and also why did btrfs use truncate_inode_pages_range instead
> of invalidate?)
>
> Does nfs client ever have the case where two different superblocks map
> to the same nfs export (and thus the check below is restricting the
> ability to do server side copy)?
>
> + if (inode_in->i_sb != inode_out->i_sb ||
> + file_in->f_path.mnt != file_out->f_path.mnt)
> + return -EXDEV;
The client attempts to use the same superblock whenever it can.
I suppose you're also losing the opportunity to copy between two
different filesystems on the same server, which should be faster than
requiring the client to do the copy.
--b.
>
> I am working on cifs client patches for the ioctl, and the new syscall
> also looks pretty easy. Some popular cifs servers (like Windows)
> have supported smb/cifs copy offload for many, many years - and now
> Samba with the support that David Disseldorp added for the clone range
> ioctl has been supporting copychunk (server side copy from Windows to
> Samba) so about time to finish the cifs client equivalent.
>
> --
> Thanks,
>
> Steve
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
[not found] ` <CAH2r5ms0P8Hgv1mUpyHA32Er38iiaC1HHC4fhxvz2SBFy6Sucw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2013-05-15 19:36 ` Zach Brown
[not found] ` <20130515193600.GA318-fypN+1c5dIyjpB87vu3CluTW4wlIGRCZ@public.gmane.org>
0 siblings, 1 reply; 14+ messages in thread
From: Zach Brown @ 2013-05-15 19:36 UTC (permalink / raw)
To: Steve French; +Cc: linux-fsdevel, linux-cifs-u79uwXL29TY76Z2rM5mHXA
On Wed, May 15, 2013 at 12:50:13PM -0500, Steve French wrote:
> Doesn't the new syscall have to invalidate the page cache pages that
> the server is about to overwrite as btrfs does with the following line
> in fs/btrfs/ioctl.c
>
> truncate_inode_pages_range(&inode->i_data, destoff,
> PAGE_CACHE_ALIGN(destoff + len) - 1);
The file_operations ->copy_range implementation is responsible for this,
yeah, similar to ->write being responsible for invalidating around
O_DIRECT writes.
> (and doesn't truncate_inode_pages_range handle page cache alignment
> anyway
No, it bugs if given an end offset that isn't page aligned:
BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
Lukas is working on a patch series to fix this:
http://marc.info/?l=linux-fsdevel&m=136854986101843&w=2
> - and also why did btrfs use truncate_inode_pages_range instead
> of invalidate?)
I'm not sure. Maybe they can have racing dirty pages and want them
thrown away rather than having invalidation fail?
> Does nfs client ever have the case where two different superblocks map
> to the same nfs export (and thus the check below is restricting the
> ability to do server side copy)?
>
> + if (inode_in->i_sb != inode_out->i_sb ||
> + file_in->f_path.mnt != file_out->f_path.mnt)
> + return -EXDEV;
Ah, yes, thanks for bringing this up. This check was brought over from
the btrfs ioctl code into the core just to make our lives less
complicated in the short term.
We may well want to push this test down into the ->copy_range methods so
that some can support these cross-sb/mnt copies. I'm not sure how the
nfs client manages all these relationships, but yes, what you suggest
seems plausible.
There's also the long term possibility of using the nfs copy op to
offload copying between files on different servers. The spec talks
about this quite a bit (how the client informs both servers, whether to
use nfs between the servers or some server-specific protocol), but this
certainly isn't my priority.
I figure we can worry about this once we have a ->copy_range method that
can copy files like this safely and are stopped by the test. It's easy
enough to move the test around.
> I am working on cifs client patches for the ioctl, and the new syscall
> also looks pretty easy. Some popular cifs servers (like Windows)
> have supported smb/cifs copy offload for many, many years - and now
> Samba with the support that David Disseldorp added for the clone range
> ioctl has been supporting copychunk (server side copy from Windows to
> Samba) so about time to finish the cifs client equivalent.
Cool, let me know if anything strange pops up.
- z
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
[not found] ` <20130515185429.GA25994-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
@ 2013-05-15 19:39 ` Zach Brown
0 siblings, 0 replies; 14+ messages in thread
From: Zach Brown @ 2013-05-15 19:39 UTC (permalink / raw)
To: J. Bruce Fields
Cc: Steve French, linux-fsdevel, linux-cifs-u79uwXL29TY76Z2rM5mHXA
On Wed, May 15, 2013 at 02:54:29PM -0400, J. Bruce Fields wrote:
> On Wed, May 15, 2013 at 12:50:13PM -0500, Steve French wrote:
> > Does nfs client ever have the case where two different superblocks map
> > to the same nfs export (and thus the check below is restricting the
> > ability to do server side copy)?
> >
> > + if (inode_in->i_sb != inode_out->i_sb ||
> > + file_in->f_path.mnt != file_out->f_path.mnt)
> > + return -EXDEV;
>
> The client attempts to use the same superblock whenever it can.
>
> I suppose you're also losing the opportunity to copy between two
> different filesystems on the same server, which should be faster than
> requiring the client to do the copy.
Ah, yeah, that'd probably be the simplest motivation to move this down
into the ->copy_range methods. It should be trivial to test once the
nfs bits are going and we have the page cache fallback.
- z
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-14 21:15 ` [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point Zach Brown
@ 2013-05-15 19:44 ` Eric Wong
2013-05-15 20:03 ` Zach Brown
0 siblings, 1 reply; 14+ messages in thread
From: Eric Wong @ 2013-05-15 19:44 UTC (permalink / raw)
To: Zach Brown
Cc: Martin K. Petersen, Trond Myklebust, linux-kernel, linux-fsdevel,
linux-btrfs, linux-nfs
Zach Brown <zab@redhat.com> wrote:
> This adds a syscall and vfs entry point for clone_range which offloads
> data copying between existing files.
>
> The syscall is a thin wrapper around the vfs entry point. Its arguments
> are inspired by sys_splice().
Why introduce a new syscall instead of extending sys_splice?
Perhaps adding a new SPLICE_F_COPYRANGE flag to avoid compatibility
problems with userspace which may expect ESPIPE when given two non-pipes
would be useful.
If the user doesn't need a out offset, then sendfile() should also be
able to transparently utilize COPY/CLONE_RANGE, too.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-15 19:44 ` Eric Wong
@ 2013-05-15 20:03 ` Zach Brown
2013-05-16 21:16 ` Ric Wheeler
2013-05-21 19:47 ` Eric Wong
0 siblings, 2 replies; 14+ messages in thread
From: Zach Brown @ 2013-05-15 20:03 UTC (permalink / raw)
To: Eric Wong
Cc: Martin K. Petersen, Trond Myklebust, linux-kernel, linux-fsdevel,
linux-btrfs, linux-nfs
On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
> Why introduce a new syscall instead of extending sys_splice?
Personally, I think it's ugly to have different operations use the same
syscall just because their arguments match.
But that preference aside, sure, if the consensus is that we'd rather
use the splice() entry point then I can duck tape the pieces together to
make it work.
> If the user doesn't need a out offset, then sendfile() should also be
> able to transparently utilize COPY/CLONE_RANGE, too.
Perhaps, yeah.
- z
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
[not found] ` <20130515193600.GA318-fypN+1c5dIyjpB87vu3CluTW4wlIGRCZ@public.gmane.org>
@ 2013-05-15 20:08 ` Steve French
2013-05-15 20:16 ` Chris Mason
0 siblings, 1 reply; 14+ messages in thread
From: Steve French @ 2013-05-15 20:08 UTC (permalink / raw)
To: Zach Brown
Cc: linux-fsdevel, linux-cifs-u79uwXL29TY76Z2rM5mHXA, J. Bruce Fields
On Wed, May 15, 2013 at 2:36 PM, Zach Brown <zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Wed, May 15, 2013 at 12:50:13PM -0500, Steve French wrote:
>> Doesn't the new syscall have to invalidate the page cache pages that
>> the server is about to overwrite as btrfs does with the following line
>> in fs/btrfs/ioctl.c
>>
>> truncate_inode_pages_range(&inode->i_data, destoff,
>> PAGE_CACHE_ALIGN(destoff + len) - 1);
>
> The file_operations ->copy_range implementation is responsible for this,
> yeah, similar to ->write being responsible for invalidating around
> O_DIRECT writes.
>
>> (and doesn't truncate_inode_pages_range handle page cache alignment
>> anyway
>
> No, it bugs if given an end offset that isn't page aligned:
>
> BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
>
> Lukas is working on a patch series to fix this:
>
> http://marc.info/?l=linux-fsdevel&m=136854986101843&w=2
>
>> - and also why did btrfs use truncate_inode_pages_range instead
>> of invalidate?)
>
> I'm not sure. Maybe they can have racing dirty pages and want them
> thrown away rather than having invalidation fail?
Ideally cifs (and presumably) nfs would do a flush of the beginning of
the first page (before source offset) and/or the end of the last page
of the range (after target offset) if dirty (to send it to the server
before the copy) if they are not writing the complete page out, and
then toss the pages (including the first and last partial pages) from
the page cache. Is there a good fs example for code for this?
>> Does nfs client ever have the case where two different superblocks map
>> to the same nfs export (and thus the check below is restricting the
>> ability to do server side copy)?
>>
>> + if (inode_in->i_sb != inode_out->i_sb ||
>> + file_in->f_path.mnt != file_out->f_path.mnt)
>> + return -EXDEV;
>
> Ah, yes, thanks for bringing this up. This check was brought over from
> the btrfs ioctl code into the core just to make our lives less
> complicated in the short term.
>
> We may well want to push this test down into the ->copy_range methods so
> that some can support these cross-sb/mnt copies. I'm not sure how the
> nfs client manages all these relationships, but yes, what you suggest
> seems plausible.
>
> There's also the long term possibility of using the nfs copy op to
> offload copying between files on different servers. The spec talks
> about this quite a bit (how the client informs both servers, whether to
> use nfs between the servers or some server-specific protocol), but this
> certainly isn't my priority.
>
> I figure we can worry about this once we have a ->copy_range method that
> can copy files like this safely and are stopped by the test. It's easy
> enough to move the test around.
>
>> I am working on cifs client patches for the ioctl, and the new syscall
>> also looks pretty easy. Some popular cifs servers (like Windows)
>> have supported smb/cifs copy offload for many, many years - and now
>> Samba with the support that David Disseldorp added for the clone range
>> ioctl has been supporting copychunk (server side copy from Windows to
>> Samba) so about time to finish the cifs client equivalent.
>
> Cool, let me know if anything strange pops up.
>
> - z
--
Thanks,
Steve
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-15 20:08 ` Steve French
@ 2013-05-15 20:16 ` Chris Mason
[not found] ` <20130515201614.24668.83788-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
0 siblings, 1 reply; 14+ messages in thread
From: Chris Mason @ 2013-05-15 20:16 UTC (permalink / raw)
To: Steve French, Zach Brown
Cc: linux-fsdevel, linux-cifs@vger.kernel.org, J. Bruce Fields
Quoting Steve French (2013-05-15 16:08:16)
> On Wed, May 15, 2013 at 2:36 PM, Zach Brown <zab@redhat.com> wrote:
> > On Wed, May 15, 2013 at 12:50:13PM -0500, Steve French wrote:
> >> Doesn't the new syscall have to invalidate the page cache pages that
> >> the server is about to overwrite as btrfs does with the following line
> >> in fs/btrfs/ioctl.c
> >>
> >> truncate_inode_pages_range(&inode->i_data, destoff,
> >> PAGE_CACHE_ALIGN(destoff + len) - 1);
> >
> > The file_operations ->copy_range implementation is responsible for this,
> > yeah, similar to ->write being responsible for invalidating around
> > O_DIRECT writes.
> >
> >> (and doesn't truncate_inode_pages_range handle page cache alignment
> >> anyway
> >
> > No, it bugs if given an end offset that isn't page aligned:
> >
> > BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
> >
> > Lukas is working on a patch series to fix this:
> >
> > http://marc.info/?l=linux-fsdevel&m=136854986101843&w=2
> >
> >> - and also why did btrfs use truncate_inode_pages_range instead
> >> of invalidate?)
> >
> > I'm not sure. Maybe they can have racing dirty pages and want them
> > thrown away rather than having invalidation fail?
truncate_inode_pages_range ends with invalidate. It just locks the page
first and makes sure to wait for writeback if it was already in
progress.
Looking at the btrfs side, I think we should add some locking on the
destination extents as well.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
[not found] ` <20130515201614.24668.83788-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2013-05-15 20:21 ` Steve French
2013-05-15 20:25 ` Chris Mason
0 siblings, 1 reply; 14+ messages in thread
From: Steve French @ 2013-05-15 20:21 UTC (permalink / raw)
To: Chris Mason
Cc: Zach Brown, linux-fsdevel,
linux-cifs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
J. Bruce Fields
On Wed, May 15, 2013 at 3:16 PM, Chris Mason <chris.mason-5c4llco8/ftWk0Htik3J/w@public.gmane.org> wrote:
> Quoting Steve French (2013-05-15 16:08:16)
>> On Wed, May 15, 2013 at 2:36 PM, Zach Brown <zab-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> > On Wed, May 15, 2013 at 12:50:13PM -0500, Steve French wrote:
>> >> Doesn't the new syscall have to invalidate the page cache pages that
>> >> the server is about to overwrite as btrfs does with the following line
>> >> in fs/btrfs/ioctl.c
>> >>
>> >> truncate_inode_pages_range(&inode->i_data, destoff,
>> >> PAGE_CACHE_ALIGN(destoff + len) - 1);
>> >
>> > The file_operations ->copy_range implementation is responsible for this,
>> > yeah, similar to ->write being responsible for invalidating around
>> > O_DIRECT writes.
>> >
>> >> (and doesn't truncate_inode_pages_range handle page cache alignment
>> >> anyway
>> >
>> > No, it bugs if given an end offset that isn't page aligned:
>> >
>> > BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
>> >
>> > Lukas is working on a patch series to fix this:
>> >
>> > http://marc.info/?l=linux-fsdevel&m=136854986101843&w=2
>> >
>> >> - and also why did btrfs use truncate_inode_pages_range instead
>> >> of invalidate?)
>> >
>> > I'm not sure. Maybe they can have racing dirty pages and want them
>> > thrown away rather than having invalidation fail?
>
> truncate_inode_pages_range ends with invalidate. It just locks the page
> first and makes sure to wait for writeback if it was already in
> progress.
Does
truncate_inode_pages_range(&inode->i_data, destoff,
PAGE_CACHE_ALIGN(destoff + len) - 1);
I also am worried about what does:
1) writing out data from start of previous page up to destoff (if
destoff not page aligned, and first page dirty)
2) writing out data from dest+len through end of page (if destoff not
page aligned, and last page dirty)
before tossing first through last page out
--
Thanks,
Steve
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-15 20:21 ` Steve French
@ 2013-05-15 20:25 ` Chris Mason
0 siblings, 0 replies; 14+ messages in thread
From: Chris Mason @ 2013-05-15 20:25 UTC (permalink / raw)
To: Steve French
Cc: Zach Brown, linux-fsdevel, linux-cifs@vger.kernel.org,
J. Bruce Fields
Quoting Steve French (2013-05-15 16:21:12)
> On Wed, May 15, 2013 at 3:16 PM, Chris Mason <chris.mason@fusionio.com> wrote:
> > truncate_inode_pages_range ends with invalidate. It just locks the page
> > first and makes sure to wait for writeback if it was already in
> > progress.
>
> Does
>
> truncate_inode_pages_range(&inode->i_data, destoff,
> PAGE_CACHE_ALIGN(destoff + len) - 1);
>
>
> I also am worried about what does:
>
> 1) writing out data from start of previous page up to destoff (if
> destoff not page aligned, and first page dirty)
> 2) writing out data from dest+len through end of page (if destoff not
> page aligned, and last page dirty)
>
> before tossing first through last page out
It does a check for a partial first page (sending down the partial
offset to invalidatepage), but the last page is fully truncated.
-chris
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-15 20:03 ` Zach Brown
@ 2013-05-16 21:16 ` Ric Wheeler
2013-05-21 19:47 ` Eric Wong
1 sibling, 0 replies; 14+ messages in thread
From: Ric Wheeler @ 2013-05-16 21:16 UTC (permalink / raw)
To: Zach Brown
Cc: Eric Wong, Martin K. Petersen, Trond Myklebust, linux-kernel,
linux-fsdevel, linux-btrfs, linux-nfs
On 05/15/2013 04:03 PM, Zach Brown wrote:
> On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
>> Why introduce a new syscall instead of extending sys_splice?
> Personally, I think it's ugly to have different operations use the same
> syscall just because their arguments match.
I agree with Zach - having a system call called "splice" do copy offloads is not
intuitive.
This is a very reasonable name for something that battled its way through
several standards bodies (for NFS and SCSI :)), so we should give it a
reasonable name
thanks!
Ric
>
> But that preference aside, sure, if the consensus is that we'd rather
> use the splice() entry point then I can duck tape the pieces together to
> make it work.
>
>> If the user doesn't need a out offset, then sendfile() should also be
>> able to transparently utilize COPY/CLONE_RANGE, too.
> Perhaps, yeah.
>
> - z
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-15 20:03 ` Zach Brown
2013-05-16 21:16 ` Ric Wheeler
@ 2013-05-21 19:47 ` Eric Wong
2013-05-21 19:50 ` Zach Brown
1 sibling, 1 reply; 14+ messages in thread
From: Eric Wong @ 2013-05-21 19:47 UTC (permalink / raw)
To: Zach Brown
Cc: Martin K. Petersen, Trond Myklebust, linux-kernel, linux-fsdevel,
linux-btrfs, linux-nfs
Zach Brown <zab@redhat.com> wrote:
> On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
> > Why introduce a new syscall instead of extending sys_splice?
>
> Personally, I think it's ugly to have different operations use the same
> syscall just because their arguments match.
Fair enough. I think adding a (currently unused) flags parameter would
make sense for future-proofing.
If this were extended to sockets/pipes, peek/nonblock/waitall flags
may be added.
Basically, something like splice/tee, but without needing to
create/manage a pipe in userspace or needing extra syscalls/looping.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
2013-05-21 19:47 ` Eric Wong
@ 2013-05-21 19:50 ` Zach Brown
0 siblings, 0 replies; 14+ messages in thread
From: Zach Brown @ 2013-05-21 19:50 UTC (permalink / raw)
To: Eric Wong
Cc: Martin K. Petersen, Trond Myklebust, linux-kernel, linux-fsdevel,
linux-btrfs, linux-nfs
On Tue, May 21, 2013 at 07:47:19PM +0000, Eric Wong wrote:
> Zach Brown <zab@redhat.com> wrote:
> > On Wed, May 15, 2013 at 07:44:05PM +0000, Eric Wong wrote:
> > > Why introduce a new syscall instead of extending sys_splice?
> >
> > Personally, I think it's ugly to have different operations use the same
> > syscall just because their arguments match.
>
> Fair enough. I think adding a (currently unused) flags parameter would
> make sense for future-proofing.
Yeah, that seems reasonble.
- z
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2013-05-21 19:50 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-05-15 17:50 [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point Steve French
2013-05-15 18:54 ` J. Bruce Fields
[not found] ` <20130515185429.GA25994-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2013-05-15 19:39 ` Zach Brown
[not found] ` <CAH2r5ms0P8Hgv1mUpyHA32Er38iiaC1HHC4fhxvz2SBFy6Sucw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-15 19:36 ` Zach Brown
[not found] ` <20130515193600.GA318-fypN+1c5dIyjpB87vu3CluTW4wlIGRCZ@public.gmane.org>
2013-05-15 20:08 ` Steve French
2013-05-15 20:16 ` Chris Mason
[not found] ` <20130515201614.24668.83788-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2013-05-15 20:21 ` Steve French
2013-05-15 20:25 ` Chris Mason
-- strict thread matches above, loose matches on Subject: below --
2013-05-14 21:15 [RFC v0 0/4] sys_copy_range() rough draft Zach Brown
2013-05-14 21:15 ` [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point Zach Brown
2013-05-15 19:44 ` Eric Wong
2013-05-15 20:03 ` Zach Brown
2013-05-16 21:16 ` Ric Wheeler
2013-05-21 19:47 ` Eric Wong
2013-05-21 19:50 ` Zach Brown
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).