From: Amir Goldstein <amir73il@gmail.com>
To: Daniel Rosenberg <drosen@google.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>,
Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
John Fastabend <john.fastabend@gmail.com>,
Andrii Nakryiko <andrii@kernel.org>,
Martin KaFai Lau <martin.lau@linux.dev>,
Song Liu <song@kernel.org>, Yonghong Song <yhs@fb.com>,
KP Singh <kpsingh@kernel.org>,
Stanislav Fomichev <sdf@google.com>, Hao Luo <haoluo@google.com>,
Jiri Olsa <jolsa@kernel.org>,
Paul Lawrence <paullawrence@google.com>,
Alessio Balsini <balsini@google.com>,
David Anderson <dvander@google.com>,
Sandeep Patil <sspatil@google.com>,
linux-fsdevel@vger.kernel.org, bpf@vger.kernel.org,
kernel-team@android.com
Subject: Re: [PATCH 15/26] fuse-bpf: Add support for read/write iter
Date: Sat, 1 Oct 2022 09:53:59 +0300 [thread overview]
Message-ID: <CAOQ4uxjssUfKxoTzzervqGaHNPcz1sK-JAVAXFE7=gEdkcYhvQ@mail.gmail.com> (raw)
In-Reply-To: <20220926231822.994383-16-drosen@google.com>
On Tue, Sep 27, 2022 at 2:46 AM Daniel Rosenberg <drosen@google.com> wrote:
>
> Signed-off-by: Daniel Rosenberg <drosen@google.com>
> Signed-off-by: Paul Lawrence <paullawrence@google.com>
> ---
> fs/fuse/backing.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++
> fs/fuse/control.c | 2 +-
> fs/fuse/file.c | 28 +++++
> fs/fuse/fuse_i.h | 42 ++++++-
> fs/fuse/inode.c | 13 +++
> 5 files changed, 374 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
> index 1fe61177cdfb..cf4ad9f4fe10 100644
> --- a/fs/fuse/backing.c
> +++ b/fs/fuse/backing.c
> @@ -12,6 +12,47 @@
> #include <linux/namei.h>
> #include <linux/bpf_fuse.h>
>
> +#define FUSE_BPF_IOCB_MASK (IOCB_APPEND | IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC)
> +
> +struct fuse_bpf_aio_req {
> + struct kiocb iocb;
> + refcount_t ref;
> + struct kiocb *iocb_orig;
> +};
> +
> +static struct kmem_cache *fuse_bpf_aio_request_cachep;
> +
> +static void fuse_file_accessed(struct file *dst_file, struct file *src_file)
> +{
> + struct inode *dst_inode;
> + struct inode *src_inode;
> +
> + if (dst_file->f_flags & O_NOATIME)
> + return;
> +
> + dst_inode = file_inode(dst_file);
> + src_inode = file_inode(src_file);
> +
> + if ((!timespec64_equal(&dst_inode->i_mtime, &src_inode->i_mtime) ||
> + !timespec64_equal(&dst_inode->i_ctime, &src_inode->i_ctime))) {
> + dst_inode->i_mtime = src_inode->i_mtime;
> + dst_inode->i_ctime = src_inode->i_ctime;
> + }
> +
> + touch_atime(&dst_file->f_path);
> +}
> +
> +static void fuse_copyattr(struct file *dst_file, struct file *src_file)
> +{
> + struct inode *dst = file_inode(dst_file);
> + struct inode *src = file_inode(src_file);
> +
> + dst->i_atime = src->i_atime;
> + dst->i_mtime = src->i_mtime;
> + dst->i_ctime = src->i_ctime;
> + i_size_write(dst, i_size_read(src));
> +}
> +
> struct bpf_prog *fuse_get_bpf_prog(struct file *file)
> {
> struct bpf_prog *bpf_prog = ERR_PTR(-EINVAL);
> @@ -469,6 +510,241 @@ int fuse_lseek_finalize(struct bpf_fuse_args *fa, loff_t *out,
> return 0;
> }
>
> +static inline void fuse_bpf_aio_put(struct fuse_bpf_aio_req *aio_req)
> +{
> + if (refcount_dec_and_test(&aio_req->ref))
> + kmem_cache_free(fuse_bpf_aio_request_cachep, aio_req);
> +}
> +
> +static void fuse_bpf_aio_cleanup_handler(struct fuse_bpf_aio_req *aio_req)
> +{
> + struct kiocb *iocb = &aio_req->iocb;
> + struct kiocb *iocb_orig = aio_req->iocb_orig;
> +
> + if (iocb->ki_flags & IOCB_WRITE) {
> + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
> + SB_FREEZE_WRITE);
> + file_end_write(iocb->ki_filp);
> + fuse_copyattr(iocb_orig->ki_filp, iocb->ki_filp);
> + }
> + iocb_orig->ki_pos = iocb->ki_pos;
> + fuse_bpf_aio_put(aio_req);
> +}
> +
> +static void fuse_bpf_aio_rw_complete(struct kiocb *iocb, long res)
> +{
> + struct fuse_bpf_aio_req *aio_req =
> + container_of(iocb, struct fuse_bpf_aio_req, iocb);
> + struct kiocb *iocb_orig = aio_req->iocb_orig;
> +
> + fuse_bpf_aio_cleanup_handler(aio_req);
> + iocb_orig->ki_complete(iocb_orig, res);
> +}
> +
> +int fuse_file_read_iter_initialize_in(struct bpf_fuse_args *fa, struct fuse_file_read_iter_io *fri,
> + struct kiocb *iocb, struct iov_iter *to)
> +{
> + struct file *file = iocb->ki_filp;
> + struct fuse_file *ff = file->private_data;
> +
> + fri->fri = (struct fuse_read_in) {
> + .fh = ff->fh,
> + .offset = iocb->ki_pos,
> + .size = to->count,
> + };
> +
> + /* TODO we can't assume 'to' is a kvec */
> + /* TODO we also can't assume the vector has only one component */
> + *fa = (struct bpf_fuse_args) {
> + .opcode = FUSE_READ,
> + .nodeid = ff->nodeid,
> + .in_numargs = 1,
> + .in_args[0].size = sizeof(fri->fri),
> + .in_args[0].value = &fri->fri,
> + /*
> + * TODO Design this properly.
> + * Possible approach: do not pass buf to bpf
> + * If going to userland, do a deep copy
> + * For extra credit, do that to/from the vector, rather than
> + * making an extra copy in the kernel
> + */
> + };
> +
> + return 0;
> +}
> +
> +int fuse_file_read_iter_initialize_out(struct bpf_fuse_args *fa, struct fuse_file_read_iter_io *fri,
> + struct kiocb *iocb, struct iov_iter *to)
> +{
> + fri->frio = (struct fuse_read_iter_out) {
> + .ret = fri->fri.size,
> + };
> +
> + fa->out_numargs = 1;
> + fa->out_args[0].size = sizeof(fri->frio);
> + fa->out_args[0].value = &fri->frio;
> +
> + return 0;
> +}
> +
> +int fuse_file_read_iter_backing(struct bpf_fuse_args *fa, ssize_t *out,
> + struct kiocb *iocb, struct iov_iter *to)
> +{
> + struct fuse_read_iter_out *frio = fa->out_args[0].value;
> + struct file *file = iocb->ki_filp;
> + struct fuse_file *ff = file->private_data;
> +
> + if (!iov_iter_count(to))
> + return 0;
> +
> + if ((iocb->ki_flags & IOCB_DIRECT) &&
> + (!ff->backing_file->f_mapping->a_ops ||
> + !ff->backing_file->f_mapping->a_ops->direct_IO))
> + return -EINVAL;
> +
> + /* TODO This just plain ignores any change to fuse_read_in */
> + if (is_sync_kiocb(iocb)) {
> + *out = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos,
> + iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK));
> + } else {
> + struct fuse_bpf_aio_req *aio_req;
> +
> + *out = -ENOMEM;
> + aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL);
> + if (!aio_req)
> + goto out;
> +
> + aio_req->iocb_orig = iocb;
> + kiocb_clone(&aio_req->iocb, iocb, ff->backing_file);
> + aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete;
> + refcount_set(&aio_req->ref, 2);
> + *out = vfs_iocb_iter_read(ff->backing_file, &aio_req->iocb, to);
> + fuse_bpf_aio_put(aio_req);
> + if (*out != -EIOCBQUEUED)
> + fuse_bpf_aio_cleanup_handler(aio_req);
> + }
> +
> + frio->ret = *out;
> +
> + /* TODO Need to point value at the buffer for post-modification */
> +
> +out:
> + fuse_file_accessed(file, ff->backing_file);
> +
> + return *out;
> +}
> +
> +int fuse_file_read_iter_finalize(struct bpf_fuse_args *fa, ssize_t *out,
> + struct kiocb *iocb, struct iov_iter *to)
> +{
> + struct fuse_read_iter_out *frio = fa->out_args[0].value;
> +
> + *out = frio->ret;
> +
> + return 0;
> +}
> +
> +int fuse_file_write_iter_initialize_in(struct bpf_fuse_args *fa,
> + struct fuse_file_write_iter_io *fwio,
> + struct kiocb *iocb, struct iov_iter *from)
> +{
> + struct file *file = iocb->ki_filp;
> + struct fuse_file *ff = file->private_data;
> +
> + *fwio = (struct fuse_file_write_iter_io) {
> + .fwi.fh = ff->fh,
> + .fwi.offset = iocb->ki_pos,
> + .fwi.size = from->count,
> + };
> +
> + /* TODO we can't assume 'from' is a kvec */
> + *fa = (struct bpf_fuse_args) {
> + .opcode = FUSE_WRITE,
> + .nodeid = ff->nodeid,
> + .in_numargs = 2,
> + .in_args[0].size = sizeof(fwio->fwi),
> + .in_args[0].value = &fwio->fwi,
> + .in_args[1].size = fwio->fwi.size,
> + .in_args[1].value = from->kvec->iov_base,
> + };
> +
> + return 0;
> +}
> +
> +int fuse_file_write_iter_initialize_out(struct bpf_fuse_args *fa,
> + struct fuse_file_write_iter_io *fwio,
> + struct kiocb *iocb, struct iov_iter *from)
> +{
> + /* TODO we can't assume 'from' is a kvec */
> + fa->out_numargs = 1;
> + fa->out_args[0].size = sizeof(fwio->fwio);
> + fa->out_args[0].value = &fwio->fwio;
> +
> + return 0;
> +}
> +
> +int fuse_file_write_iter_backing(struct bpf_fuse_args *fa, ssize_t *out,
> + struct kiocb *iocb, struct iov_iter *from)
> +{
> + struct file *file = iocb->ki_filp;
> + struct fuse_file *ff = file->private_data;
> + struct fuse_write_iter_out *fwio = fa->out_args[0].value;
> +
> + if (!iov_iter_count(from))
> + return 0;
> +
> + /* TODO This just plain ignores any change to fuse_write_in */
> + /* TODO uint32_t seems smaller than ssize_t.... right? */
> + inode_lock(file_inode(file));
> +
> + fuse_copyattr(file, ff->backing_file);
> +
> + if (is_sync_kiocb(iocb)) {
> + file_start_write(ff->backing_file);
> + *out = vfs_iter_write(ff->backing_file, from, &iocb->ki_pos,
> + iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK));
> + file_end_write(ff->backing_file);
> +
> + /* Must reflect change in size of backing file to upper file */
> + if (*out > 0)
> + fuse_copyattr(file, ff->backing_file);
Regarding attribute cache, things can get a tad more complicated
when the inode is not purely passthrough.
To put things in context, the reason that ovl_copyattr() is correct
in ovl_write_iter() is because the overlayfs inode is purely passthrough
to the backing inode, that is, all operations are passthrough.
The only incident when backing inode changes (copy up) takes
care of copying inode attributes.
This is not the case with FUSE passthrough.
Not in Alessio's FUSE_PASSTHROUGH patches and not in
this proposal.
With FUSE passthrough, every inode is hybrid, some operations
can be served from the backing inode and some operations served
from the server.
My FUSE passthrough branch [1] has two fixes on top of Allesio's patches
to fix two issues regarding attribute caches (size and times).
[1] https://github.com/amir73il/linux/commits/linux-5.10.y-fuse-passthrough
This problem is not unique to FUSE.
It also exists for attribute caches of NFS clients and the NFS protocol
has several techniques to deal with this problem, but it is certainly not
a trivial issue.
As a matter of fact, I found the problems fixed in my branch above
by running the nfstest_posix [2] tests on the FUSE passthrough code.
[2] https://wiki.linux-nfs.org/wiki/index.php/NFStest#nfstest_posix_-_POSIX_file_system_level_access_tests
The easiest way out is to declare one of the copies (backing or remote)
the authoritative copy w.r.t. attributes (like noac nfs mount option).
Declaring the remote attribute copy authoritative (as FUSE usually does)
has performance implications.
I guess if FUSE-bpf attaches a backing inode to FUSE inode on lookup
then the option of making the backing inode attributes authoritative
(like in overlayfs) is valid, but I think this needs to be spelled out.
Thanks,
Amir.
next prev parent reply other threads:[~2022-10-01 6:54 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-09-26 23:17 [PATCH 00/26] FUSE BPF: A Stacked Filesystem Extension for FUSE Daniel Rosenberg
2022-09-26 23:17 ` [PATCH 01/26] bpf: verifier: Allow for multiple packets Daniel Rosenberg
2022-09-26 23:17 ` [PATCH 02/26] bpf: verifier: Allow single packet invalidation Daniel Rosenberg
2022-09-27 4:49 ` kernel test robot
2022-09-29 4:21 ` kernel test robot
2022-09-26 23:17 ` [PATCH 03/26] fuse-bpf: Update uapi for fuse-bpf Daniel Rosenberg
2022-09-27 18:19 ` Miklos Szeredi
2022-09-30 22:02 ` Paul Lawrence
2022-10-01 7:47 ` Amir Goldstein
2022-09-26 23:18 ` [PATCH 04/26] fuse-bpf: Add BPF supporting functions Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 05/26] fs: Generic function to convert iocb to rw flags Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 06/26] bpf: Export bpf_prog_fops Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 07/26] fuse-bpf: Prepare for fuse-bpf patch Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 08/26] fuse: Add fuse-bpf, a stacked fs extension for FUSE Daniel Rosenberg
2022-09-27 22:45 ` kernel test robot
2022-09-29 5:09 ` kernel test robot
2022-09-26 23:18 ` [PATCH 09/26] fuse-bpf: Don't support export_operations Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 10/26] fuse-bpf: Partially add mapping support Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 11/26] fuse-bpf: Add lseek support Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 12/26] fuse-bpf: Add support for fallocate Daniel Rosenberg
2022-09-27 22:07 ` Dave Chinner
2022-09-27 23:36 ` Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 13/26] fuse-bpf: Support file/dir open/close Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 14/26] fuse-bpf: Support mknod/unlink/mkdir/rmdir Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 15/26] fuse-bpf: Add support for read/write iter Daniel Rosenberg
2022-10-01 6:53 ` Amir Goldstein [this message]
2022-09-26 23:18 ` [PATCH 16/26] fuse-bpf: support FUSE_READDIR Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 17/26] fuse-bpf: Add support for sync operations Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 18/26] fuse-bpf: Add Rename support Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 19/26] fuse-bpf: Add attr support Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 20/26] fuse-bpf: Add support for FUSE_COPY_FILE_RANGE Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 21/26] fuse-bpf: Add xattr support Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 22/26] fuse-bpf: Add symlink/link support Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 23/26] fuse-bpf: allow mounting with no userspace daemon Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 24/26] fuse-bpf: Call bpf for pre/post filters Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 25/26] fuse-bpf: Add userspace " Daniel Rosenberg
2022-09-26 23:18 ` [PATCH 26/26] fuse-bpf: Add selftests Daniel Rosenberg
2022-09-28 6:41 ` [PATCH 00/26] FUSE BPF: A Stacked Filesystem Extension for FUSE Martin KaFai Lau
2022-09-28 12:31 ` Brian Foster
2022-10-01 0:47 ` Daniel Rosenberg
2022-10-01 0:05 ` Daniel Rosenberg
2022-10-01 0:24 ` Alexei Starovoitov
2022-10-06 1:58 ` Martin KaFai Lau
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAOQ4uxjssUfKxoTzzervqGaHNPcz1sK-JAVAXFE7=gEdkcYhvQ@mail.gmail.com' \
--to=amir73il@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=balsini@google.com \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=drosen@google.com \
--cc=dvander@google.com \
--cc=haoluo@google.com \
--cc=john.fastabend@gmail.com \
--cc=jolsa@kernel.org \
--cc=kernel-team@android.com \
--cc=kpsingh@kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=martin.lau@linux.dev \
--cc=miklos@szeredi.hu \
--cc=paullawrence@google.com \
--cc=sdf@google.com \
--cc=song@kernel.org \
--cc=sspatil@google.com \
--cc=yhs@fb.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).