* Re: [fuse-devel] fuse zero copy
[not found] <20190805101351.GB21755@GEERT-PC>
@ 2019-08-05 12:14 ` Miklos Szeredi
2019-08-07 9:12 ` Geert Custers
0 siblings, 1 reply; 2+ messages in thread
From: Miklos Szeredi @ 2019-08-05 12:14 UTC (permalink / raw)
To: Geert Custers; +Cc: fuse-devel, linux-fsdevel
[-- Attachment #1: Type: text/plain, Size: 1942 bytes --]
On Mon, Aug 5, 2019 at 12:14 PM Geert Custers
<geert.aj.custers@gmail.com> wrote:
>
> Hello,
>
> I recently wrote a fuse driver for a custom on-disk file system. As far
> as I could tell, the documentation didn't have an explicit example for
> how to deal with filesystems that are on disk, most examples centered
> around a very simple in memory "filesystem". The way I do it now is
> by calling open() on the .img on init and fread()'ing and fwrite()'ing
> to it (with some caching to make it faster). I'm not sure this is the
> proper way to implement something like this, but I'll assume it is.
>
> Implementing the file system I noticed that when performing fuse read()s
> and write()s that I do a lot of unneeded copying. Right now I fread()
> into the buffer passed to the read() function, but as far as I can tell
> this buffer is then copied from the fuse server to kernel space where it
> is copied back to the user program. A more natural way (the way I see it)
> would be a mechanism by which I could tell the kernel "read from fd 4
> 512 bytes starting at position 0x1000" for example. Then the whole
> operation involves only one copy operation. Reading around I have seen
> some ideas around this, but as far as I could tell this isn't actively
> being worked on... So my question is if there are any plans to implement a
> zero-copy system for fuse.
Actually it is being worked on. Attaching the current
proof-of-concept kernel patch for this.
I don't have a patch for libfuse yet, as I'm testing new ideas with a
dummy filesystem that does raw /dev/fuse access. Also attached, needs
to be run with "-m" to enable the file mapping mode.
To make this more useful, the kernel would need to cache the mapping,
so it doesn't need to issue a MAP request on each read. That would
also optimize the case of long extents, or files mirrored completely
from an underlying filesystem (as done by the test program).
Thanks,
Miklos
[-- Attachment #2: fuse-add-map-request.patch --]
[-- Type: text/x-patch, Size: 6657 bytes --]
From a90a38e4700fbf0e8f73ce19cb6dfe30db5902f2 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 5 Aug 2019 13:44:59 +0200
Subject: [PATCH] fuse: add map request
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
fs/fuse/Makefile | 2 +-
fs/fuse/dev.c | 23 ++++++++++++++
fs/fuse/file.c | 64 +++++++++++++++++++++++++++++++++++++++
fs/fuse/fuse_i.h | 5 +++
fs/fuse/map.c | 58 +++++++++++++++++++++++++++++++++++
include/uapi/linux/fuse.h | 10 ++++++
6 files changed, 161 insertions(+), 1 deletion(-)
create mode 100644 fs/fuse/map.c
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 9485019c2a14..7e110c77d553 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -6,4 +6,4 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o map.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ea8237513dfa..ed64ce383b11 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2306,6 +2306,26 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
return 0;
}
+/*
+ * Handle the /dev/fuse ioctls that register and unregister map files.
+ *
+ * FUSE_DEV_IOC_MAP_OPEN:  @arg is a file descriptor; the file it refers to
+ *                         is looked up with fget() and handed to
+ *                         fuse_map_open(), whose result is returned.
+ * FUSE_DEV_IOC_MAP_CLOSE: @arg is handed to fuse_map_close(), whose result
+ *                         is returned.
+ *
+ * Returns -EBADF if the descriptor lookup fails and -ENOTTY for any other
+ * command.
+ */
+static long fuse_dev_map_ioctl(unsigned int cmd, unsigned long arg)
+{
+	if (cmd == FUSE_DEV_IOC_MAP_OPEN) {
+		struct file *mapfile = fget(arg);
+
+		if (!mapfile)
+			return -EBADF;
+		return fuse_map_open(mapfile);
+	}
+
+	if (cmd == FUSE_DEV_IOC_MAP_CLOSE)
+		return fuse_map_close(arg);
+
+	return -ENOTTY;
+}
+
+
static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -2338,7 +2358,10 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
fput(old);
}
}
+ } else {
+ err = fuse_dev_map_ioctl(cmd, arg);
}
+
return err;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ae2828beb00..6413f41cd2ac 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1481,6 +1481,67 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
return res;
}
+/*
+ * Send a FUSE_MAP request for the open file behind @iocb.
+ *
+ * The request body is a struct fuse_read_in carrying the file handle, the
+ * current position (iocb->ki_pos), @count and the open file's f_flags.  The
+ * reply is written into @outarg.
+ *
+ * Returns whatever fuse_simple_request() returns.
+ */
+static int fuse_send_map(struct kiocb *iocb, size_t count,
+			 struct fuse_map_out *outarg)
+{
+	struct file *filp = iocb->ki_filp;
+	struct fuse_file *fuse_file = filp->private_data;
+	struct fuse_conn *conn = get_fuse_conn(file_inode(filp));
+	struct fuse_read_in map_in = {
+		.flags = filp->f_flags,
+		.size = count,
+		.offset = iocb->ki_pos,
+		.fh = fuse_file->fh,
+	};
+	FUSE_ARGS(args);
+
+	/* Reply: one argument, stored directly in the caller's buffer. */
+	args.out.args[0].value = outarg;
+	args.out.args[0].size = sizeof(*outarg);
+	args.out.numargs = 1;
+
+	/* Request: one argument, addressed to this file's node. */
+	args.in.args[0].value = &map_in;
+	args.in.args[0].size = sizeof(map_in);
+	args.in.numargs = 1;
+	args.in.h.nodeid = fuse_file->nodeid;
+	args.in.h.opcode = FUSE_MAP;
+
+	return fuse_simple_request(conn, &args);
+}
+
+
+/*
+ * Read into @to by asking the server, through FUSE_MAP requests, which
+ * registered map file and offset back the current file position, and then
+ * reading from that file directly.
+ *
+ * Each pass sends a map request for the bytes still wanted, looks the
+ * replied mapfd up with fuse_map_get(), limits the iterator to the replied
+ * size and reads from the map file at the replied offset.  The loop stops
+ * when the iterator is exhausted, when a request or read fails, when the
+ * reply has size zero, or when a read returns zero bytes.
+ *
+ * Returns the number of bytes read if any were read; otherwise the last
+ * error, or 0 if nothing was read and nothing failed.
+ */
+static ssize_t fuse_file_map_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct fuse_map_out outarg;
+	struct file *mapfile;
+	/*
+	 * res must start at 0: with an empty iterator the loop body never
+	 * runs and "total ?: res" below would otherwise return an
+	 * uninitialized value.
+	 */
+	ssize_t res = 0, total = 0;
+	size_t count;
+	loff_t pos;
+
+	while ((count = iov_iter_count(to))) {
+		res = fuse_send_map(iocb, count, &outarg);
+		if (res || !outarg.size)
+			break;
+
+		res = -EBADF;
+		mapfile = fuse_map_get(outarg.mapfd);
+		if (!mapfile)
+			break;
+
+		/* Never read past the extent the server described. */
+		iov_iter_truncate(to, outarg.size);
+		pos = outarg.offset;
+		res = vfs_iter_read(mapfile, to, &pos, /* FIXME */ 0);
+		fput(mapfile);
+		if (res < 0)
+			break;
+		/* Undo the truncation, minus what was just consumed. */
+		iov_iter_reexpand(to, count - res);
+		if (res == 0)
+			break;
+
+		total += res;
+		iocb->ki_pos += res;
+	}
+
+	return total ?: res;
+}
+
+
static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
@@ -1489,6 +1550,9 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (is_bad_inode(file_inode(file)))
return -EIO;
+ if (ff->open_flags & FOPEN_MAP)
+ return fuse_file_map_iter(iocb, to);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24dbca777775..ea7b0548e034 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1093,4 +1093,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
/* readdir.c */
int fuse_readdir(struct file *file, struct dir_context *ctx);
+/* map.c */
+int fuse_map_open(struct file *file);
+int fuse_map_close(unsigned long mapfd);
+struct file *fuse_map_get(u64 mapfd);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/map.c b/fs/fuse/map.c
new file mode 100644
index 000000000000..e5801b9465cd
--- /dev/null
+++ b/fs/fuse/map.c
@@ -0,0 +1,58 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+#include <linux/idr.h>
+
+/* Held around the idr_alloc() and idr_remove() calls that modify fuse_map. */
+static DEFINE_SPINLOCK(fuse_map_lock);
+/* Table of registered map files; the allocated id is the "mapfd". */
+static DEFINE_IDR(fuse_map);
+
+/*
+ * Register @file in the map table and return the id allocated for it.
+ *
+ * The caller's reference to @file is consumed: on success it stays with the
+ * table entry until fuse_map_close() removes it, on failure it is dropped
+ * here.
+ *
+ * Returns the new id (>= 0) on success or a negative errno from idr_alloc().
+ */
+int fuse_map_open(struct file *file)
+{
+	int res;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&fuse_map_lock);
+	res = idr_alloc(&fuse_map, file, 0, 0, GFP_ATOMIC);
+	spin_unlock(&fuse_map_lock);
+	idr_preload_end();
+	/*
+	 * idr_alloc() returns the allocated id on success, so only a
+	 * negative value is a failure.  Testing "res != 0" would drop the
+	 * reference for every id but 0 while leaving the pointer in the
+	 * table.
+	 */
+	if (res < 0)
+		fput(file);
+
+	return res;
+}
+
+
+/*
+ * Remove the entry @mapfd from the map table and drop the file reference
+ * that was stored with it.
+ *
+ * Returns 0 on success, -EBADF if no entry with that id exists.
+ */
+int fuse_map_close(unsigned long mapfd)
+{
+	struct file *mapfile;
+
+	spin_lock(&fuse_map_lock);
+	mapfile = idr_remove(&fuse_map, mapfd);
+	spin_unlock(&fuse_map_lock);
+
+	if (mapfile) {
+		fput(mapfile);
+		return 0;
+	}
+
+	return -EBADF;
+}
+
+
+/*
+ * Look up the map file registered under @mapfd and take a reference to it.
+ *
+ * Returns the file with an extra reference (release it with fput()), or
+ * NULL if there is no such entry or the file is already being released.
+ */
+struct file *fuse_map_get(u64 mapfd)
+{
+	struct file *file;
+
+	rcu_read_lock();
+	file = idr_find(&fuse_map, mapfd);
+	/*
+	 * The lookup does not take fuse_map_lock, so fuse_map_close() may
+	 * drop the table's reference concurrently.  Only take a reference
+	 * if the count has not already reached zero; a plain get_file()
+	 * could revive a file that is being torn down.
+	 */
+	if (file && !get_file_rcu(file))
+		file = NULL;
+	rcu_read_unlock();
+
+	return file;
+}
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 2971d29a42e4..65fca0128716 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -244,6 +244,7 @@ struct fuse_file_lock {
#define FOPEN_NONSEEKABLE (1 << 2)
#define FOPEN_CACHE_DIR (1 << 3)
#define FOPEN_STREAM (1 << 4)
+#define FOPEN_MAP (1 << 5)
/**
* INIT request/reply flags
@@ -422,6 +423,7 @@ enum fuse_opcode {
FUSE_RENAME2 = 45,
FUSE_LSEEK = 46,
FUSE_COPY_FILE_RANGE = 47,
+ FUSE_MAP = 50,
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -571,6 +573,12 @@ struct fuse_read_in {
uint32_t padding;
};
+/* Reply to a FUSE_MAP request. */
+struct fuse_map_out {
+ /* id of a registered map file; the kernel passes it to fuse_map_get() */
+ uint64_t mapfd;
+ /* position in the map file at which the kernel starts reading */
+ uint64_t offset;
+ /* upper bound on the bytes read for this reply; zero ends the read */
+ uint64_t size;
+};
+
#define FUSE_COMPAT_WRITE_IN_SIZE 24
struct fuse_write_in {
@@ -823,6 +831,8 @@ struct fuse_notify_retrieve_in {
/* Device ioctls: */
#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_MAP_OPEN _IO(229, 4)
+#define FUSE_DEV_IOC_MAP_CLOSE _IO(229, 5)
struct fuse_lseek_in {
uint64_t fh;
--
2.21.0
[-- Attachment #3: loraw.tar.gz --]
[-- Type: application/gzip, Size: 11012 bytes --]
^ permalink raw reply related [flat|nested] 2+ messages in thread