* [PATCH] fuse: enable larger read buffers for readdir.
@ 2023-07-26 10:59 Jaco Kroon
2023-07-26 11:43 ` Jaco Kroon
` (3 more replies)
0 siblings, 4 replies; 50+ messages in thread
From: Jaco Kroon @ 2023-07-26 10:59 UTC (permalink / raw)
To: Miklos Szeredi, linux-fsdevel, linux-kernel; +Cc: Jaco Kroon
Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
fs/fuse/Kconfig | 16 ++++++++++++++++
fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
2 files changed, 40 insertions(+), 18 deletions(-)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 038ed0b9aaa5..0783f9ee5cd3 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -18,6 +18,22 @@ config FUSE_FS
If you want to develop a userspace FS, or if you want to use
a filesystem based on FUSE, answer Y or M.
+config FUSE_READDIR_ORDER
+ int
+ range 0 5
+ default 5
+ help
+ readdir performance varies greatly depending on the size of the read.
+ Larger buffers results in larger reads, thus fewer reads and higher
+ performance in return.
+
+ You may want to reduce this value on seriously constrained memory
+ systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
+
+ This value reprents the order of the number of pages to allocate (ie,
+ the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
+ pages (128KiB).
+
config CUSE
tristate "Character device in Userspace support"
depends on FUSE_FS
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index dc603479b30e..98c62b623240 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -13,6 +13,12 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
+#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
+#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
+#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
+#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
+
static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
struct fuse_conn *fc = get_fuse_conn(dir);
@@ -52,10 +58,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
}
version = fi->rdc.version;
size = fi->rdc.size;
- offset = size & ~PAGE_MASK;
- index = size >> PAGE_SHIFT;
+ offset = size & ~READDIR_PAGES_MASK;
+ index = size >> READDIR_PAGES_SHIFT;
/* Dirent doesn't fit in current page? Jump to next page. */
- if (offset + reclen > PAGE_SIZE) {
+ if (offset + reclen > READDIR_PAGES_SIZE) {
index++;
offset = 0;
}
@@ -83,7 +89,7 @@ static void fuse_add_dirent_to_cache(struct file *file,
}
memcpy(addr + offset, dirent, reclen);
kunmap_local(addr);
- fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
+ fi->rdc.size = (index << READDIR_PAGES_SHIFT) + offset + reclen;
fi->rdc.pos = dirent->off;
unlock:
spin_unlock(&fi->rdc.lock);
@@ -104,7 +110,7 @@ static void fuse_readdir_cache_end(struct file *file, loff_t pos)
}
fi->rdc.cached = true;
- end = ALIGN(fi->rdc.size, PAGE_SIZE);
+ end = ALIGN(fi->rdc.size, READDIR_PAGES_SIZE);
spin_unlock(&fi->rdc.lock);
/* truncate unused tail of cache */
@@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
- struct fuse_page_desc desc = { .length = PAGE_SIZE };
+ struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
u64 attr_version = 0;
bool locked;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
if (!page)
return -ENOMEM;
plus = fuse_use_readdirplus(inode, ctx);
ap->args.out_pages = true;
- ap->num_pages = 1;
+ ap->num_pages = READDIR_PAGES;
ap->pages = &page;
ap->descs = &desc;
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
@@ -383,7 +389,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
void *addr, unsigned int size,
struct dir_context *ctx)
{
- unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
+ unsigned int offset = ff->readdir.cache_off & ~READDIR_PAGES_MASK;
enum fuse_parse_result res = FOUND_NONE;
WARN_ON(offset >= size);
@@ -504,16 +510,16 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
WARN_ON(fi->rdc.size < ff->readdir.cache_off);
- index = ff->readdir.cache_off >> PAGE_SHIFT;
+ index = ff->readdir.cache_off >> READDIR_PAGES_SHIFT;
- if (index == (fi->rdc.size >> PAGE_SHIFT))
- size = fi->rdc.size & ~PAGE_MASK;
+ if (index == (fi->rdc.size >> READDIR_PAGES_SHIFT))
+ size = fi->rdc.size & ~READDIR_PAGES_MASK;
else
- size = PAGE_SIZE;
+ size = READDIR_PAGES_SIZE;
spin_unlock(&fi->rdc.lock);
/* EOF? */
- if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
+ if ((ff->readdir.cache_off & ~READDIR_PAGES_MASK) == size)
return 0;
page = find_get_page_flags(file->f_mapping, index,
@@ -559,9 +565,9 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
if (res == FOUND_ALL)
return 0;
- if (size == PAGE_SIZE) {
+ if (size == READDIR_PAGES_SIZE) {
/* We hit end of page: skip to next page. */
- ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
+ ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, READDIR_PAGES_SIZE);
goto retry;
}
--
2.41.0
^ permalink raw reply related [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 10:59 [PATCH] fuse: enable larger read buffers for readdir Jaco Kroon
@ 2023-07-26 11:43 ` Jaco Kroon
2023-07-26 13:53 ` Bernd Schubert
` (2 subsequent siblings)
3 siblings, 0 replies; 50+ messages in thread
From: Jaco Kroon @ 2023-07-26 11:43 UTC (permalink / raw)
To: Miklos Szeredi, linux-fsdevel, linux-kernel
Hi,
Just to give some context, we've got maildir folders running on top of
glusterfs. Without this, a "medium" sized folder can take several
minutes to go through readdir (getdents64), and each system call will
return 14-18 entries at a time. These calls (as measured using strace
-T -p) take anywhere from around 150 microseconds to several
milliseconds. Inter-server round-trip time (as measured with ping) is
generally 100-120 microseconds, so 150 µs is probably a good lower limit.
I've tested the below patch from a more remote location (7-8ms ping
round-trip), and in spite of this massively increased latency a readdir
iteration over a small folder (a few hundred entries) was still on par
with the local case, usually even marginally better (10%). On larger
folders the difference became much more pronounced, with performance for
the remote location at times being drastically better than for the
"close" location.
This has a few benefits overall that I see:
1. This enables glusterfs to internally read and process larger
chunks. Whether this is happening I don't know (I have a separate
ongoing discussion with the developers on that side to see what can be
done inside of glusterfs itself to improve the mechanisms on the other
side of fuse).
2. Fewer overall system calls (up to 32x fewer getdents64() calls) for
userspace to get a full directory listing; in cases where there are
100k+ files in a folder this makes a HUGE difference (14-18 entries vs
~500 entries per call, so >5000 calls vs ~200).
3. Fewer fuse messages being passed over the fuse link.
One concern I have is that I don't understand all of the caching and
related machinery going on. I'm typically (as per strace) seeing less
than 64KB of data being returned to userspace, so I'm not sure there is
a direct correlation between the FUSE readdir size and the buffer passed
from glibc to the kernel. I'm also slightly worried that the caching may
now be broken due to smaller-than-READDIR_PAGES_SIZE records being
cached; with that said, successive readdir() iterations from userspace
provided identical results, so I *think* this should be OK. Someone with
a better understanding should please confirm.
I made the option configurable (but maxed out by default) in case
memory-constrained systems need to drop the 128KiB value.
I suspect the discrepancy may also relate to the way in which glusterfs
merges directory listings from the underlying bricks where data is
actually stored.
Without this patch the remote system is orders of magnitude slower than
the close-together systems.
Kind regards,
Jaco
On 2023/07/26 12:59, Jaco Kroon wrote:
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
> fs/fuse/Kconfig | 16 ++++++++++++++++
> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
> 2 files changed, 40 insertions(+), 18 deletions(-)
>
> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> index 038ed0b9aaa5..0783f9ee5cd3 100644
> --- a/fs/fuse/Kconfig
> +++ b/fs/fuse/Kconfig
> @@ -18,6 +18,22 @@ config FUSE_FS
> If you want to develop a userspace FS, or if you want to use
> a filesystem based on FUSE, answer Y or M.
>
> +config FUSE_READDIR_ORDER
> + int
> + range 0 5
> + default 5
> + help
> + readdir performance varies greatly depending on the size of the read.
> + Larger buffers results in larger reads, thus fewer reads and higher
> + performance in return.
> +
> + You may want to reduce this value on seriously constrained memory
> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> +
> + This value reprents the order of the number of pages to allocate (ie,
> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
> + pages (128KiB).
> +
> config CUSE
> tristate "Character device in Userspace support"
> depends on FUSE_FS
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index dc603479b30e..98c62b623240 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -13,6 +13,12 @@
> #include <linux/pagemap.h>
> #include <linux/highmem.h>
>
> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
> +
> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> {
> struct fuse_conn *fc = get_fuse_conn(dir);
> @@ -52,10 +58,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
> }
> version = fi->rdc.version;
> size = fi->rdc.size;
> - offset = size & ~PAGE_MASK;
> - index = size >> PAGE_SHIFT;
> + offset = size & ~READDIR_PAGES_MASK;
> + index = size >> READDIR_PAGES_SHIFT;
> /* Dirent doesn't fit in current page? Jump to next page. */
> - if (offset + reclen > PAGE_SIZE) {
> + if (offset + reclen > READDIR_PAGES_SIZE) {
> index++;
> offset = 0;
> }
> @@ -83,7 +89,7 @@ static void fuse_add_dirent_to_cache(struct file *file,
> }
> memcpy(addr + offset, dirent, reclen);
> kunmap_local(addr);
> - fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
> + fi->rdc.size = (index << READDIR_PAGES_SHIFT) + offset + reclen;
> fi->rdc.pos = dirent->off;
> unlock:
> spin_unlock(&fi->rdc.lock);
> @@ -104,7 +110,7 @@ static void fuse_readdir_cache_end(struct file *file, loff_t pos)
> }
>
> fi->rdc.cached = true;
> - end = ALIGN(fi->rdc.size, PAGE_SIZE);
> + end = ALIGN(fi->rdc.size, READDIR_PAGES_SIZE);
> spin_unlock(&fi->rdc.lock);
>
> /* truncate unused tail of cache */
> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> struct fuse_mount *fm = get_fuse_mount(inode);
> struct fuse_io_args ia = {};
> struct fuse_args_pages *ap = &ia.ap;
> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> u64 attr_version = 0;
> bool locked;
>
> - page = alloc_page(GFP_KERNEL);
> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
> if (!page)
> return -ENOMEM;
>
> plus = fuse_use_readdirplus(inode, ctx);
> ap->args.out_pages = true;
> - ap->num_pages = 1;
> + ap->num_pages = READDIR_PAGES;
> ap->pages = &page;
> ap->descs = &desc;
> if (plus) {
> attr_version = fuse_get_attr_version(fm->fc);
> - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
> + fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
> FUSE_READDIRPLUS);
> } else {
> - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
> + fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
> FUSE_READDIR);
> }
> locked = fuse_lock_inode(inode);
> @@ -383,7 +389,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
> void *addr, unsigned int size,
> struct dir_context *ctx)
> {
> - unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
> + unsigned int offset = ff->readdir.cache_off & ~READDIR_PAGES_MASK;
> enum fuse_parse_result res = FOUND_NONE;
>
> WARN_ON(offset >= size);
> @@ -504,16 +510,16 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
>
> WARN_ON(fi->rdc.size < ff->readdir.cache_off);
>
> - index = ff->readdir.cache_off >> PAGE_SHIFT;
> + index = ff->readdir.cache_off >> READDIR_PAGES_SHIFT;
>
> - if (index == (fi->rdc.size >> PAGE_SHIFT))
> - size = fi->rdc.size & ~PAGE_MASK;
> + if (index == (fi->rdc.size >> READDIR_PAGES_SHIFT))
> + size = fi->rdc.size & ~READDIR_PAGES_MASK;
> else
> - size = PAGE_SIZE;
> + size = READDIR_PAGES_SIZE;
> spin_unlock(&fi->rdc.lock);
>
> /* EOF? */
> - if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
> + if ((ff->readdir.cache_off & ~READDIR_PAGES_MASK) == size)
> return 0;
>
> page = find_get_page_flags(file->f_mapping, index,
> @@ -559,9 +565,9 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
> if (res == FOUND_ALL)
> return 0;
>
> - if (size == PAGE_SIZE) {
> + if (size == READDIR_PAGES_SIZE) {
> /* We hit end of page: skip to next page. */
> - ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
> + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, READDIR_PAGES_SIZE);
> goto retry;
> }
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 10:59 [PATCH] fuse: enable larger read buffers for readdir Jaco Kroon
2023-07-26 11:43 ` Jaco Kroon
@ 2023-07-26 13:53 ` Bernd Schubert
2023-07-26 15:26 ` Jaco Kroon
2023-07-26 15:19 ` Randy Dunlap
2023-07-27 8:12 ` [PATCH] fuse: enable larger read buffers for readdir [v2] Jaco Kroon
3 siblings, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2023-07-26 13:53 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi, linux-fsdevel, linux-kernel
On 7/26/23 12:59, Jaco Kroon wrote:
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
> fs/fuse/Kconfig | 16 ++++++++++++++++
> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
> 2 files changed, 40 insertions(+), 18 deletions(-)
>
> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> index 038ed0b9aaa5..0783f9ee5cd3 100644
> --- a/fs/fuse/Kconfig
> +++ b/fs/fuse/Kconfig
> @@ -18,6 +18,22 @@ config FUSE_FS
> If you want to develop a userspace FS, or if you want to use
> a filesystem based on FUSE, answer Y or M.
>
> +config FUSE_READDIR_ORDER
> + int
> + range 0 5
> + default 5
> + help
> + readdir performance varies greatly depending on the size of the read.
> + Larger buffers results in larger reads, thus fewer reads and higher
> + performance in return.
> +
> + You may want to reduce this value on seriously constrained memory
> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> +
> + This value reprents the order of the number of pages to allocate (ie,
> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
> + pages (128KiB).
> +
I like the idea of a larger readdir size, but shouldn't that be a
server/daemon/library decision which size to use, instead of kernel
compile time? So should be part of FUSE_INIT negotiation?
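For illustration, roughly something like this in process_init_reply()
(hedged sketch only; the FUSE_READDIR_BUFSIZE flag and max_readdir_pages
field are made up, nothing like this exists in the protocol today):

        if ((arg->flags & FUSE_READDIR_BUFSIZE) && arg->max_readdir_pages)
                fc->max_readdir_pages = min_t(unsigned int,
                                              arg->max_readdir_pages,
                                              FUSE_MAX_MAX_PAGES);
        else
                fc->max_readdir_pages = 1;      /* current behaviour: one page */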
> config CUSE
> tristate "Character device in Userspace support"
> depends on FUSE_FS
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index dc603479b30e..98c62b623240 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -13,6 +13,12 @@
> #include <linux/pagemap.h>
> #include <linux/highmem.h>
>
> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
> +
> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> {
> struct fuse_conn *fc = get_fuse_conn(dir);
> @@ -52,10 +58,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
> }
> version = fi->rdc.version;
> size = fi->rdc.size;
> - offset = size & ~PAGE_MASK;
> - index = size >> PAGE_SHIFT;
> + offset = size & ~READDIR_PAGES_MASK;
> + index = size >> READDIR_PAGES_SHIFT;
> /* Dirent doesn't fit in current page? Jump to next page. */
> - if (offset + reclen > PAGE_SIZE) {
> + if (offset + reclen > READDIR_PAGES_SIZE) {
> index++;
> offset = 0;
> }
> @@ -83,7 +89,7 @@ static void fuse_add_dirent_to_cache(struct file *file,
> }
> memcpy(addr + offset, dirent, reclen);
> kunmap_local(addr);
> - fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
> + fi->rdc.size = (index << READDIR_PAGES_SHIFT) + offset + reclen;
> fi->rdc.pos = dirent->off;
> unlock:
> spin_unlock(&fi->rdc.lock);
> @@ -104,7 +110,7 @@ static void fuse_readdir_cache_end(struct file *file, loff_t pos)
> }
>
> fi->rdc.cached = true;
> - end = ALIGN(fi->rdc.size, PAGE_SIZE);
> + end = ALIGN(fi->rdc.size, READDIR_PAGES_SIZE);
> spin_unlock(&fi->rdc.lock);
>
> /* truncate unused tail of cache */
> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> struct fuse_mount *fm = get_fuse_mount(inode);
> struct fuse_io_args ia = {};
> struct fuse_args_pages *ap = &ia.ap;
> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> u64 attr_version = 0;
> bool locked;
>
> - page = alloc_page(GFP_KERNEL);
> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
I guess that should become folio alloc(), one way or the other. Now I
think order 0 was chosen before to avoid risk of allocation failure. I
guess it might work to try a large size and to fall back to 0 when that
failed. Or fail back to the slower vmalloc.
> if (!page)
> return -ENOMEM;
>
> plus = fuse_use_readdirplus(inode, ctx);
> ap->args.out_pages = true;
> - ap->num_pages = 1;
> + ap->num_pages = READDIR_PAGES;
> ap->pages = &page;
> ap->descs = &desc;
> if (plus) {
> attr_version = fuse_get_attr_version(fm->fc);
> - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
> + fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
> FUSE_READDIRPLUS);
> } else {
> - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
> + fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
> FUSE_READDIR);
> }
> locked = fuse_lock_inode(inode);
> @@ -383,7 +389,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
> void *addr, unsigned int size,
> struct dir_context *ctx)
> {
> - unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
> + unsigned int offset = ff->readdir.cache_off & ~READDIR_PAGES_MASK;
> enum fuse_parse_result res = FOUND_NONE;
>
> WARN_ON(offset >= size);
> @@ -504,16 +510,16 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
>
> WARN_ON(fi->rdc.size < ff->readdir.cache_off);
>
> - index = ff->readdir.cache_off >> PAGE_SHIFT;
> + index = ff->readdir.cache_off >> READDIR_PAGES_SHIFT;
>
> - if (index == (fi->rdc.size >> PAGE_SHIFT))
> - size = fi->rdc.size & ~PAGE_MASK;
> + if (index == (fi->rdc.size >> READDIR_PAGES_SHIFT))
> + size = fi->rdc.size & ~READDIR_PAGES_MASK;
> else
> - size = PAGE_SIZE;
> + size = READDIR_PAGES_SIZE;
> spin_unlock(&fi->rdc.lock);
>
> /* EOF? */
> - if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
> + if ((ff->readdir.cache_off & ~READDIR_PAGES_MASK) == size)
> return 0;
>
> page = find_get_page_flags(file->f_mapping, index,
> @@ -559,9 +565,9 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
> if (res == FOUND_ALL)
> return 0;
>
> - if (size == PAGE_SIZE) {
> + if (size == READDIR_PAGES_SIZE) {
> /* We hit end of page: skip to next page. */
> - ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
> + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, READDIR_PAGES_SIZE);
> goto retry;
> }
>
Thanks,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 10:59 [PATCH] fuse: enable larger read buffers for readdir Jaco Kroon
2023-07-26 11:43 ` Jaco Kroon
2023-07-26 13:53 ` Bernd Schubert
@ 2023-07-26 15:19 ` Randy Dunlap
2023-07-27 8:12 ` [PATCH] fuse: enable larger read buffers for readdir [v2] Jaco Kroon
3 siblings, 0 replies; 50+ messages in thread
From: Randy Dunlap @ 2023-07-26 15:19 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi, linux-fsdevel, linux-kernel
On 7/26/23 03:59, Jaco Kroon wrote:
> +config FUSE_READDIR_ORDER
> + int
> + range 0 5
> + default 5
> + help
> + readdir performance varies greatly depending on the size of the read.
> + Larger buffers results in larger reads, thus fewer reads and higher
> + performance in return.
> +
> + You may want to reduce this value on seriously constrained memory
> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> +
> + This value reprents the order of the number of pages to allocate (ie,
represents (i.e.,
> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
> + pages (128KiB).
--
~Randy
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 13:53 ` Bernd Schubert
@ 2023-07-26 15:26 ` Jaco Kroon
2023-07-26 15:30 ` Bernd Schubert
2023-07-26 15:45 ` Bernd Schubert
0 siblings, 2 replies; 50+ messages in thread
From: Jaco Kroon @ 2023-07-26 15:26 UTC (permalink / raw)
To: Bernd Schubert, Miklos Szeredi, linux-fsdevel, linux-kernel
Hi,
On 2023/07/26 15:53, Bernd Schubert wrote:
>
>
> On 7/26/23 12:59, Jaco Kroon wrote:
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>> fs/fuse/Kconfig | 16 ++++++++++++++++
>> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
>> 2 files changed, 40 insertions(+), 18 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>> If you want to develop a userspace FS, or if you want to use
>> a filesystem based on FUSE, answer Y or M.
>> +config FUSE_READDIR_ORDER
>> + int
>> + range 0 5
>> + default 5
>> + help
>> + readdir performance varies greatly depending on the size of
>> the read.
>> + Larger buffers results in larger reads, thus fewer reads and
>> higher
>> + performance in return.
>> +
>> + You may want to reduce this value on seriously constrained
>> memory
>> + systems where 128KiB (assuming 4KiB pages) cache pages is
>> not ideal.
>> +
>> + This value reprents the order of the number of pages to
>> allocate (ie,
>> + the shift value). A value of 0 is thus 1 page (4KiB) where
>> 5 is 32
>> + pages (128KiB).
>> +
>
> I like the idea of a larger readdir size, but shouldn't that be a
> server/daemon/library decision which size to use, instead of kernel
> compile time? So should be part of FUSE_INIT negotiation?
Yes sure, but there still needs to be a default. And one page at a time
doesn't cut it.
-- snip --
>> - page = alloc_page(GFP_KERNEL);
>> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>
> I guess that should become folio alloc(), one way or the other. Now I
> think order 0 was chosen before to avoid risk of allocation failure. I
> guess it might work to try a large size and to fall back to 0 when
> that failed. Or fail back to the slower vmalloc.
If this varies then a bunch of other code will become somewhat more
complex, especially if one alloc succeeds and a follow-up then fails.
I'm not familiar with the differences between the different mechanisms
available for allocation.
-- snip --
> Thanks,
My pleasure,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 15:26 ` Jaco Kroon
@ 2023-07-26 15:30 ` Bernd Schubert
2023-07-26 15:45 ` Bernd Schubert
1 sibling, 0 replies; 50+ messages in thread
From: Bernd Schubert @ 2023-07-26 15:30 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi, linux-fsdevel, linux-kernel
On 7/26/23 17:26, Jaco Kroon wrote:
> Hi,
>
> On 2023/07/26 15:53, Bernd Schubert wrote:
>>
>>
>> On 7/26/23 12:59, Jaco Kroon wrote:
>>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>>> ---
>>> fs/fuse/Kconfig | 16 ++++++++++++++++
>>> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
>>> 2 files changed, 40 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>>> --- a/fs/fuse/Kconfig
>>> +++ b/fs/fuse/Kconfig
>>> @@ -18,6 +18,22 @@ config FUSE_FS
>>> If you want to develop a userspace FS, or if you want to use
>>> a filesystem based on FUSE, answer Y or M.
>>> +config FUSE_READDIR_ORDER
>>> + int
>>> + range 0 5
>>> + default 5
>>> + help
>>> + readdir performance varies greatly depending on the size of
>>> the read.
>>> + Larger buffers results in larger reads, thus fewer reads and
>>> higher
>>> + performance in return.
>>> +
>>> + You may want to reduce this value on seriously constrained
>>> memory
>>> + systems where 128KiB (assuming 4KiB pages) cache pages is
>>> not ideal.
>>> +
>>> + This value reprents the order of the number of pages to
>>> allocate (ie,
>>> + the shift value). A value of 0 is thus 1 page (4KiB) where
>>> 5 is 32
>>> + pages (128KiB).
>>> +
>>
>> I like the idea of a larger readdir size, but shouldn't that be a
>> server/daemon/library decision which size to use, instead of kernel
>> compile time? So should be part of FUSE_INIT negotiation?
>
> Yes sure, but there still needs to be a default. And one page at a time
> doesn't cut it.
>
> -- snip --
>
>>> - page = alloc_page(GFP_KERNEL);
>>> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>>
>> I guess that should become folio alloc(), one way or the other. Now I
>> think order 0 was chosen before to avoid risk of allocation failure. I
>> guess it might work to try a large size and to fall back to 0 when
>> that failed. Or fail back to the slower vmalloc.
>
> If this varies then a bunch of other code will become somewhat more
> complex, especially if one alloc succeeds and a follow-up then fails.
Yeah, the better choice is kvmalloc/kvfree which handles it internally.
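Roughly like this (untested sketch; note the request path currently
expects struct page pointers, so this is not a drop-in change):

        buf = kvmalloc(READDIR_PAGES_SIZE, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        /* kvmalloc() tries kmalloc() first and transparently falls back
         * to vmalloc() for larger sizes, so no manual order-0 fallback
         * is needed. */
        /* ... fill and parse the buffer ... */
        kvfree(buf);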
>
> I'm not familiar with the differences between the different mechanisms
> available for allocation.
>
> -- snip --
>
>> Thanks,
> My pleasure,
> Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 15:26 ` Jaco Kroon
2023-07-26 15:30 ` Bernd Schubert
@ 2023-07-26 15:45 ` Bernd Schubert
2023-07-26 17:23 ` Antonio SJ Musumeci
1 sibling, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2023-07-26 15:45 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi, linux-fsdevel, linux-kernel
On 7/26/23 17:26, Jaco Kroon wrote:
> Hi,
>
> On 2023/07/26 15:53, Bernd Schubert wrote:
>>
>>
>> On 7/26/23 12:59, Jaco Kroon wrote:
>>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>>> ---
>>> fs/fuse/Kconfig | 16 ++++++++++++++++
>>> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
>>> 2 files changed, 40 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>>> --- a/fs/fuse/Kconfig
>>> +++ b/fs/fuse/Kconfig
>>> @@ -18,6 +18,22 @@ config FUSE_FS
>>> If you want to develop a userspace FS, or if you want to use
>>> a filesystem based on FUSE, answer Y or M.
>>> +config FUSE_READDIR_ORDER
>>> + int
>>> + range 0 5
>>> + default 5
>>> + help
>>> + readdir performance varies greatly depending on the size of
>>> the read.
>>> + Larger buffers results in larger reads, thus fewer reads and
>>> higher
>>> + performance in return.
>>> +
>>> + You may want to reduce this value on seriously constrained
>>> memory
>>> + systems where 128KiB (assuming 4KiB pages) cache pages is
>>> not ideal.
>>> +
>>> + This value reprents the order of the number of pages to
>>> allocate (ie,
>>> + the shift value). A value of 0 is thus 1 page (4KiB) where
>>> 5 is 32
>>> + pages (128KiB).
>>> +
>>
>> I like the idea of a larger readdir size, but shouldn't that be a
>> server/daemon/library decision which size to use, instead of kernel
>> compile time? So should be part of FUSE_INIT negotiation?
>
> Yes sure, but there still needs to be a default. And one page at a time
> doesn't cut it.
With FUSE_INIT userspace would make that decision, based on what kernel
fuse suggests? process_init_reply() already handles other limits - I
don't see why readdir max has to be compile time option. Maybe a module
option to set the limit?
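E.g. something like this (hedged sketch, the parameter name is made up):

        static unsigned int readdir_order = 5;
        module_param(readdir_order, uint, 0644);
        MODULE_PARM_DESC(readdir_order,
                         "Order of the readdir buffer, in pages (0 = one page)");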
Thanks,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 15:45 ` Bernd Schubert
@ 2023-07-26 17:23 ` Antonio SJ Musumeci
2023-07-26 18:25 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Antonio SJ Musumeci @ 2023-07-26 17:23 UTC (permalink / raw)
To: Bernd Schubert, Jaco Kroon, Miklos Szeredi, linux-fsdevel,
linux-kernel
On 7/26/23 10:45, Bernd Schubert wrote:
>
> On 7/26/23 17:26, Jaco Kroon wrote:
>> Hi,
>>
>> On 2023/07/26 15:53, Bernd Schubert wrote:
>>>
>>> On 7/26/23 12:59, Jaco Kroon wrote:
>>>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>>>> ---
>>>> fs/fuse/Kconfig | 16 ++++++++++++++++
>>>> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
>>>> 2 files changed, 40 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>>>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>>>> --- a/fs/fuse/Kconfig
>>>> +++ b/fs/fuse/Kconfig
>>>> @@ -18,6 +18,22 @@ config FUSE_FS
>>>> If you want to develop a userspace FS, or if you want to use
>>>> a filesystem based on FUSE, answer Y or M.
>>>> +config FUSE_READDIR_ORDER
>>>> + int
>>>> + range 0 5
>>>> + default 5
>>>> + help
>>>> + readdir performance varies greatly depending on the size of
>>>> the read.
>>>> + Larger buffers results in larger reads, thus fewer reads and
>>>> higher
>>>> + performance in return.
>>>> +
>>>> + You may want to reduce this value on seriously constrained
>>>> memory
>>>> + systems where 128KiB (assuming 4KiB pages) cache pages is
>>>> not ideal.
>>>> +
>>>> + This value reprents the order of the number of pages to
>>>> allocate (ie,
>>>> + the shift value). A value of 0 is thus 1 page (4KiB) where
>>>> 5 is 32
>>>> + pages (128KiB).
>>>> +
>>> I like the idea of a larger readdir size, but shouldn't that be a
>>> server/daemon/library decision which size to use, instead of kernel
>>> compile time? So should be part of FUSE_INIT negotiation?
>> Yes sure, but there still needs to be a default. And one page at a time
>> doesn't cut it.
> With FUSE_INIT userspace would make that decision, based on what kernel
> fuse suggests? process_init_reply() already handles other limits - I
> don't see why readdir max has to be compile time option. Maybe a module
> option to set the limit?
>
> Thanks,
> Bernd
I had a similar question / comment. This seems to me to be more
appropriately handled by the server via FUSE_INIT.
And wouldn't "max" more easily be FUSE_MAX_MAX_PAGES? Is there a reason
not to allow upwards of 256 pages sized readdir buffer?
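(For reference, the relevant constants, quoting fs/fuse/fuse_i.h from
memory, so please double-check the exact values:)

        /** Default max number of pages that can be used in a single read request */
        #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
        /** Maximum of max_pages received in init_out */
        #define FUSE_MAX_MAX_PAGES 256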
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 17:23 ` Antonio SJ Musumeci
@ 2023-07-26 18:25 ` Jaco Kroon
2023-07-27 15:21 ` Miklos Szeredi
2023-07-27 19:21 ` Miklos Szeredi
0 siblings, 2 replies; 50+ messages in thread
From: Jaco Kroon @ 2023-07-26 18:25 UTC (permalink / raw)
To: Antonio SJ Musumeci, Bernd Schubert, Miklos Szeredi,
linux-fsdevel, linux-kernel
Hi,
On 2023/07/26 19:23, Antonio SJ Musumeci wrote:
> On 7/26/23 10:45, Bernd Schubert wrote:
>> On 7/26/23 17:26, Jaco Kroon wrote:
>>> Hi,
>>>
>>> On 2023/07/26 15:53, Bernd Schubert wrote:
>>>> On 7/26/23 12:59, Jaco Kroon wrote:
>>>>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>>>>> ---
>>>>> fs/fuse/Kconfig | 16 ++++++++++++++++
>>>>> fs/fuse/readdir.c | 42 ++++++++++++++++++++++++------------------
>>>>> 2 files changed, 40 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>>>>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>>>>> --- a/fs/fuse/Kconfig
>>>>> +++ b/fs/fuse/Kconfig
>>>>> @@ -18,6 +18,22 @@ config FUSE_FS
>>>>> If you want to develop a userspace FS, or if you want to use
>>>>> a filesystem based on FUSE, answer Y or M.
>>>>> +config FUSE_READDIR_ORDER
>>>>> + int
>>>>> + range 0 5
>>>>> + default 5
>>>>> + help
>>>>> + readdir performance varies greatly depending on the size of
>>>>> the read.
>>>>> + Larger buffers results in larger reads, thus fewer reads and
>>>>> higher
>>>>> + performance in return.
>>>>> +
>>>>> + You may want to reduce this value on seriously constrained
>>>>> memory
>>>>> + systems where 128KiB (assuming 4KiB pages) cache pages is
>>>>> not ideal.
>>>>> +
>>>>> + This value reprents the order of the number of pages to
>>>>> allocate (ie,
>>>>> + the shift value). A value of 0 is thus 1 page (4KiB) where
>>>>> 5 is 32
>>>>> + pages (128KiB).
>>>>> +
>>>> I like the idea of a larger readdir size, but shouldn't that be a
>>>> server/daemon/library decision which size to use, instead of kernel
>>>> compile time? So should be part of FUSE_INIT negotiation?
>>> Yes sure, but there still needs to be a default. And one page at a time
>>> doesn't cut it.
>> With FUSE_INIT userspace would make that decision, based on what kernel
>> fuse suggests? process_init_reply() already handles other limits - I
>> don't see why readdir max has to be compile time option. Maybe a module
>> option to set the limit?
>>
>> Thanks,
>> Bernd
> I had a similar question / comment. This seems to me to be more
> appropriately handled by the server via FUSE_INIT.
>
> And wouldn't "max" more easily be FUSE_MAX_MAX_PAGES? Is there a reason
> not to allow upwards of 256 pages sized readdir buffer?
Will look into FUSE_INIT. FUSE_INIT, as I understand from what I've
read, has some expansion constraints, or the structure is somehow
negotiated. In other words, older clients that aren't aware of the
option will follow some default. For backwards compatibility that
default should probably be 1 page. For performance reasons it makes
sense that this limit be larger.
glibc uses a 128KiB buffer for getdents64, so I'm not sure >128KiB here
makes sense. Or if these two buffers are even directly related.
Default to fc->max_pages (which defaults to 32 pages, or 128KiB) if the
user-space side doesn't understand the max_readdir_pages limit aspect of
things? That's assuming these limits should be set separately; I'm
thinking piggybacking on fc->max_pages is just fine, to be honest.
For the sake of avoiding division and modulo operations in the cache,
I'm thinking of rounding max_pages down to the closest power of two so
that we can stick to binary operators rather than divisions and mods?
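Something like this, perhaps (untested sketch, using the helpers from
<linux/log2.h>):

        /* Round down to a power of two so offsets within the readdir
         * buffer stay shifts and masks. */
        unsigned int rd_pages = rounddown_pow_of_two(fc->max_pages);
        unsigned int rd_shift = PAGE_SHIFT + ilog2(rd_pages);
        size_t rd_size = (size_t)rd_pages << PAGE_SHIFT;
        size_t rd_mask = rd_size - 1;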
The current patch introduces a definite memory leak either way; it tore
through about 12GB of RAM in a matter of 20 minutes or so. I'm just
going to patch it that way first, and then based on the responses above
will look into an alternative patch that does not depend on a
compile-time option. I'm guessing __free_page should become a
multi-page variant now.
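(I.e. pairing the alloc with the matching multi-page free, something
like:)

        page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
        /* ... */
        __free_pages(page, READDIR_PAGES_ORDER);        /* rather than __free_page(page) */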
Thanks for all the feedback so far.
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-26 10:59 [PATCH] fuse: enable larger read buffers for readdir Jaco Kroon
` (2 preceding siblings ...)
2023-07-26 15:19 ` Randy Dunlap
@ 2023-07-27 8:12 ` Jaco Kroon
2023-07-27 15:35 ` Miklos Szeredi
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
3 siblings, 2 replies; 50+ messages in thread
From: Jaco Kroon @ 2023-07-27 8:12 UTC (permalink / raw)
To: Miklos Szeredi, linux-fsdevel, linux-kernel, Randy Dunlap,
Bernd Schubert, Antonio SJ Musumeci
Cc: Jaco Kroon
This patch does not mess with the caching infrastructure like the
previous one, which we believe caused excessive CPU and broke directory
listings in some cases.
This version only affects the uncached read, which then during parse adds an
entry at a time to the cached structures by way of copying, and as such,
we believe this should be sufficient.
We're still seeing cases where getdents64 takes ~10s (this was the case
in any case without this patch, the difference now that we get ~500
entries for that time rather than the 14-18 previously). We believe
that that latency is introduced on glusterfs side and is under separate
discussion with the glusterfs developers.
This is still a compile-time option, but a working one compared to
previous patch. For now this works, but it's not recommended for merge
(as per email discussion).
This still uses alloc_pages rather than kvmalloc/kvfree.
Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
fs/fuse/Kconfig | 16 ++++++++++++++++
fs/fuse/readdir.c | 18 ++++++++++++------
2 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 038ed0b9aaa5..0783f9ee5cd3 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -18,6 +18,22 @@ config FUSE_FS
If you want to develop a userspace FS, or if you want to use
a filesystem based on FUSE, answer Y or M.
+config FUSE_READDIR_ORDER
+ int
+ range 0 5
+ default 5
+ help
+ readdir performance varies greatly depending on the size of the read.
+ Larger buffers results in larger reads, thus fewer reads and higher
+ performance in return.
+
+ You may want to reduce this value on seriously constrained memory
+ systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
+
+ This value reprents the order of the number of pages to allocate (ie,
+ the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
+ pages (128KiB).
+
config CUSE
tristate "Character device in Userspace support"
depends on FUSE_FS
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index dc603479b30e..47cea4d91228 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -13,6 +13,12 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
+#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
+#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
+#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
+#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
+
static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
struct fuse_conn *fc = get_fuse_conn(dir);
@@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
- struct fuse_page_desc desc = { .length = PAGE_SIZE };
+ struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
u64 attr_version = 0;
bool locked;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
if (!page)
return -ENOMEM;
plus = fuse_use_readdirplus(inode, ctx);
ap->args.out_pages = true;
- ap->num_pages = 1;
+ ap->num_pages = READDIR_PAGES;
ap->pages = &page;
ap->descs = &desc;
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
@@ -367,7 +373,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
}
}
- __free_page(page);
+ __free_pages(page, READDIR_PAGES_ORDER);
fuse_invalidate_atime(inode);
return res;
}
--
2.41.0
^ permalink raw reply related [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 18:25 ` Jaco Kroon
@ 2023-07-27 15:21 ` Miklos Szeredi
2023-07-27 19:21 ` Miklos Szeredi
1 sibling, 0 replies; 50+ messages in thread
From: Miklos Szeredi @ 2023-07-27 15:21 UTC (permalink / raw)
To: Jaco Kroon
Cc: Antonio SJ Musumeci, Bernd Schubert, linux-fsdevel, linux-kernel
On Wed, Jul 26, 2023 at 08:25:48PM +0200, Jaco Kroon wrote:
> glibc uses a 128KiB buffer for getdents64, so I'm not sure >128KiB here
> makes sense. Or if these two buffers are even directly related.
Definitely related. See how the dir_emit() works: if the entry doesn't fit in
the userspace buffer, then ->readdir() returns. If entries remain in the fuse
buffer at that time, they are simply discarded. Simply put, making the
buffer too large is going to be a waste of resources and at some point will be
slower than the single page buffer.
Note: discarding received entries is the only possible action if caching is
disabled. With caching enabled it would make sense to pre-fill the cache with
the overflowed entries and use them in the next iteration. However the caching
code is not prepared for using partial caches at the moment.
The best strategy would be to find the optimal buffer size based on the size of
the userspace buffer. Putting that info into struct dir_context doesn't sound
too complicated...
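E.g. fuse could then size its readdir buffer from that hint; a rough
sketch only, assuming the dir_context count field added by the patch
below:

        /* In fuse_readdir_uncached(): clamp the request buffer to the
         * space left in the userspace buffer. */
        size_t bufsize = PAGE_SIZE;
        if (ctx->count > 0 && ctx->count != INT_MAX)
                bufsize = clamp_t(size_t, ctx->count, PAGE_SIZE,
                                  fm->fc->max_pages << PAGE_SHIFT);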
Here's a patch. It doesn't touch readdir. Simply setting the fuse buffer size
to the userspace buffer size should work, the record sizes are similar (fuse's
is slightly larger than libc's, so no overflow should ever happen).
Thanks,
Miklos
---
fs/exportfs/expfs.c | 1 +
fs/overlayfs/readdir.c | 12 +++++++++++-
fs/readdir.c | 29 ++++++++++++++---------------
include/linux/fs.h | 7 +++++++
4 files changed, 33 insertions(+), 16 deletions(-)
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -286,6 +286,7 @@ static int get_name(const struct path *p
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -347,6 +347,7 @@ static int ovl_dir_read_merged(struct de
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@@ -557,6 +558,7 @@ static int ovl_dir_read_impure(const str
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = list,
.root = root,
};
@@ -658,6 +660,7 @@ static bool ovl_fill_real(struct dir_con
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
+ bool res;
if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@@ -672,7 +675,10 @@ static bool ovl_fill_real(struct dir_con
name, namelen, rdt->xinowarn);
}
- return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ ctx->count = orig_ctx->count;
+
+ return res;
}
static bool ovl_is_impure_dir(struct file *file)
@@ -699,6 +705,7 @@ static int ovl_iterate_real(struct file
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
+ .ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@@ -1058,6 +1065,7 @@ int ovl_check_d_type_supported(const str
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
+ .ctx.count = INT_MAX,
.d_type_supported = false,
};
@@ -1079,6 +1087,7 @@ static int ovl_workdir_cleanup_recurse(s
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@@ -1163,6 +1172,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -184,6 +184,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned in
struct fd f = fdget_pos(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -215,7 +216,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -234,7 +234,7 @@ static bool filldir(struct dir_context *
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -259,7 +259,7 @@ static bool filldir(struct dir_context *
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -274,7 +274,7 @@ SYSCALL_DEFINE3(getdents, unsigned int,
struct fd f;
struct getdents_callback buf = {
.ctx.actor = filldir,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -293,7 +293,7 @@ SYSCALL_DEFINE3(getdents, unsigned int,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
fdput_pos(f);
return error;
@@ -303,7 +303,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -321,7 +320,7 @@ static bool filldir64(struct dir_context
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
@@ -341,7 +340,7 @@ static bool filldir64(struct dir_context
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
@@ -357,7 +356,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int
struct fd f;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -377,7 +376,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
fdput_pos(f);
return error;
@@ -442,6 +441,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsi
struct fd f = fdget_pos(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -467,7 +467,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -486,7 +485,7 @@ static bool compat_filldir(struct dir_co
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -510,7 +509,7 @@ static bool compat_filldir(struct dir_co
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -525,8 +524,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigne
struct fd f;
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
+ .ctx.count = count,
.current_dir = dirent,
- .count = count
};
int error;
@@ -544,7 +543,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigne
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
fdput_pos(f);
return error;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1719,6 +1719,13 @@ typedef bool (*filldir_t)(struct dir_con
struct dir_context {
filldir_t actor;
loff_t pos;
+ /*
+ * Filesystems MUST NOT MODIFY count, but may use as a hint:
+ * 0 unknown
+ * > 0 space in buffer (assume at least one entry)
+ * INT_MAX unlimited
+ */
+ int count;
};
/*
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-27 8:12 ` [PATCH] fuse: enable larger read buffers for readdir [v2] Jaco Kroon
@ 2023-07-27 15:35 ` Miklos Szeredi
2023-07-27 16:58 ` Jaco Kroon
` (2 more replies)
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
1 sibling, 3 replies; 50+ messages in thread
From: Miklos Szeredi @ 2023-07-27 15:35 UTC (permalink / raw)
To: Jaco Kroon
Cc: linux-fsdevel, linux-kernel, Randy Dunlap, Bernd Schubert,
Antonio SJ Musumeci
On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>
> This patch does not mess with the caching infrastructure like the
> previous one, which we believe caused excessive CPU and broke directory
> listings in some cases.
>
> This version only affects the uncached read, which then during parse adds an
> entry at a time to the cached structures by way of copying, and as such,
> we believe this should be sufficient.
>
> We're still seeing cases where getdents64 takes ~10s (this was the case
> in any case without this patch, the difference now that we get ~500
> entries for that time rather than the 14-18 previously). We believe
> that that latency is introduced on glusterfs side and is under separate
> discussion with the glusterfs developers.
>
> This is still a compile-time option, but a working one compared to
> previous patch. For now this works, but it's not recommended for merge
> (as per email discussion).
>
> This still uses alloc_pages rather than kvmalloc/kvfree.
>
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
> fs/fuse/Kconfig | 16 ++++++++++++++++
> fs/fuse/readdir.c | 18 ++++++++++++------
> 2 files changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> index 038ed0b9aaa5..0783f9ee5cd3 100644
> --- a/fs/fuse/Kconfig
> +++ b/fs/fuse/Kconfig
> @@ -18,6 +18,22 @@ config FUSE_FS
> If you want to develop a userspace FS, or if you want to use
> a filesystem based on FUSE, answer Y or M.
>
> +config FUSE_READDIR_ORDER
> + int
> + range 0 5
> + default 5
> + help
> + readdir performance varies greatly depending on the size of the read.
> + Larger buffers results in larger reads, thus fewer reads and higher
> + performance in return.
> +
> + You may want to reduce this value on seriously constrained memory
> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> +
> + This value reprents the order of the number of pages to allocate (ie,
> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
> + pages (128KiB).
> +
> config CUSE
> tristate "Character device in Userspace support"
> depends on FUSE_FS
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index dc603479b30e..47cea4d91228 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -13,6 +13,12 @@
> #include <linux/pagemap.h>
> #include <linux/highmem.h>
>
> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
> +
> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> {
> struct fuse_conn *fc = get_fuse_conn(dir);
> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> struct fuse_mount *fm = get_fuse_mount(inode);
> struct fuse_io_args ia = {};
> struct fuse_args_pages *ap = &ia.ap;
> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
Does this really work? I would've thought we are relying on single
page lengths somewhere.
> u64 attr_version = 0;
> bool locked;
>
> - page = alloc_page(GFP_KERNEL);
> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
> if (!page)
> return -ENOMEM;
>
> plus = fuse_use_readdirplus(inode, ctx);
> ap->args.out_pages = true;
> - ap->num_pages = 1;
> + ap->num_pages = READDIR_PAGES;
No. This is the array length, which is 1. This is the hack, I guess,
which makes the above trick work.
Better use kvmalloc, which might have a slightly worse performance
than a large page, but definitely not worse than the current single
page.
If we want to optimize the overhead of kvmalloc (and it's a big if)
then the parse_dir*file() functions would need to be converted to
using a page array instead of a plain kernel pointer, which would add
some complexity for sure.
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-27 15:35 ` Miklos Szeredi
@ 2023-07-27 16:58 ` Jaco Kroon
2023-07-27 19:17 ` Miklos Szeredi
2023-07-27 19:16 ` Bernd Schubert
2023-07-27 20:35 ` Bernd Schubert
2 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2023-07-27 16:58 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, Randy Dunlap, Bernd Schubert,
Antonio SJ Musumeci
Hi,
On 2023/07/27 17:35, Miklos Szeredi wrote:
> On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>> This patch does not mess with the caching infrastructure like the
>> previous one, which we believe caused excessive CPU and broke directory
>> listings in some cases.
>>
>> This version only affects the uncached read, which then during parse adds an
>> entry at a time to the cached structures by way of copying, and as such,
>> we believe this should be sufficient.
>>
>> We're still seeing cases where getdents64 takes ~10s (this was the case
>> in any case without this patch, the difference now that we get ~500
>> entries for that time rather than the 14-18 previously). We believe
>> that that latency is introduced on glusterfs side and is under separate
>> discussion with the glusterfs developers.
>>
>> This is still a compile-time option, but a working one compared to
>> previous patch. For now this works, but it's not recommended for merge
>> (as per email discussion).
>>
>> This still uses alloc_pages rather than kvmalloc/kvfree.
>>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>> fs/fuse/Kconfig | 16 ++++++++++++++++
>> fs/fuse/readdir.c | 18 ++++++++++++------
>> 2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>> If you want to develop a userspace FS, or if you want to use
>> a filesystem based on FUSE, answer Y or M.
>>
>> +config FUSE_READDIR_ORDER
>> + int
>> + range 0 5
>> + default 5
>> + help
>> + readdir performance varies greatly depending on the size of the read.
>> + Larger buffers results in larger reads, thus fewer reads and higher
>> + performance in return.
>> +
>> + You may want to reduce this value on seriously constrained memory
>> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
>> +
>> + This value reprents the order of the number of pages to allocate (ie,
>> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
>> + pages (128KiB).
>> +
>> config CUSE
>> tristate "Character device in Userspace support"
>> depends on FUSE_FS
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index dc603479b30e..47cea4d91228 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -13,6 +13,12 @@
>> #include <linux/pagemap.h>
>> #include <linux/highmem.h>
>>
>> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
>> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
>> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
>> +
>> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>> {
>> struct fuse_conn *fc = get_fuse_conn(dir);
>> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>> struct fuse_mount *fm = get_fuse_mount(inode);
>> struct fuse_io_args ia = {};
>> struct fuse_args_pages *ap = &ia.ap;
>> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
>> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> Does this really work? I would've thought we are relying on single
> page lengths somewhere.
Based on my testing, yes. Given that userspace now gets just under 500
entries per getdents64() call versus the 14-18 before, I'd say the answer is yes.
>
>> u64 attr_version = 0;
>> bool locked;
>>
>> - page = alloc_page(GFP_KERNEL);
>> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>> if (!page)
>> return -ENOMEM;
>>
>> plus = fuse_use_readdirplus(inode, ctx);
>> ap->args.out_pages = true;
>> - ap->num_pages = 1;
>> + ap->num_pages = READDIR_PAGES;
> No. This is the array lenght, which is 1. This is the hack I guess,
> which makes the above trick work.
Oh? So the page referenced above isn't an array of pages? It's
actually a single piece of contiguous memory?
> Better use kvmalloc, which might have a slightly worse performance
> than a large page, but definitely not worse than the current single
> page.
Which returns a void*, not struct page* - guess this can be converted
using __virt_to_page (iirc)?
> If we want to optimize the overhead of kvmalloc (and it's a big if)
> then the parse_dir*file() functions would need to be converted to
> using a page array instead of a plain kernel pointer, which would add
> some complexity for sure.
Sorry, I read the above as "I'm surprised this works at all and you're
not kernel panicking all over the show". This is probably the most
ambitious kernel patch I've attempted to date.
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-27 15:35 ` Miklos Szeredi
2023-07-27 16:58 ` Jaco Kroon
@ 2023-07-27 19:16 ` Bernd Schubert
2023-07-27 20:35 ` Bernd Schubert
2 siblings, 0 replies; 50+ messages in thread
From: Bernd Schubert @ 2023-07-27 19:16 UTC (permalink / raw)
To: Miklos Szeredi, Jaco Kroon
Cc: linux-fsdevel, linux-kernel, Randy Dunlap, Antonio SJ Musumeci
On 7/27/23 17:35, Miklos Szeredi wrote:
> On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>>
>> This patch does not mess with the caching infrastructure like the
>> previous one, which we believe caused excessive CPU and broke directory
>> listings in some cases.
>>
>> This version only affects the uncached read, which then during parse adds an
>> entry at a time to the cached structures by way of copying, and as such,
>> we believe this should be sufficient.
>>
>> We're still seeing cases where getdents64 takes ~10s (this was the case
>> in any case without this patch, the difference now that we get ~500
>> entries for that time rather than the 14-18 previously). We believe
>> that that latency is introduced on glusterfs side and is under separate
>> discussion with the glusterfs developers.
>>
>> This is still a compile-time option, but a working one compared to
>> previous patch. For now this works, but it's not recommended for merge
>> (as per email discussion).
>>
>> This still uses alloc_pages rather than kvmalloc/kvfree.
>>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>> fs/fuse/Kconfig | 16 ++++++++++++++++
>> fs/fuse/readdir.c | 18 ++++++++++++------
>> 2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>> If you want to develop a userspace FS, or if you want to use
>> a filesystem based on FUSE, answer Y or M.
>>
>> +config FUSE_READDIR_ORDER
>> + int
>> + range 0 5
>> + default 5
>> + help
>> + readdir performance varies greatly depending on the size of the read.
>> + Larger buffers results in larger reads, thus fewer reads and higher
>> + performance in return.
>> +
>> + You may want to reduce this value on seriously constrained memory
>> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
>> +
>> + This value reprents the order of the number of pages to allocate (ie,
>> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
>> + pages (128KiB).
>> +
>> config CUSE
>> tristate "Character device in Userspace support"
>> depends on FUSE_FS
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index dc603479b30e..47cea4d91228 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -13,6 +13,12 @@
>> #include <linux/pagemap.h>
>> #include <linux/highmem.h>
>>
>> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
>> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
>> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
>> +
>> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>> {
>> struct fuse_conn *fc = get_fuse_conn(dir);
>> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>> struct fuse_mount *fm = get_fuse_mount(inode);
>> struct fuse_io_args ia = {};
>> struct fuse_args_pages *ap = &ia.ap;
>> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
>> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
>
> Does this really work? I would've thought we are relying on single
> page lengths somewhere.
>
>> u64 attr_version = 0;
>> bool locked;
>>
>> - page = alloc_page(GFP_KERNEL);
>> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>> if (!page)
>> return -ENOMEM;
>>
>> plus = fuse_use_readdirplus(inode, ctx);
>> ap->args.out_pages = true;
>> - ap->num_pages = 1;
>> + ap->num_pages = READDIR_PAGES;
>
> No. This is the array lenght, which is 1. This is the hack I guess,
> which makes the above trick work.
>
> Better use kvmalloc, which might have a slightly worse performance
> than a large page, but definitely not worse than the current single
> page.
>
> If we want to optimize the overhead of kvmalloc (and it's a big if)
> then the parse_dir*file() functions would need to be converted to
> using a page array instead of a plain kernel pointer, which would add
> some complexity for sure.
One simple possibility might be to use a small (single page) buffer for the
pos == 0 read, and only switch to a larger buffer once pos is set - that
way small directories don't pay the overhead of the large allocation.
Although, following your idea of tying it to the getdents buffer size - that is
something libc could already start with.
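A minimal sketch of the first idea, reusing the macros from the v2 patch
(untested, just to show the shape):

	/*
	 * pos == 0: first getdents() on this directory fd, assume it is
	 * small and use a single page; if the caller comes back for more
	 * (pos != 0), switch to the large buffer.
	 */
	unsigned int order = ctx->pos ? READDIR_PAGES_ORDER : 0;
	struct fuse_page_desc desc = { .length = PAGE_SIZE << order };

	page = alloc_pages(GFP_KERNEL, order);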
Cheers,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-27 16:58 ` Jaco Kroon
@ 2023-07-27 19:17 ` Miklos Szeredi
0 siblings, 0 replies; 50+ messages in thread
From: Miklos Szeredi @ 2023-07-27 19:17 UTC (permalink / raw)
To: Jaco Kroon
Cc: linux-fsdevel, linux-kernel, Randy Dunlap, Bernd Schubert,
Antonio SJ Musumeci
On Thu, 27 Jul 2023 at 18:58, Jaco Kroon <jaco@uls.co.za> wrote:
>
> Hi,
>
> On 2023/07/27 17:35, Miklos Szeredi wrote:
> > On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
> >> This patch does not mess with the caching infrastructure like the
> >> previous one, which we believe caused excessive CPU and broke directory
> >> listings in some cases.
> >>
> >> This version only affects the uncached read, which then during parse adds an
> >> entry at a time to the cached structures by way of copying, and as such,
> >> we believe this should be sufficient.
> >>
> >> We're still seeing cases where getdents64 takes ~10s (this was the case
> >> in any case without this patch, the difference now that we get ~500
> >> entries for that time rather than the 14-18 previously). We believe
> >> that that latency is introduced on glusterfs side and is under separate
> >> discussion with the glusterfs developers.
> >>
> >> This is still a compile-time option, but a working one compared to
> >> previous patch. For now this works, but it's not recommended for merge
> >> (as per email discussion).
> >>
> >> This still uses alloc_pages rather than kvmalloc/kvfree.
> >>
> >> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> >> ---
> >> fs/fuse/Kconfig | 16 ++++++++++++++++
> >> fs/fuse/readdir.c | 18 ++++++++++++------
> >> 2 files changed, 28 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> >> index 038ed0b9aaa5..0783f9ee5cd3 100644
> >> --- a/fs/fuse/Kconfig
> >> +++ b/fs/fuse/Kconfig
> >> @@ -18,6 +18,22 @@ config FUSE_FS
> >> If you want to develop a userspace FS, or if you want to use
> >> a filesystem based on FUSE, answer Y or M.
> >>
> >> +config FUSE_READDIR_ORDER
> >> + int
> >> + range 0 5
> >> + default 5
> >> + help
> >> + readdir performance varies greatly depending on the size of the read.
> >> + Larger buffers results in larger reads, thus fewer reads and higher
> >> + performance in return.
> >> +
> >> + You may want to reduce this value on seriously constrained memory
> >> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> >> +
> >> + This value reprents the order of the number of pages to allocate (ie,
> >> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
> >> + pages (128KiB).
> >> +
> >> config CUSE
> >> tristate "Character device in Userspace support"
> >> depends on FUSE_FS
> >> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> >> index dc603479b30e..47cea4d91228 100644
> >> --- a/fs/fuse/readdir.c
> >> +++ b/fs/fuse/readdir.c
> >> @@ -13,6 +13,12 @@
> >> #include <linux/pagemap.h>
> >> #include <linux/highmem.h>
> >>
> >> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
> >> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
> >> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
> >> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
> >> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
> >> +
> >> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> >> {
> >> struct fuse_conn *fc = get_fuse_conn(dir);
> >> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> >> struct fuse_mount *fm = get_fuse_mount(inode);
> >> struct fuse_io_args ia = {};
> >> struct fuse_args_pages *ap = &ia.ap;
> >> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
> >> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> > Does this really work? I would've thought we are relying on single
> > page lengths somewhere.
> Based on my testing yes. Getting just under 500 entries per
> getdents64() call from userspace vs 14-18 before I guess the answer is yes.
> >
> >> u64 attr_version = 0;
> >> bool locked;
> >>
> >> - page = alloc_page(GFP_KERNEL);
> >> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
> >> if (!page)
> >> return -ENOMEM;
> >>
> >> plus = fuse_use_readdirplus(inode, ctx);
> >> ap->args.out_pages = true;
> >> - ap->num_pages = 1;
> >> + ap->num_pages = READDIR_PAGES;
> > No. This is the array lenght, which is 1. This is the hack I guess,
> > which makes the above trick work.
>
> Oh? So the page referenced above isn't an array of pages? It's
> actually a single piece of contiguous memory?
Yes.
>
> > Better use kvmalloc, which might have a slightly worse performance
> > than a large page, but definitely not worse than the current single
> > page.
>
> Which returns a void*, not struct page* - guess this can be converted
> using __virt_to_page (iirc)?
No, it cannot be converted to a page or a page array; use it as just a
piece of kernel memory.
Which means:
- don't set ->args.out_pages and ->num_pages
- instead set ->args.out_args[0].value to the allocated pointer
and that should be it (fingers crossed).
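As a rough, untested sketch of that direction (the buffer size and the
surrounding fuse_read_args_fill()/error-path plumbing are assumptions, not a
drop-in change):

	char *buf = kvmalloc(bufsize, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* plain kernel memory: leave ->args.out_pages and ->num_pages alone */
	ap->args.out_args[0].value = buf;
	...
	res = fuse_simple_request(fm, &ap->args);
	...
	kvfree(buf);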
>
> > If we want to optimize the overhead of kvmalloc (and it's a big if)
> > then the parse_dir*file() functions would need to be converted to
> > using a page array instead of a plain kernel pointer, which would add
> > some complexity for sure.
>
> Sorry, I read the above as "I'm surprised this works at all and you're
> not kernel panicking all over the show", this is probably the most
> ambitious kernel patch I've attempted to date.
Good start, you'll learn.
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-26 18:25 ` Jaco Kroon
2023-07-27 15:21 ` Miklos Szeredi
@ 2023-07-27 19:21 ` Miklos Szeredi
2023-07-27 19:43 ` Bernd Schubert
1 sibling, 1 reply; 50+ messages in thread
From: Miklos Szeredi @ 2023-07-27 19:21 UTC (permalink / raw)
To: Jaco Kroon
Cc: Antonio SJ Musumeci, Bernd Schubert, linux-fsdevel, linux-kernel
On Wed, 26 Jul 2023 at 20:40, Jaco Kroon <jaco@uls.co.za> wrote:
> Will look into FUSE_INIT. The FUSE_INIT as I understand from what I've
> read has some expansion constraints or the structure is somehow
> negotiated. Older clients in other words that's not aware of the option
> will follow some default. For backwards compatibility that default
> should probably be 1 page. For performance reasons it makes sense that
> this limit be larger.
Yes, might need this for backward compatibility. But perhaps a
feature flag is enough and the readdir buf can be limited to
fc->max_read.
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-27 19:21 ` Miklos Szeredi
@ 2023-07-27 19:43 ` Bernd Schubert
2023-07-28 8:42 ` Miklos Szeredi
0 siblings, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2023-07-27 19:43 UTC (permalink / raw)
To: Miklos Szeredi, Jaco Kroon
Cc: Antonio SJ Musumeci, linux-fsdevel, linux-kernel
On 7/27/23 21:21, Miklos Szeredi wrote:
> On Wed, 26 Jul 2023 at 20:40, Jaco Kroon <jaco@uls.co.za> wrote:
>
>> Will look into FUSE_INIT. The FUSE_INIT as I understand from what I've
>> read has some expansion constraints or the structure is somehow
>> negotiated. Older clients in other words that's not aware of the option
>> will follow some default. For backwards compatibility that default
>> should probably be 1 page. For performance reasons it makes sense that
>> this limit be larger.
>
> Yes, might need this for backward compatibility. But perhaps a
> feature flag is enough and the readdir buf can be limited to
> fc->max_read.
fc->max_read is set by default to ~0 and only set to something else when
the max_read mount option is given? So typically that is a large value
(UINT_MAX)?
Thanks,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-27 15:35 ` Miklos Szeredi
2023-07-27 16:58 ` Jaco Kroon
2023-07-27 19:16 ` Bernd Schubert
@ 2023-07-27 20:35 ` Bernd Schubert
2023-07-28 5:05 ` Jaco Kroon
2 siblings, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2023-07-27 20:35 UTC (permalink / raw)
To: Miklos Szeredi, Jaco Kroon
Cc: linux-fsdevel, linux-kernel, Randy Dunlap, Antonio SJ Musumeci
On 7/27/23 17:35, Miklos Szeredi wrote:
> On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>>
>> This patch does not mess with the caching infrastructure like the
>> previous one, which we believe caused excessive CPU and broke directory
>> listings in some cases.
>>
>> This version only affects the uncached read, which then during parse adds an
>> entry at a time to the cached structures by way of copying, and as such,
>> we believe this should be sufficient.
>>
>> We're still seeing cases where getdents64 takes ~10s (this was the case
>> in any case without this patch, the difference now that we get ~500
>> entries for that time rather than the 14-18 previously). We believe
>> that that latency is introduced on glusterfs side and is under separate
>> discussion with the glusterfs developers.
>>
>> This is still a compile-time option, but a working one compared to
>> previous patch. For now this works, but it's not recommended for merge
>> (as per email discussion).
>>
>> This still uses alloc_pages rather than kvmalloc/kvfree.
>>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>> fs/fuse/Kconfig | 16 ++++++++++++++++
>> fs/fuse/readdir.c | 18 ++++++++++++------
>> 2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>> If you want to develop a userspace FS, or if you want to use
>> a filesystem based on FUSE, answer Y or M.
>>
>> +config FUSE_READDIR_ORDER
>> + int
>> + range 0 5
>> + default 5
>> + help
>> + readdir performance varies greatly depending on the size of the read.
>> + Larger buffers results in larger reads, thus fewer reads and higher
>> + performance in return.
>> +
>> + You may want to reduce this value on seriously constrained memory
>> + systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
>> +
>> + This value reprents the order of the number of pages to allocate (ie,
>> + the shift value). A value of 0 is thus 1 page (4KiB) where 5 is 32
>> + pages (128KiB).
>> +
>> config CUSE
>> tristate "Character device in Userspace support"
>> depends on FUSE_FS
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index dc603479b30e..47cea4d91228 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -13,6 +13,12 @@
>> #include <linux/pagemap.h>
>> #include <linux/highmem.h>
>>
>> +#define READDIR_PAGES_ORDER CONFIG_FUSE_READDIR_ORDER
>> +#define READDIR_PAGES (1 << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_SIZE (PAGE_SIZE << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_MASK (READDIR_PAGES_SIZE - 1)
>> +#define READDIR_PAGES_SHIFT (PAGE_SHIFT + READDIR_PAGES_ORDER)
>> +
>> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>> {
>> struct fuse_conn *fc = get_fuse_conn(dir);
>> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>> struct fuse_mount *fm = get_fuse_mount(inode);
>> struct fuse_io_args ia = {};
>> struct fuse_args_pages *ap = &ia.ap;
>> - struct fuse_page_desc desc = { .length = PAGE_SIZE };
>> + struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
>
> Does this really work? I would've thought we are relying on single
> page lengths somewhere.
>
>> u64 attr_version = 0;
>> bool locked;
>>
>> - page = alloc_page(GFP_KERNEL);
>> + page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>> if (!page)
>> return -ENOMEM;
>>
>> plus = fuse_use_readdirplus(inode, ctx);
>> ap->args.out_pages = true;
>> - ap->num_pages = 1;
>> + ap->num_pages = READDIR_PAGES;
>
> No. This is the array lenght, which is 1. This is the hack I guess,
> which makes the above trick work.
Hmm, ap->num_pages / ap->pages[] is used in fuse_copy_pages, but so is
ap->descs[] - shouldn't the patch have caused an out-of-bounds access?
Out of interest, would you mind explaining how the hack worked?
Thanks,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir [v2].
2023-07-27 20:35 ` Bernd Schubert
@ 2023-07-28 5:05 ` Jaco Kroon
0 siblings, 0 replies; 50+ messages in thread
From: Jaco Kroon @ 2023-07-28 5:05 UTC (permalink / raw)
To: Bernd Schubert, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, Randy Dunlap, Antonio SJ Musumeci
Hi,
>>> plus = fuse_use_readdirplus(inode, ctx);
>>> ap->args.out_pages = true;
>>> - ap->num_pages = 1;
>>> + ap->num_pages = READDIR_PAGES;
>>
>> No. This is the array lenght, which is 1. This is the hack I guess,
>> which makes the above trick work.
>
> Hmm, ap->num_pages / ap->pages[] is used in fuse_copy_pages, but so is
> ap->descs[] - shouldn't the patch caused an out-of-bound access?
> Out of interest, would you mind to explain how the hack worked?
Apparently it shouldn't ... my understanding of how pages* worked was
all wrong.
I'm guessing since all the data fits in the first page (ap->pages[0] in
other words, of length/size desc.length) that the other pages are never
accessed. Looking at fuse_copy_pages this does indeed seem to be the
case. So I ended up just being really, really lucky here.
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] fuse: enable larger read buffers for readdir.
2023-07-27 19:43 ` Bernd Schubert
@ 2023-07-28 8:42 ` Miklos Szeredi
0 siblings, 0 replies; 50+ messages in thread
From: Miklos Szeredi @ 2023-07-28 8:42 UTC (permalink / raw)
To: Bernd Schubert
Cc: Jaco Kroon, Antonio SJ Musumeci, linux-fsdevel, linux-kernel
On Thu, 27 Jul 2023 at 21:43, Bernd Schubert <bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 7/27/23 21:21, Miklos Szeredi wrote:
> > On Wed, 26 Jul 2023 at 20:40, Jaco Kroon <jaco@uls.co.za> wrote:
> >
> >> Will look into FUSE_INIT. The FUSE_INIT as I understand from what I've
> >> read has some expansion constraints or the structure is somehow
> >> negotiated. Older clients in other words that's not aware of the option
> >> will follow some default. For backwards compatibility that default
> >> should probably be 1 page. For performance reasons it makes sense that
> >> this limit be larger.
> >
> > Yes, might need this for backward compatibility. But perhaps a
> > feature flag is enough and the readdir buf can be limited to
> > fc->max_read.
>
> fc->max_read is set by default to ~0 and only set to something else when
> the max_read mount option is given? So typically that is a large value
> (UINT_MAX)?
That's fine. It probably still makes sense to limit it to 128k, but
with the ctx->count patch it would be naturally limited by the size of
the userspace buffer. There's really no downside to enabling a large
buffer, other than an unlikely regression in userspace. If server
wants to return less entries, it still can. Unlike plain reads,
filling the buffer to the fullest extent isn't required for readdir.
So the buffer size calculation can be something like:
init:
	#define FUSE_READDIR_MAX (128*1024)
	fc->max_readdir = PAGE_SIZE;
	if (flags & FUSE_LARGE_READDIR)
		fc->max_readdir = min(fc->max_read, FUSE_READDIR_MAX);
[...]
readdir:
	bufsize = min(fc->max_readdir, max(ctx->count, PAGE_SIZE));
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* fuse: increase readdir() buffer size
2023-07-27 8:12 ` [PATCH] fuse: enable larger read buffers for readdir [v2] Jaco Kroon
2023-07-27 15:35 ` Miklos Szeredi
@ 2025-03-14 22:16 ` Jaco Kroon
2025-03-14 22:16 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
` (3 more replies)
1 sibling, 4 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-03-14 22:16 UTC (permalink / raw)
To: jaco; +Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
This is a follow up to the attempt made a while ago.
Whilst the patch worked, newer kernels have moved from pages to folios,
which gave me the motivation to implement the mechanism based on the
userspace buffer size patch that Miklos supplied.
That patch works as is, but I note there are changes to components
(overlayfs and exportfs) that I've got very little experience with, and
have not tested specifically here. They do look logical. I've marked
Miklos as the Author: here, and added my own Signed-off-by - I hope this
is correct.
The second patch in the series implements the changes to fuse's readdir
in order to utilize the first to enable reading more than one page of
dent structures at a time from userspace, I've included a strace from
before and after this patch in the commit to illustrate the difference.
To get the relevant performance improvement on glusterfs (which was
mentioned elsewhere in the thread), changes to glusterfs to increase the
number of cached dentries are also required (these are pushed to github
but not yet merged because, similar to this patch, they got stalled before
getting to the "ready for merge" phase even though they're operational).
Please advise whether these two patches look good (I've only done relatively
basic testing so far, and they're not running on production systems yet).
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
@ 2025-03-14 22:16 ` Jaco Kroon
2025-03-29 9:20 ` Christophe JAILLET
2025-03-14 22:16 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
` (2 subsequent siblings)
3 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-03-14 22:16 UTC (permalink / raw)
To: jaco; +Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
This was provided by Miklos <miklos@szeredi.hu> via LKML on 2023/07/27
subject "Re: [PATCH] fuse: enable larger read buffers for readdir.".
This is thus preperation for an improved fuse readdir() patch. The
description he provided:
"The best strategy would be to find the optimal buffer size based on the size of
the userspace buffer. Putting that info into struct dir_context doesn't sound
too complicated...
"Here's a patch. It doesn't touch readdir. Simply setting the fuse buffer size
to the userspace buffer size should work, the record sizes are similar (fuse's
is slightly larger than libc's, so no overflow should ever happen)."
Author: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
fs/exportfs/expfs.c | 1 +
fs/overlayfs/readdir.c | 12 +++++++++++-
fs/readdir.c | 31 +++++++++++++++----------------
include/linux/fs.h | 19 +++++++++++++------
4 files changed, 40 insertions(+), 23 deletions(-)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0c899cfba578..2015eea19097 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -286,6 +286,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 881ec5592da5..126c797751e9 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -351,6 +351,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@@ -571,6 +572,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = list,
.root = root,
};
@@ -672,6 +674,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
+ bool res;
if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@@ -686,7 +689,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
name, namelen, rdt->xinowarn);
}
- return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ ctx->count = orig_ctx->count;
+
+ return res;
}
static bool ovl_is_impure_dir(struct file *file)
@@ -713,6 +719,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
+ .ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@@ -1073,6 +1080,7 @@ int ovl_check_d_type_supported(const struct path *realpath)
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
+ .ctx.count = INT_MAX,
.d_type_supported = false,
};
@@ -1094,6 +1102,7 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@@ -1178,6 +1187,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
diff --git a/fs/readdir.c b/fs/readdir.c
index 0038efda417b..90049075a2aa 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -239,7 +240,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
/*
* New, all-improved, singing, dancing, iBCS2-compliant getdents()
- * interface.
+ * interface.
*/
struct linux_dirent {
unsigned long d_ino;
@@ -252,7 +253,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -271,7 +271,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -296,7 +296,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -311,7 +311,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -329,7 +329,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -338,7 +338,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -356,7 +355,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
@@ -376,7 +375,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
@@ -392,7 +391,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -411,7 +410,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -475,6 +474,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -499,7 +499,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -518,7 +517,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -542,7 +541,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -557,8 +556,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
+ .ctx.count = count,
.current_dir = dirent,
- .count = count
};
int error;
@@ -575,7 +574,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2788df98080f..1e426e2b5999 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -308,7 +308,7 @@ struct iattr {
*/
#define FILESYSTEM_MAX_STACK_DEPTH 2
-/**
+/**
* enum positive_aop_returns - aop return codes with specific semantics
*
* @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
@@ -318,7 +318,7 @@ struct iattr {
* be a candidate for writeback again in the near
* future. Other callers must be careful to unlock
* the page if they get this return. Returned by
- * writepage();
+ * writepage();
*
* @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
* unlocked it and the page might have been truncated.
@@ -1151,8 +1151,8 @@ struct file *get_file_active(struct file **f);
#define MAX_NON_LFS ((1UL<<31) - 1)
-/* Page cache limit. The filesystems should put that into their s_maxbytes
- limits, otherwise bad things can happen in VM. */
+/* Page cache limit. The filesystems should put that into their s_maxbytes
+ limits, otherwise bad things can happen in VM. */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
@@ -2073,6 +2073,13 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
struct dir_context {
filldir_t actor;
loff_t pos;
+ /*
+ * Filesystems MUST NOT MODIFY count, but may use as a hint:
+ * 0 unknown
+ * > 0 space in buffer (assume at least one entry)
+ * INT_MAX unlimited
+ */
+ int count;
};
/*
@@ -2609,7 +2616,7 @@ int sync_inode_metadata(struct inode *inode, int wait);
struct file_system_type {
const char *name;
int fs_flags;
-#define FS_REQUIRES_DEV 1
+#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
@@ -3189,7 +3196,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
-
+
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);
--
2.48.1
^ permalink raw reply related [flat|nested] 50+ messages in thread
* [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
2025-03-14 22:16 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
@ 2025-03-14 22:16 ` Jaco Kroon
2025-03-31 16:41 ` Joanne Koong
2025-03-28 10:15 ` fuse: increase readdir() " Jaco Kroon
2025-04-01 14:18 ` fuse: increase readdir() buffer size [v4] Jaco Kroon
3 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-03-14 22:16 UTC (permalink / raw)
To: jaco; +Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
Clamp to min 1 page (4KB) and max 128 pages (512KB).
Glusterfs trial using strace ls -l.
Before:
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0
After:
getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
fs/fuse/readdir.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..a0ccbc84b000 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -337,11 +337,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ struct fuse_folio_desc desc = { .length = ctx->count };
u64 attr_version = 0, evict_ctr = 0;
bool locked;
+ int order;
- folio = folio_alloc(GFP_KERNEL, 0);
+ if (desc.length < PAGE_SIZE)
+ desc.length = PAGE_SIZE;
+ else if (desc.length > (PAGE_SIZE << 7)) /* 128 pages, typically 512KB */
+ desc.length = PAGE_SIZE << 7;
+
+ order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
+
+ do {
+ folio = folio_alloc(GFP_KERNEL, order);
+ if (folio)
+ break;
+ --order;
+ desc.length = PAGE_SIZE << order;
+ } while (order >= 0);
if (!folio)
return -ENOMEM;
@@ -353,10 +367,10 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
evict_ctr = fuse_get_evict_ctr(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
--
2.48.1
^ permalink raw reply related [flat|nested] 50+ messages in thread
* Re: fuse: increase readdir() buffer size
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
2025-03-14 22:16 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
2025-03-14 22:16 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
@ 2025-03-28 10:15 ` Jaco Kroon
2025-03-28 10:16 ` Bernd Schubert
2025-03-28 19:40 ` David Laight
2025-04-01 14:18 ` fuse: increase readdir() buffer size [v4] Jaco Kroon
3 siblings, 2 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-03-28 10:15 UTC (permalink / raw)
To: linux-fsdevel, linux-kernel; +Cc: bernd.schubert, miklos, rdunlap, trapexit
Hi All,
I've not seen feedback on this, please may I get some direction on this?
Kind regards,
Jaco
On 2025/03/15 00:16, Jaco Kroon wrote:
> This is a follow up to the attempt made a while ago.
>
> Whist the patch worked, newer kernels have moved from pages to folios,
> which gave me the motivation to implement the mechanism based on the
> userspace buffer size patch that Miklos supplied.
>
> That patch works as is, but I note there are changes to components
> (overlayfs and exportfs) that I've got very little experience with, and
> have not tested specifically here. They do look logical. I've marked
> Miklos as the Author: here, and added my own Signed-off-by - I hope this
> is correct.
>
> The second patch in the series implements the changes to fuse's readdir
> in order to utilize the first to enable reading more than one page of
> dent structures at a time from userspace, I've included a strace from
> before and after this patch in the commit to illustrate the difference.
>
> To get the relevant performance on glusterfs improved (which was
> mentioned elsewhere in the thread) changes to glusterfs to increase the
> number of cached dentries is also required (these are pushed to github
> but not yet merged, because similar to this patch, got stalled before
> getting to the "ready for merge" phase even though it's operational).
>
> Please advise if these two patches looks good (I've only done relatively
> basic testing now, and it's not running on production systems yet)
>
> Kind regards,
> Jaco
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: fuse: increase readdir() buffer size
2025-03-28 10:15 ` fuse: increase readdir() " Jaco Kroon
@ 2025-03-28 10:16 ` Bernd Schubert
2025-03-28 19:40 ` David Laight
1 sibling, 0 replies; 50+ messages in thread
From: Bernd Schubert @ 2025-03-28 10:16 UTC (permalink / raw)
To: Jaco Kroon, linux-fsdevel, linux-kernel; +Cc: miklos, rdunlap, trapexit
On 3/28/25 11:15, Jaco Kroon wrote:
> Hi All,
>
> I've not seen feedback on this, please may I get some direction on this?
>
> Kind regards,
> Jaco
>
Sorry, will review today.
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: fuse: increase readdir() buffer size
2025-03-28 10:15 ` fuse: increase readdir() " Jaco Kroon
2025-03-28 10:16 ` Bernd Schubert
@ 2025-03-28 19:40 ` David Laight
2025-03-29 8:59 ` Jaco Kroon
1 sibling, 1 reply; 50+ messages in thread
From: David Laight @ 2025-03-28 19:40 UTC (permalink / raw)
To: Jaco Kroon
Cc: linux-fsdevel, linux-kernel, bernd.schubert, miklos, rdunlap,
trapexit
On Fri, 28 Mar 2025 12:15:47 +0200
Jaco Kroon <jaco@uls.co.za> wrote:
> Hi All,
>
> I've not seen feedback on this, please may I get some direction on this?
The only thing I can think of is that the longer loop might affect
scheduling latencies.
David
>
> Kind regards,
> Jaco
>
> On 2025/03/15 00:16, Jaco Kroon wrote:
> > This is a follow up to the attempt made a while ago.
> >
> > Whist the patch worked, newer kernels have moved from pages to folios,
> > which gave me the motivation to implement the mechanism based on the
> > userspace buffer size patch that Miklos supplied.
> >
> > That patch works as is, but I note there are changes to components
> > (overlayfs and exportfs) that I've got very little experience with, and
> > have not tested specifically here. They do look logical. I've marked
> > Miklos as the Author: here, and added my own Signed-off-by - I hope this
> > is correct.
> >
> > The second patch in the series implements the changes to fuse's readdir
> > in order to utilize the first to enable reading more than one page of
> > dent structures at a time from userspace, I've included a strace from
> > before and after this patch in the commit to illustrate the difference.
> >
> > To get the relevant performance on glusterfs improved (which was
> > mentioned elsewhere in the thread) changes to glusterfs to increase the
> > number of cached dentries is also required (these are pushed to github
> > but not yet merged, because similar to this patch, got stalled before
> > getting to the "ready for merge" phase even though it's operational).
> >
> > Please advise if these two patches looks good (I've only done relatively
> > basic testing now, and it's not running on production systems yet)
> >
> > Kind regards,
> > Jaco
> >
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: fuse: increase readdir() buffer size
2025-03-28 19:40 ` David Laight
@ 2025-03-29 8:59 ` Jaco Kroon
0 siblings, 0 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-03-29 8:59 UTC (permalink / raw)
To: David Laight
Cc: linux-fsdevel, linux-kernel, bernd.schubert, miklos, rdunlap,
trapexit
Hi David,
On 2025/03/28 21:40, David Laight wrote:
> On Fri, 28 Mar 2025 12:15:47 +0200
> Jaco Kroon <jaco@uls.co.za> wrote:
>
>> Hi All,
>>
>> I've not seen feedback on this, please may I get some direction on this?
> The only thing I can think of is that the longer loop might affect
> scheduling latencies.
I'm assuming you're referring to the fact that it's now possible to
iterate more times through the loop at the very last phase?
The first loop, which sets up the size of the folio, should only ever
execute once unless there is fairly severe memory pressure and allocations
fail.
The clamping code is unfortunate in my opinion, but it is possible that
we can either get an insanely huge count (based on inspection of other
code, intended to be -1) or a zero value.
The upper limit here is arbitrary, but in the usual case I'm expecting a
128KiB buffer to be allocated, which should usually succeed on the first
round.
The call to fuse_read_args_fill(), along with the clamping code, may result
in marginally longer-running code, but because larger buffers of dentries
are returned this is more than compensated for by the drastic reduction in
user-kernel context switches: far fewer getdents() system calls are needed
to iterate a full readdir().
Other systems may be more resilient, but glusterfs without my dentry
cache patches has a tendency to "forget" dentries under pressure, and
then has to restart getdents() runs via client-server round-trip hoops,
often resulting in getdents() system call durations upwards of 30s at a
time. With this patch, the overall latency comes down drastically, and
the number of these (inferred) restart runs on the server side to find
the right place to continue reading from is also reduced, even without
increasing the dentry cache in the glusterfs code. In my experience these
two patches in combination basically make glusterfs operate no worse
than NFS on average.
Given the feedback, I'll discuss with the team deploying updated kernels
including these patches to our production rather than test environments
(one node at a time, 21 in total), and will then provide feedback from
there. Our rule remains no more than one node at a time for potentially
disruptive changes like these, and preferably no more than one node a day
per environment.
In our test environment this panned out on the newest kernel (at the time
of mailing this in), and informal time trials (time find /mount/point, with
180m inodes) were quite positive - orders of magnitude better than without
the patches (we killed the unpatched run after it took about 3x longer than
the patched one, and it was still incomplete).
Kind regards,
Jaco
>
> David
>
>> Kind regards,
>> Jaco
>>
>> On 2025/03/15 00:16, Jaco Kroon wrote:
>>> This is a follow up to the attempt made a while ago.
>>>
>>> Whist the patch worked, newer kernels have moved from pages to folios,
>>> which gave me the motivation to implement the mechanism based on the
>>> userspace buffer size patch that Miklos supplied.
>>>
>>> That patch works as is, but I note there are changes to components
>>> (overlayfs and exportfs) that I've got very little experience with, and
>>> have not tested specifically here. They do look logical. I've marked
>>> Miklos as the Author: here, and added my own Signed-off-by - I hope this
>>> is correct.
>>>
>>> The second patch in the series implements the changes to fuse's readdir
>>> in order to utilize the first to enable reading more than one page of
>>> dent structures at a time from userspace, I've included a strace from
>>> before and after this patch in the commit to illustrate the difference.
>>>
>>> To get the relevant performance on glusterfs improved (which was
>>> mentioned elsewhere in the thread) changes to glusterfs to increase the
>>> number of cached dentries is also required (these are pushed to github
>>> but not yet merged, because similar to this patch, got stalled before
>>> getting to the "ready for merge" phase even though it's operational).
>>>
>>> Please advise if these two patches looks good (I've only done relatively
>>> basic testing now, and it's not running on production systems yet)
>>>
>>> Kind regards,
>>> Jaco
>>>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint
2025-03-14 22:16 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
@ 2025-03-29 9:20 ` Christophe JAILLET
2025-03-30 14:27 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Christophe JAILLET @ 2025-03-29 9:20 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
Hi,
a few nitpicks below to reduce unrelated changes in the patch
(trailing spaces).
Le 14/03/2025 à 23:16, Jaco Kroon a écrit :
> This was provided by Miklos <miklos@szeredi.hu> via LKML on 2023/07/27
> subject "Re: [PATCH] fuse: enable larger read buffers for readdir.".
>
> This is thus preperation for an improved fuse readdir() patch. The
s/preperation/preparation/
> description he provided:
>
> "The best strategy would be to find the optimal buffer size based on the size of
> the userspace buffer. Putting that info into struct dir_context doesn't sound
> too complicated...
>
> "Here's a patch. It doesn't touch readdir. Simply setting the fuse buffer size
> to the userspace buffer size should work, the record sizes are similar (fuse's
> is slightly larger than libc's, so no overflow should ever happen)."
...
> @@ -239,7 +240,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
>
> /*
> * New, all-improved, singing, dancing, iBCS2-compliant getdents()
> - * interface.
> + * interface.
Unrelated change.
> */
> struct linux_dirent {
> unsigned long d_ino;
...
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 2788df98080f..1e426e2b5999 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -308,7 +308,7 @@ struct iattr {
> */
> #define FILESYSTEM_MAX_STACK_DEPTH 2
>
> -/**
> +/**
Unrelated change.
> * enum positive_aop_returns - aop return codes with specific semantics
> *
> * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
> @@ -318,7 +318,7 @@ struct iattr {
> * be a candidate for writeback again in the near
> * future. Other callers must be careful to unlock
> * the page if they get this return. Returned by
> - * writepage();
> + * writepage();
Unrelated change.
> *
> * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
> * unlocked it and the page might have been truncated.
> @@ -1151,8 +1151,8 @@ struct file *get_file_active(struct file **f);
>
> #define MAX_NON_LFS ((1UL<<31) - 1)
>
> -/* Page cache limit. The filesystems should put that into their s_maxbytes
> - limits, otherwise bad things can happen in VM. */
> +/* Page cache limit. The filesystems should put that into their s_maxbytes
> + limits, otherwise bad things can happen in VM. */
Unrelated change.
> #if BITS_PER_LONG==32
> #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT)
> #elif BITS_PER_LONG==64
> @@ -2073,6 +2073,13 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
> struct dir_context {
> filldir_t actor;
> loff_t pos;
> + /*
> + * Filesystems MUST NOT MODIFY count, but may use as a hint:
> + * 0 unknown
> + * > 0 space in buffer (assume at least one entry)
> + * INT_MAX unlimited
> + */
> + int count;
> };
>
> /*
> @@ -2609,7 +2616,7 @@ int sync_inode_metadata(struct inode *inode, int wait);
> struct file_system_type {
> const char *name;
> int fs_flags;
> -#define FS_REQUIRES_DEV 1
> +#define FS_REQUIRES_DEV 1
Unrelated change.
> #define FS_BINARY_MOUNTDATA 2
> #define FS_HAS_SUBTYPE 4
> #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
> @@ -3189,7 +3196,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
> extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
> extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
> extern struct file * open_exec(const char *);
> -
> +
Unrelated change.
> /* fs/dcache.c -- generic fs support functions */
> extern bool is_subdir(struct dentry *, struct dentry *);
> extern bool path_is_under(const struct path *, const struct path *);
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint
2025-03-29 9:20 ` Christophe JAILLET
@ 2025-03-30 14:27 ` Jaco Kroon
0 siblings, 0 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-03-30 14:27 UTC (permalink / raw)
To: Christophe JAILLET
Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
ACK. Fixed locally.
Thank you.
Will give a few more days before re-sending.
Kind regards,
Jaco
On 2025/03/29 11:20, Christophe JAILLET wrote:
> Hi,
>
> a few nitpicks below to reduce the diffpatch with unrelated changes.
> (trainling spaces)
>
>
> Le 14/03/2025 à 23:16, Jaco Kroon a écrit :
>> This was provided by Miklos <miklos@szeredi.hu> via LKML on 2023/07/27
>> subject "Re: [PATCH] fuse: enable larger read buffers for readdir.".
>>
>> This is thus preperation for an improved fuse readdir() patch. The
>
> s/preperation/preparation/
>
>> description he provided:
>>
>> "The best strategy would be to find the optimal buffer size based on
>> the size of
>> the userspace buffer. Putting that info into struct dir_context
>> doesn't sound
>> too complicated...
>>
>> "Here's a patch. It doesn't touch readdir. Simply setting the fuse
>> buffer size
>> to the userspace buffer size should work, the record sizes are
>> similar (fuse's
>> is slightly larger than libc's, so no overflow should ever happen)."
>
> ...
>
>> @@ -239,7 +240,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
>> /*
>> * New, all-improved, singing, dancing, iBCS2-compliant getdents()
>> - * interface.
>> + * interface.
>
> Unrelated change.
>
>> */
>> struct linux_dirent {
>> unsigned long d_ino;
>
> ...
>
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 2788df98080f..1e426e2b5999 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -308,7 +308,7 @@ struct iattr {
>> */
>> #define FILESYSTEM_MAX_STACK_DEPTH 2
>> -/**
>> +/**
>
> Unrelated change.
>
>> * enum positive_aop_returns - aop return codes with specific
>> semantics
>> *
>> * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
>> @@ -318,7 +318,7 @@ struct iattr {
>> * be a candidate for writeback again in the near
>> * future. Other callers must be careful to unlock
>> * the page if they get this return. Returned by
>> - * writepage();
>> + * writepage();
>
> Unrelated change.
>
>> *
>> * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked
>> page has
>> * unlocked it and the page might have been truncated.
>> @@ -1151,8 +1151,8 @@ struct file *get_file_active(struct file **f);
>> #define MAX_NON_LFS ((1UL<<31) - 1)
>> -/* Page cache limit. The filesystems should put that into their
>> s_maxbytes
>> - limits, otherwise bad things can happen in VM. */
>> +/* Page cache limit. The filesystems should put that into their
>> s_maxbytes
>> + limits, otherwise bad things can happen in VM. */
>
> Unrelated change.
>
>> #if BITS_PER_LONG==32
>> #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT)
>> #elif BITS_PER_LONG==64
>> @@ -2073,6 +2073,13 @@ typedef bool (*filldir_t)(struct dir_context
>> *, const char *, int, loff_t, u64,
>> struct dir_context {
>> filldir_t actor;
>> loff_t pos;
>> + /*
>> + * Filesystems MUST NOT MODIFY count, but may use as a hint:
>> + * 0 unknown
>> + * > 0 space in buffer (assume at least one entry)
>> + * INT_MAX unlimited
>> + */
>> + int count;
>> };
>> /*
>> @@ -2609,7 +2616,7 @@ int sync_inode_metadata(struct inode *inode,
>> int wait);
>> struct file_system_type {
>> const char *name;
>> int fs_flags;
>> -#define FS_REQUIRES_DEV 1
>> +#define FS_REQUIRES_DEV 1
>
> Unrelated change.
>
>> #define FS_BINARY_MOUNTDATA 2
>> #define FS_HAS_SUBTYPE 4
>> #define FS_USERNS_MOUNT 8 /* Can be mounted by userns
>> root */
>> @@ -3189,7 +3196,7 @@ ssize_t __kernel_read(struct file *file, void
>> *buf, size_t count, loff_t *pos);
>> extern ssize_t kernel_write(struct file *, const void *, size_t,
>> loff_t *);
>> extern ssize_t __kernel_write(struct file *, const void *, size_t,
>> loff_t *);
>> extern struct file * open_exec(const char *);
>> -
>> +
>
> Unrelated change.
>
>> /* fs/dcache.c -- generic fs support functions */
>> extern bool is_subdir(struct dentry *, struct dentry *);
>> extern bool path_is_under(const struct path *, const struct path *);
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-03-14 22:16 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
@ 2025-03-31 16:41 ` Joanne Koong
2025-03-31 20:43 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Joanne Koong @ 2025-03-31 16:41 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
On Fri, Mar 14, 2025 at 3:39 PM Jaco Kroon <jaco@uls.co.za> wrote:
>
> Clamp to min 1 page (4KB) and max 128 pages (512KB).
>
> Glusterfs trial using strace ls -l.
>
> Before:
>
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
> getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
> getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0
>
> After:
>
> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
>
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
> fs/fuse/readdir.c | 22 ++++++++++++++++++----
> 1 file changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index 17ce9636a2b1..a0ccbc84b000 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -337,11 +337,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> struct fuse_mount *fm = get_fuse_mount(inode);
> struct fuse_io_args ia = {};
> struct fuse_args_pages *ap = &ia.ap;
> - struct fuse_folio_desc desc = { .length = PAGE_SIZE };
> + struct fuse_folio_desc desc = { .length = ctx->count };
> u64 attr_version = 0, evict_ctr = 0;
> bool locked;
> + int order;
>
> - folio = folio_alloc(GFP_KERNEL, 0);
> + if (desc.length < PAGE_SIZE)
> + desc.length = PAGE_SIZE;
> + else if (desc.length > (PAGE_SIZE << 7)) /* 128 pages, typically 512KB */
> + desc.length = PAGE_SIZE << 7;
> +
Just wondering, how did 128 pages get decided as the upper bound? It
seems to me to make more sense if the upper bound is fc->max_pages.
Also btw, I think you can just use the clamp() helper from
<linux/minmax.h> to do the clamping
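A minimal sketch of how that might look (illustrative only, not tested; the
size_t casts are an assumption to satisfy clamp()'s strict typechecking):

    desc.length = clamp((size_t)desc.length, (size_t)PAGE_SIZE,
                        (size_t)fm->fc->max_pages << PAGE_SHIFT);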
> + order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> +
> + do {
> + folio = folio_alloc(GFP_KERNEL, order);
Folios can now be larger than one page size for readdir requests with
your change but I don't believe the current page copying code in fuse
supports this yet. For example, I think the kmapping will be
insufficient in fuse_copy_page() where in the current code we kmap
only the first page in the folio. I sent a patch for supporting large
folios page copying [1] and am trying to get this merged in but
haven't heard back about this patchset yet. In your local tests that
used multiple pages for the readdir request, did you run into any
issues or it worked fine?
[1] https://lore.kernel.org/linux-fsdevel/20250123012448.2479372-2-joannelkoong@gmail.com/
Thanks,
Joanne
> + if (folio)
> + break;
> + --order;
> + desc.length = PAGE_SIZE << order;
> + } while (order >= 0);
> if (!folio)
> return -ENOMEM;
>
> @@ -353,10 +367,10 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> if (plus) {
> attr_version = fuse_get_attr_version(fm->fc);
> evict_ctr = fuse_get_evict_ctr(fm->fc);
> - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
> + fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
> FUSE_READDIRPLUS);
> } else {
> - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
> + fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
> FUSE_READDIR);
> }
> locked = fuse_lock_inode(inode);
> --
> 2.48.1
>
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-03-31 16:41 ` Joanne Koong
@ 2025-03-31 20:43 ` Jaco Kroon
2025-03-31 21:48 ` Joanne Koong
0 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-03-31 20:43 UTC (permalink / raw)
To: Joanne Koong
Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
Hi,
On 2025/03/31 18:41, Joanne Koong wrote:
> On Fri, Mar 14, 2025 at 3:39 PM Jaco Kroon<jaco@uls.co.za> wrote:
>> Clamp to min 1 page (4KB) and max 128 pages (512KB).
>>
>> Glusterfs trial using strace ls -l.
>>
>> Before:
>>
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
>> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
>> getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
>> getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0
>>
>> After:
>>
>> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
>> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
>>
>> Signed-off-by: Jaco Kroon<jaco@uls.co.za>
>> ---
>> fs/fuse/readdir.c | 22 ++++++++++++++++++----
>> 1 file changed, 18 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index 17ce9636a2b1..a0ccbc84b000 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -337,11 +337,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>> struct fuse_mount *fm = get_fuse_mount(inode);
>> struct fuse_io_args ia = {};
>> struct fuse_args_pages *ap = &ia.ap;
>> - struct fuse_folio_desc desc = { .length = PAGE_SIZE };
>> + struct fuse_folio_desc desc = { .length = ctx->count };
>> u64 attr_version = 0, evict_ctr = 0;
>> bool locked;
>> + int order;
>>
>> - folio = folio_alloc(GFP_KERNEL, 0);
>> + if (desc.length < PAGE_SIZE)
>> + desc.length = PAGE_SIZE;
>> + else if (desc.length > (PAGE_SIZE << 7)) /* 128 pages, typically 512KB */
>> + desc.length = PAGE_SIZE << 7;
>> +
> Just wondering, how did 128 pages get decided as the upper bound? It
> seems to me to make more sense if the upper bound is fc->max_pages.
Best answer ... random/guess at something which may be sensible.
> Also btw, I think you can just use the clamp() helper from
> <linux/minmax.h> to do the clamping
Thanks. Not a regular contributor to the kernel, not often that I've
got an itch that needs scratching here :).
So something like this then:
    desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << CONFIG_PAGE_SHIFT);
    order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
Note: Can't use ctx->count here in clamp directly due to it being signed,
where desc.length is unsigned.
I'm *assuming* get_count_order will round up, so if max_pages is 7 (if a
non-power-of-two is even possible) we will really get 8 pages here?
Compile tested only. Will perform basic run-time test before re-submit.
>> + order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
>> +
>> + do {
>> + folio = folio_alloc(GFP_KERNEL, order);
> Folios can now be larger than one page size for readdir requests with
> your change but I don't believe the current page copying code in fuse
> supports this yet. For example, I think the kmapping will be
> insufficient in fuse_copy_page() where in the current code we kmap
> only the first page in the folio. I sent a patch for supporting large
> folios page copying [1] and am trying to get this merged in but
> haven't heard back about this patchset yet. In your local tests that
> used multiple pages for the readdir request, did you run into any
> issues or it worked fine?
My tests boiled down to running strace as per above, and then some basic
time trials using find /path/to/mount/point with and without the patch
over a fairly large structure containing about 170m inodes. No problems
observed. That said ... I've done similar before, and then introduced a
major memory leak that under load destroyed 100GB of RAM in minutes.
Thus why I'm looking for a few eyeballs on this before going to
production (what we have works, it's just on an older kernel).
If further improvements are possible that would be great, but based on
testing this is already at least a 10x improvement on readdir() performance.
> [1]https://lore.kernel.org/linux-fsdevel/20250123012448.2479372-2-joannelkoong@gmail.com/
Took a quick look, wish I could provide you some feedback but that's
beyond my kernel skill set to just eyeball.
Looks like you're primarily getting rid of the code that references the
pages inside the folio's and just operating on the folio's directly? A
side effect of which (your goal) is to enable larger copies rather than
small ones?
Thank you,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-03-31 20:43 ` Jaco Kroon
@ 2025-03-31 21:48 ` Joanne Koong
2025-03-31 23:01 ` Joanne Koong
0 siblings, 1 reply; 50+ messages in thread
From: Joanne Koong @ 2025-03-31 21:48 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
On Mon, Mar 31, 2025 at 1:43 PM Jaco Kroon <jaco@uls.co.za> wrote:
>
> Hi,
>
> On 2025/03/31 18:41, Joanne Koong wrote:
> > On Fri, Mar 14, 2025 at 3:39 PM Jaco Kroon<jaco@uls.co.za> wrote:
> >> Clamp to min 1 page (4KB) and max 128 pages (512KB).
> >>
> >> Glusterfs trial using strace ls -l.
> >>
> >> Before:
> >>
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
> >> getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
> >> getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0
> >>
> >> After:
> >>
> >> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
> >> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
> >>
> >> Signed-off-by: Jaco Kroon<jaco@uls.co.za>
> >> ---
> >> fs/fuse/readdir.c | 22 ++++++++++++++++++----
> >> 1 file changed, 18 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> >> index 17ce9636a2b1..a0ccbc84b000 100644
> >> --- a/fs/fuse/readdir.c
> >> +++ b/fs/fuse/readdir.c
> >> @@ -337,11 +337,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> >> struct fuse_mount *fm = get_fuse_mount(inode);
> >> struct fuse_io_args ia = {};
> >> struct fuse_args_pages *ap = &ia.ap;
> >> - struct fuse_folio_desc desc = { .length = PAGE_SIZE };
> >> + struct fuse_folio_desc desc = { .length = ctx->count };
> >> u64 attr_version = 0, evict_ctr = 0;
> >> bool locked;
> >> + int order;
> >>
> >> - folio = folio_alloc(GFP_KERNEL, 0);
> >> + if (desc.length < PAGE_SIZE)
> >> + desc.length = PAGE_SIZE;
> >> + else if (desc.length > (PAGE_SIZE << 7)) /* 128 pages, typically 512KB */
> >> + desc.length = PAGE_SIZE << 7;
> >> +
> > Just wondering, how did 128 pages get decided as the upper bound? It
> > seems to me to make more sense if the upper bound is fc->max_pages.
>
> Best answer ... random/guess at something which may be sensible.
>
> > Also btw, I think you can just use the clamp() helper from
> > <linux/minmax.h> to do the clamping
>
> Thanks. Not a regular contributor to the kernel, not often that I've
> got an itch that needs scratching here :).
>
> So something like this then:
>
> 345
> 346 desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages <<
> CONFIG_PAGE_SHIFT);
> 347 order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> 348
>
You can just use PAGE_SHIFT here instead of CONFIG_PAGE_SHIFT
> Note: Can use ctx->count here in clamp directly due to it being signed,
> where desc.length is unsigned.
>
> I'm *assuming* get_count_order will round-up, so if max_pages is 7 (if
> non-power of two is even possible) we will really get 8 pages here?
Yes, if you have a max_pages of 7, this will round up and return to
you an order of 3
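As a quick illustration of that rounding (hypothetical values, not from the
patch):

    int order;

    order = get_count_order(7);  /* 3: rounds up, 1 << 3 == 8 pages */
    order = get_count_order(8);  /* 3: an exact power of two keeps its order */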
>
> Compile tested only. Will perform basic run-time test before re-submit.
>
> >> + order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> >> +
> >> + do {
> >> + folio = folio_alloc(GFP_KERNEL, order);
> > Folios can now be larger than one page size for readdir requests with
> > your change but I don't believe the current page copying code in fuse
> > supports this yet. For example, I think the kmapping will be
> > insufficient in fuse_copy_page() where in the current code we kmap
> > only the first page in the folio. I sent a patch for supporting large
> > folios page copying [1] and am trying to get this merged in but
> > haven't heard back about this patchset yet. In your local tests that
> > used multiple pages for the readdir request, did you run into any
> > issues or it worked fine?
>
> My tests boiled down to running strace as per above, and then some basic
> time trials using find /path/to/mount/point with and without the patch
> over a fairly large structure containing about 170m inodes. No problems
> observed. That said ... I've done similar before, and then introduced a
> major memory leak that under load destroyed 100GB of RAM in minutes.
> Thus why I'm looking for a few eyeballs on this before going to
> production (what we have works, it's just on an older kernel).
>
> If further improvements are possible that would be great, but based on
> testing this is already at least a 10x improvement on readdir() performance.
>
I think you need the patch I linked to or this could cause crashes.
The patch adds support for copying folios larger than 1 page size in
fuse. Maybe you're not running into the crash because it's going
through splice which will circumvent copying, but in the non-splice
case, I believe the kmap is insufficient when you go to do the actual
copying. IOW, I think that patch is a dependency for this one.
Thanks,
Joanne
> > [1]https://lore.kernel.org/linux-fsdevel/20250123012448.2479372-2-joannelkoong@gmail.com/
> Took a quick look, wish I could provide you some feedback but that's
> beyond my kernel skill set to just eyeball.
>
> Looks like you're primarily getting rid of the code that references the
> pages inside the folio's and just operating on the folio's directly? A
> side effect of which (your goal) is to enable larger copies rather than
> small ones?
>
> Thank you,
> Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-03-31 21:48 ` Joanne Koong
@ 2025-03-31 23:01 ` Joanne Koong
0 siblings, 0 replies; 50+ messages in thread
From: Joanne Koong @ 2025-03-31 23:01 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, miklos, rdunlap,
trapexit
On Mon, Mar 31, 2025 at 2:48 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Mar 31, 2025 at 1:43 PM Jaco Kroon <jaco@uls.co.za> wrote:
> >
> > Hi,
> >
> > On 2025/03/31 18:41, Joanne Koong wrote:
> > > On Fri, Mar 14, 2025 at 3:39 PM Jaco Kroon<jaco@uls.co.za> wrote:
> > >> Clamp to min 1 page (4KB) and max 128 pages (512KB).
> > >>
> > >> Glusterfs trial using strace ls -l.
> > >>
> > >> Before:
> > >>
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
> > >> getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
> > >> getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
> > >> getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0
> > >>
> > >> After:
> > >>
> > >> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
> > >> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
> > >>
> > >> Signed-off-by: Jaco Kroon<jaco@uls.co.za>
> > >> ---
> > >> fs/fuse/readdir.c | 22 ++++++++++++++++++----
> > >> 1 file changed, 18 insertions(+), 4 deletions(-)
> > >>
> > >> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> > >> index 17ce9636a2b1..a0ccbc84b000 100644
> > >> --- a/fs/fuse/readdir.c
> > >> +++ b/fs/fuse/readdir.c
> > >> @@ -337,11 +337,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> > >> struct fuse_mount *fm = get_fuse_mount(inode);
> > >> struct fuse_io_args ia = {};
> > >> struct fuse_args_pages *ap = &ia.ap;
> > >> - struct fuse_folio_desc desc = { .length = PAGE_SIZE };
> > >> + struct fuse_folio_desc desc = { .length = ctx->count };
> > >> u64 attr_version = 0, evict_ctr = 0;
> > >> bool locked;
> > >> + int order;
> > >>
> > >> - folio = folio_alloc(GFP_KERNEL, 0);
> > >> + if (desc.length < PAGE_SIZE)
> > >> + desc.length = PAGE_SIZE;
> > >> + else if (desc.length > (PAGE_SIZE << 7)) /* 128 pages, typically 512KB */
> > >> + desc.length = PAGE_SIZE << 7;
> > >> +
> > > Just wondering, how did 128 pages get decided as the upper bound? It
> > > seems to me to make more sense if the upper bound is fc->max_pages.
> >
> > Best answer ... random/guess at something which may be sensible.
> >
> > > Also btw, I think you can just use the clamp() helper from
> > > <linux/minmax.h> to do the clamping
> >
> > Thanks. Not a regular contributor to the kernel, not often that I've
> > got an itch that needs scratching here :).
> >
> > So something like this then:
> >
> >     desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << CONFIG_PAGE_SHIFT);
> >     order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> >
>
> You can just use PAGE_SHIFT here instead of CONFIG_PAGE_SHIFT
>
> > Note: Can use ctx->count here in clamp directly due to it being signed,
> > where desc.length is unsigned.
> >
> > I'm *assuming* get_count_order will round-up, so if max_pages is 7 (if
> > non-power of two is even possible) we will really get 8 pages here?
>
> Yes, if you have a max_pages of 7, this will round up and return to
> you an order of 3
>
> >
> > Compile tested only. Will perform basic run-time test before re-submit.
> >
> > >> + order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> > >> +
> > >> + do {
> > >> + folio = folio_alloc(GFP_KERNEL, order);
> > > Folios can now be larger than one page size for readdir requests with
> > > your change but I don't believe the current page copying code in fuse
> > > supports this yet. For example, I think the kmapping will be
> > > insufficient in fuse_copy_page() where in the current code we kmap
> > > only the first page in the folio. I sent a patch for supporting large
> > > folios page copying [1] and am trying to get this merged in but
> > > haven't heard back about this patchset yet. In your local tests that
> > > used multiple pages for the readdir request, did you run into any
> > > issues or it worked fine?
> >
> > My tests boiled down to running strace as per above, and then some basic
> > time trials using find /path/to/mount/point with and without the patch
> > over a fairly large structure containing about 170m inodes. No problems
> > observed. That said ... I've done similar before, and then introduced a
> > major memory leak that under load destroyed 100GB of RAM in minutes.
> > Thus why I'm looking for a few eyeballs on this before going to
> > production (what we have works, it's just on an older kernel).
> >
> > If further improvements are possible that would be great, but based on
> > testing this is already at least a 10x improvement on readdir() performance.
> >
>
> I think you need the patch I linked to or this could cause crashes.
> The patch adds support for copying folios larger than 1 page size in
> fuse. Maybe you're not running into the crash because it's going
> through splice which will circumvent copying, but in the non-splice
> case, I believe the kmap is insufficient when you go to do the actual
> copying. IOW, I think that patch is a dependency for this one.
>
Update: I looked more into this and I was wrong, you don't need that
patch as a dependency. In fuse_copy_do(), the copy is still done page
by page regardless of how large the folio is (e.g. see
ncpy = min(*size, cs->len)). So please ignore my earlier comment about
this potentially accessing memory that hasn't been kmapped properly.
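For completeness, a generic sketch of the page-at-a-time pattern described
above (an illustration of the idea only, not fuse's actual copy path; the
helper name is made up):

    #include <linux/highmem.h>
    #include <linux/minmax.h>
    #include <linux/mm.h>
    #include <linux/string.h>

    /* Copy 'len' bytes out of a (possibly large) folio into 'dst', mapping
     * one page at a time so each kmap covers a single page even when the
     * folio spans many pages. */
    static void copy_from_large_folio(struct folio *folio, size_t len, char *dst)
    {
            size_t off = 0;

            while (off < len) {
                    size_t chunk = min_t(size_t, len - off,
                                         PAGE_SIZE - offset_in_page(off));
                    void *src = kmap_local_folio(folio, off);

                    memcpy(dst + off, src, chunk);
                    kunmap_local(src);
                    off += chunk;
            }
    }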
Thanks,
Joanne
> Thanks,
> Joanne
>
> > > [1]https://lore.kernel.org/linux-fsdevel/20250123012448.2479372-2-joannelkoong@gmail.com/
> > Took a quick look, wish I could provide you some feedback but that's
> > beyond my kernel skill set to just eyeball.
> >
> > Looks like you're primarily getting rid of the code that references the
> > pages inside the folio's and just operating on the folio's directly? A
> > side effect of which (your goal) is to enable larger copies rather than
> > small ones?
> >
> > Thank you,
> > Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* fuse: increase readdir() buffer size [v4]
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
` (2 preceding siblings ...)
2025-03-28 10:15 ` fuse: increase readdir() " Jaco Kroon
@ 2025-04-01 14:18 ` Jaco Kroon
2025-04-01 14:18 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
2025-04-01 14:18 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
3 siblings, 2 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-04-01 14:18 UTC (permalink / raw)
To: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong
Cc: miklos, rdunlap, trapexit, david.laight.linux
This is the fourth revision of the patch, second revision if you only
look at the folio variants and not consider the previous ones using
pages (ie, against kernel 6.13 onwards).
Basic testing was once more completed without any errors noticed.
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint
2025-04-01 14:18 ` fuse: increase readdir() buffer size [v4] Jaco Kroon
@ 2025-04-01 14:18 ` Jaco Kroon
2025-04-01 14:18 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
1 sibling, 0 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-04-01 14:18 UTC (permalink / raw)
To: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong
Cc: miklos, rdunlap, trapexit, david.laight.linux, Jaco Kroon
This was provided by Miklos <miklos@szeredi.hu> via LKML on 2023/07/27
subject "Re: [PATCH] fuse: enable larger read buffers for readdir.".
This is thus preparation for an improved fuse readdir() patch. The
description he provided:
"The best strategy would be to find the optimal buffer size based on the size of
the userspace buffer. Putting that info into struct dir_context doesn't sound
too complicated...
"Here's a patch. It doesn't touch readdir. Simply setting the fuse buffer size
to the userspace buffer size should work, the record sizes are similar (fuse's
is slightly larger than libc's, so no overflow should ever happen)."
Author: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
fs/exportfs/expfs.c | 1 +
fs/overlayfs/readdir.c | 12 +++++++++++-
fs/readdir.c | 29 ++++++++++++++---------------
include/linux/fs.h | 7 +++++++
4 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0c899cfba578..2015eea19097 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -286,6 +286,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 881ec5592da5..126c797751e9 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -351,6 +351,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@@ -571,6 +572,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = list,
.root = root,
};
@@ -672,6 +674,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
+ bool res;
if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@@ -686,7 +689,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
name, namelen, rdt->xinowarn);
}
- return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ ctx->count = orig_ctx->count;
+
+ return res;
}
static bool ovl_is_impure_dir(struct file *file)
@@ -713,6 +719,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
+ .ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@@ -1073,6 +1080,7 @@ int ovl_check_d_type_supported(const struct path *realpath)
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
+ .ctx.count = INT_MAX,
.d_type_supported = false,
};
@@ -1094,6 +1102,7 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@@ -1178,6 +1187,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
diff --git a/fs/readdir.c b/fs/readdir.c
index 0038efda417b..0107d5c129a4 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -252,7 +253,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -271,7 +271,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -296,7 +296,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -311,7 +311,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -329,7 +329,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -338,7 +338,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -356,7 +355,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
@@ -376,7 +375,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
@@ -392,7 +391,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -411,7 +410,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -475,6 +474,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -499,7 +499,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -518,7 +517,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -542,7 +541,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -557,8 +556,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
+ .ctx.count = count,
.current_dir = dirent,
- .count = count
};
int error;
@@ -575,7 +574,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2788df98080f..3b0e6b9a249c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2073,6 +2073,13 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
struct dir_context {
filldir_t actor;
loff_t pos;
+ /*
+ * Filesystems MUST NOT MODIFY count, but may use as a hint:
+ * 0 unknown
+ * > 0 space in buffer (assume at least one entry)
+ * INT_MAX unlimited
+ */
+ int count;
};
/*
--
2.48.1
^ permalink raw reply related [flat|nested] 50+ messages in thread
* [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-01 14:18 ` fuse: increase readdir() buffer size [v4] Jaco Kroon
2025-04-01 14:18 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
@ 2025-04-01 14:18 ` Jaco Kroon
2025-04-01 14:40 ` Miklos Szeredi
1 sibling, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-04-01 14:18 UTC (permalink / raw)
To: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong
Cc: miklos, rdunlap, trapexit, david.laight.linux, Jaco Kroon
Clamp to min 1 page (4KB) and max 128 pages (512KB).
Glusterfs trial using strace ls -l.
Before:
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0
After:
getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
fs/fuse/readdir.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..a13534f411b4 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -12,6 +12,7 @@
#include <linux/posix_acl.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/minmax.h>
static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
@@ -337,11 +338,21 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ struct fuse_folio_desc desc = { .length = ctx->count };
u64 attr_version = 0, evict_ctr = 0;
bool locked;
+ int order;
- folio = folio_alloc(GFP_KERNEL, 0);
+ desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << PAGE_SHIFT);
+ order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
+
+ do {
+ folio = folio_alloc(GFP_KERNEL, order);
+ if (folio)
+ break;
+ --order;
+ desc.length = PAGE_SIZE << order;
+ } while (order >= 0);
if (!folio)
return -ENOMEM;
@@ -353,10 +364,10 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
evict_ctr = fuse_get_evict_ctr(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
--
2.48.1
^ permalink raw reply related [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-01 14:18 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
@ 2025-04-01 14:40 ` Miklos Szeredi
2025-04-01 15:03 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Miklos Szeredi @ 2025-04-01 14:40 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
On Tue, 1 Apr 2025 at 16:29, Jaco Kroon <jaco@uls.co.za> wrote:
> After:
>
> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
This looks great. But see below.
>
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
> fs/fuse/readdir.c | 19 +++++++++++++++----
> 1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index 17ce9636a2b1..a13534f411b4 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -12,6 +12,7 @@
> #include <linux/posix_acl.h>
> #include <linux/pagemap.h>
> #include <linux/highmem.h>
> +#include <linux/minmax.h>
>
> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> {
> @@ -337,11 +338,21 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> struct fuse_mount *fm = get_fuse_mount(inode);
> struct fuse_io_args ia = {};
> struct fuse_args_pages *ap = &ia.ap;
> - struct fuse_folio_desc desc = { .length = PAGE_SIZE };
> + struct fuse_folio_desc desc = { .length = ctx->count };
> u64 attr_version = 0, evict_ctr = 0;
> bool locked;
> + int order;
>
> - folio = folio_alloc(GFP_KERNEL, 0);
> + desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << PAGE_SHIFT);
> + order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> +
> + do {
> + folio = folio_alloc(GFP_KERNEL, order);
> + if (folio)
> + break;
> + --order;
> + desc.length = PAGE_SIZE << order;
> + } while (order >= 0);
> if (!folio)
> return -ENOMEM;
Why not use kvmalloc instead?
We could also implement allocation based on size of result in dev.c to
optimize this, as most directories will be small, but that can be done
later.
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-01 14:40 ` Miklos Szeredi
@ 2025-04-01 15:03 ` Jaco Kroon
2025-04-01 15:33 ` Miklos Szeredi
0 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-04-01 15:03 UTC (permalink / raw)
To: Miklos Szeredi
Cc: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
Hi,
On 2025/04/01 16:40, Miklos Szeredi wrote:
> On Tue, 1 Apr 2025 at 16:29, Jaco Kroon <jaco@uls.co.za> wrote:
>> After:
>>
>> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
>> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
> This looks great. But see below.
>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>> fs/fuse/readdir.c | 19 +++++++++++++++----
>> 1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index 17ce9636a2b1..a13534f411b4 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -12,6 +12,7 @@
>> #include <linux/posix_acl.h>
>> #include <linux/pagemap.h>
>> #include <linux/highmem.h>
>> +#include <linux/minmax.h>
>>
>> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>> {
>> @@ -337,11 +338,21 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>> struct fuse_mount *fm = get_fuse_mount(inode);
>> struct fuse_io_args ia = {};
>> struct fuse_args_pages *ap = &ia.ap;
>> - struct fuse_folio_desc desc = { .length = PAGE_SIZE };
>> + struct fuse_folio_desc desc = { .length = ctx->count };
>> u64 attr_version = 0, evict_ctr = 0;
>> bool locked;
>> + int order;
>>
>> - folio = folio_alloc(GFP_KERNEL, 0);
>> + desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << PAGE_SHIFT);
>> + order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
>> +
>> + do {
>> + folio = folio_alloc(GFP_KERNEL, order);
>> + if (folio)
>> + break;
>> + --order;
>> + desc.length = PAGE_SIZE << order;
>> + } while (order >= 0);
>> if (!folio)
>> return -ENOMEM;
> Why not use kvmalloc instead?
Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
(ia) expects folios and changing that is more than what I'm capable of,
and has larger overall impact.
> We could also implement allocation based on size of result in dev.c to
> optimize this, as most directories will be small, but that can be done
> later.
This indeed sounds interesting and would be great, but again, beyond
what I'm capable of doing at this stage.
Great insights.
Thank you.
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-01 15:03 ` Jaco Kroon
@ 2025-04-01 15:33 ` Miklos Szeredi
2025-04-02 7:54 ` Jaco Kroon
2025-04-08 14:19 ` Bernd Schubert
0 siblings, 2 replies; 50+ messages in thread
From: Miklos Szeredi @ 2025-04-01 15:33 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
[-- Attachment #1: Type: text/plain, Size: 298 bytes --]
On Tue, 1 Apr 2025 at 17:04, Jaco Kroon <jaco@uls.co.za> wrote:
> Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
> (ia) expects folios and changing that is more than what I'm capable off,
> and has larger overall impact.
Attaching a minimally tested patch.
Thanks,
Miklos
[-- Attachment #2: fuse-kvmalloc-readdir-buf.patch --]
[-- Type: text/x-patch, Size: 2155 bytes --]
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..18e3950e5ff5 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -332,35 +332,31 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
{
int plus;
ssize_t res;
- struct folio *folio;
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
- struct fuse_args_pages *ap = &ia.ap;
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ struct fuse_args *args = &ia.ap.args;
+ void *buf;
+ size_t bufsize = 131072;
u64 attr_version = 0, evict_ctr = 0;
bool locked;
- folio = folio_alloc(GFP_KERNEL, 0);
- if (!folio)
+ buf = kvmalloc(bufsize, GFP_KERNEL);
+ if (!buf)
return -ENOMEM;
+ args->out_args[0].value = buf;
+
plus = fuse_use_readdirplus(inode, ctx);
- ap->args.out_pages = true;
- ap->num_folios = 1;
- ap->folios = &folio;
- ap->descs = &desc;
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
evict_ctr = fuse_get_evict_ctr(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIRPLUS);
+ fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIR);
+ fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fm, &ap->args);
+ res = fuse_simple_request(fm, args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
@@ -369,16 +365,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
if (ff->open_flags & FOPEN_CACHE_DIR)
fuse_readdir_cache_end(file, ctx->pos);
} else if (plus) {
- res = parse_dirplusfile(folio_address(folio), res,
- file, ctx, attr_version,
+ res = parse_dirplusfile(buf, res, file, ctx, attr_version,
evict_ctr);
} else {
- res = parse_dirfile(folio_address(folio), res, file,
- ctx);
+ res = parse_dirfile(buf, res, file, ctx);
}
}
- folio_put(folio);
+ kvfree(buf);
fuse_invalidate_atime(inode);
return res;
}
^ permalink raw reply related [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-01 15:33 ` Miklos Szeredi
@ 2025-04-02 7:54 ` Jaco Kroon
2025-04-02 8:18 ` Miklos Szeredi
2025-04-08 14:19 ` Bernd Schubert
1 sibling, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-04-02 7:54 UTC (permalink / raw)
To: Miklos Szeredi
Cc: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
Hi,
I can definitely build on that, thank you.
What's the advantage of kvmalloc over folio's here, why should it be
preferred?
Kind regards,
Jaco
On 2025/04/01 17:33, Miklos Szeredi wrote:
> On Tue, 1 Apr 2025 at 17:04, Jaco Kroon <jaco@uls.co.za> wrote:
>
>> Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
>> (ia) expects folios and changing that is more than what I'm capable of,
>> and has larger overall impact.
> Attaching a minimally tested patch.
>
> Thanks,
> Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-02 7:54 ` Jaco Kroon
@ 2025-04-02 8:18 ` Miklos Szeredi
2025-04-02 8:52 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Miklos Szeredi @ 2025-04-02 8:18 UTC (permalink / raw)
To: Jaco Kroon
Cc: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>
> Hi,
>
> I can definitely build on that, thank you.
>
> What's the advantage of kvmalloc over folio's here, why should it be
> preferred?
It offers the best of both worlds: first tries plain malloc (which
just does a folio alloc internally for size > PAGE_SIZE) and if that
fails, falls back to vmalloc, which should always succeed since it
uses order 0 pages.
This saves the trouble of iterating the folio alloc until it succeeds,
which is both undeterministic and complex, neither of which is
desirable.
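Roughly the fallback being described, as a simplified sketch (not the real
mm/util.c implementation):

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    /* Try a physically contiguous allocation first; if that fails, fall
     * back to vmalloc, which only needs order-0 pages. */
    static void *kvmalloc_sketch(size_t size)
    {
            void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

            if (!p)
                    p = vmalloc(size);
            return p;
    }

    /* kvfree() later checks is_vmalloc_addr() to decide between vfree()
     * and kfree(). */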
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-02 8:18 ` Miklos Szeredi
@ 2025-04-02 8:52 ` Jaco Kroon
2025-04-02 9:10 ` Bernd Schubert
0 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-04-02 8:52 UTC (permalink / raw)
To: Miklos Szeredi
Cc: bernd.schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
Hi,
On 2025/04/02 10:18, Miklos Szeredi wrote:
> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>> Hi,
>>
>> I can definitely build on that, thank you.
>>
>> What's the advantage of kvmalloc over folio's here, why should it be
>> preferred?
> It offers the best of both worlds: first tries plain malloc (which
> just does a folio alloc internally for size > PAGE_SIZE) and if that
> fails, falls back to vmalloc, which should always succeed since it
> uses order 0 pages.
So basically assigns the space, but doesn't commit physical pages for
the allocation, meaning first access will cause a page fault, and single
page allocation at that point in time? Or is it merely the fact that
vmalloc may return a virtual contiguous block that's not physically
contiguous?
Sorry if I'm asking notoriously dumb questions, I've got a VERY BASIC
grasp of memory management at kernel level, I work much more in
userspace, and I know there usually first access generates a page fault
which will then result in memory being physically allocated by the
kernel. Generally I ignore these complexities and just assume that the
"lower down" layers know what they're doing and I've got a "flat,
contiguous" memory space, and that malloc knows what it's doing and will
communicate with the kernel regarding which regions of virtual space
should be mapped. Love the learning though, so appreciate the feedback
very much.
>
> This saves the trouble of iterating the folio alloc until it succeeds,
> which is both undeterministic and complex, neither of which is
> desirable.
Agreed.
>
> Thanks,
> Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-02 8:52 ` Jaco Kroon
@ 2025-04-02 9:10 ` Bernd Schubert
2025-04-02 11:13 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2025-04-02 9:10 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
On 4/2/25 10:52, Jaco Kroon wrote:
> Hi,
>
> On 2025/04/02 10:18, Miklos Szeredi wrote:
>> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>>> Hi,
>>>
>>> I can definitely build on that, thank you.
>>>
>>> What's the advantage of kvmalloc over folio's here, why should it be
>>> preferred?
>> It offers the best of both worlds: first tries plain malloc (which
>> just does a folio alloc internally for size > PAGE_SIZE) and if that
>> fails, falls back to vmalloc, which should always succeed since it
>> uses order 0 pages.
>
> So basically assigns the space, but doesn't commit physical pages for
> the allocation, meaning first access will cause a page fault, and single
> page allocation at that point in time? Or is it merely the fact that
> vmalloc may return a virtual contiguous block that's not physically
> contiguous?
Yes, vmalloc-returned buffers might not be physically contiguous - not
suitable for hardware DMA. And AFAIK it is also a blocking allocation.
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-02 9:10 ` Bernd Schubert
@ 2025-04-02 11:13 ` Jaco Kroon
2025-04-02 11:35 ` Miklos Szeredi
2025-04-02 11:59 ` Bernd Schubert
0 siblings, 2 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-04-02 11:13 UTC (permalink / raw)
To: Bernd Schubert, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
Hi,
On 2025/04/02 11:10, Bernd Schubert wrote:
>
> On 4/2/25 10:52, Jaco Kroon wrote:
>> Hi,
>>
>> On 2025/04/02 10:18, Miklos Szeredi wrote:
>>> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>>>> Hi,
>>>>
>>>> I can definitely build on that, thank you.
>>>>
>>>> What's the advantage of kvmalloc over folio's here, why should it be
>>>> preferred?
>>> It offers the best of both worlds: first tries plain malloc (which
>>> just does a folio alloc internally for size > PAGE_SIZE) and if that
>>> fails, falls back to vmalloc, which should always succeed since it
>>> uses order 0 pages.
>> So basically assigns the space, but doesn't commit physical pages for
>> the allocation, meaning first access will cause a page fault, and single
>> page allocation at that point in time? Or is it merely the fact that
>> vmalloc may return a virtual contiguous block that's not physically
>> contiguous?
>
> Yes vmalloc return buffers might not be physically contiguous - not
> suitable for hardware DMA. And AFAIK it is also a blocking allocation.
How do I go about confirming? Can that behaviour be stopped so that in
the case where it would block we can return an EAGAIN or EWOULDBLOCK
error code instead? Is that even desired?
I don't think hardware DMA applies here, so that at least isn't a
concern, but the blocking might be?
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-02 11:13 ` Jaco Kroon
@ 2025-04-02 11:35 ` Miklos Szeredi
2025-04-02 11:59 ` Bernd Schubert
1 sibling, 0 replies; 50+ messages in thread
From: Miklos Szeredi @ 2025-04-02 11:35 UTC (permalink / raw)
To: Jaco Kroon
Cc: Bernd Schubert, linux-fsdevel, linux-kernel, christophe.jaillet,
joannelkoong, rdunlap, trapexit, david.laight.linux
On Wed, 2 Apr 2025 at 13:13, Jaco Kroon <jaco@uls.co.za> wrote:
> How do I go about confirming? Can that behaviour be stopped so that in
> the case where it would block we can return an EAGAIN or EWOULDBLOCK
> error code instead? Is that even desired?
All allocations except GFP_ATOMIC may block (that applies to
folio_alloc() and kmalloc() too). This shouldn't be a worry in the
readdir path. Freeing can safely be done in an atomic context.
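A tiny illustration of that distinction (hypothetical call sites, just to
show the flags):

    #include <linux/slab.h>

    static void gfp_flags_example(void)
    {
            /* GFP_KERNEL may sleep to reclaim memory - fine in process
             * context such as the readdir path. */
            void *a = kmalloc(4096, GFP_KERNEL);

            /* GFP_ATOMIC never sleeps; reserved for contexts that cannot
             * block, and more likely to fail under memory pressure. */
            void *b = kmalloc(4096, GFP_ATOMIC);

            kfree(a);
            kfree(b);
    }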
Thanks,
Miklos
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-02 11:13 ` Jaco Kroon
2025-04-02 11:35 ` Miklos Szeredi
@ 2025-04-02 11:59 ` Bernd Schubert
1 sibling, 0 replies; 50+ messages in thread
From: Bernd Schubert @ 2025-04-02 11:59 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
On 4/2/25 13:13, Jaco Kroon wrote:
> Hi,
>
> On 2025/04/02 11:10, Bernd Schubert wrote:
>>
>> On 4/2/25 10:52, Jaco Kroon wrote:
>>> Hi,
>>>
>>> On 2025/04/02 10:18, Miklos Szeredi wrote:
>>>> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>>>>> Hi,
>>>>>
>>>>> I can definitely build on that, thank you.
>>>>>
>>>>> What's the advantage of kvmalloc over folio's here, why should it be
>>>>> preferred?
>>>> It offers the best of both worlds: first tries plain malloc (which
>>>> just does a folio alloc internally for size > PAGE_SIZE) and if that
>>>> fails, falls back to vmalloc, which should always succeed since it
>>>> uses order 0 pages.
>>> So basically assigns the space, but doesn't commit physical pages for
>>> the allocation, meaning first access will cause a page fault, and single
>>> page allocation at that point in time? Or is it merely the fact that
>>> vmalloc may return a virtual contiguous block that's not physically
>>> contiguous?
>>
>> Yes vmalloc return buffers might not be physically contiguous - not
>> suitable for hardware DMA. And AFAIK it is also a blocking allocation.
> How do I go about confirming? Can that behaviour be stopped so that in
> the case where it would block we can return an EAGAIN or EWOULDBLOCK
> error code instead? Is that even desired?
>
> Don't think hardware DMA is an issue here, so that's at least not an
> issue, but the blocking might be?
I was just describing what vmalloc does - neither of its disadvantages
is an issue here.
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-01 15:33 ` Miklos Szeredi
2025-04-02 7:54 ` Jaco Kroon
@ 2025-04-08 14:19 ` Bernd Schubert
2025-04-09 7:12 ` Jaco Kroon
1 sibling, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2025-04-08 14:19 UTC (permalink / raw)
To: Miklos Szeredi, Jaco Kroon
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
On 4/1/25 17:33, Miklos Szeredi wrote:
> On Tue, 1 Apr 2025 at 17:04, Jaco Kroon <jaco@uls.co.za> wrote:
>
>> Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
>> (ia) expects folios and changing that is more than what I'm capable of,
>> and has larger overall impact.
>
> Attaching a minimally tested patch.
Just tested this and looks good to me. Could we change to
- size_t bufsize = 131072;
+ size_t bufsize = fc->max_pages << PAGE_SHIFT;
?
I'm testing with that in my branch, going to run xfstests over night.
Reviewed-by: Bernd Schubert <bschubert@ddn.com>
Thanks,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-08 14:19 ` Bernd Schubert
@ 2025-04-09 7:12 ` Jaco Kroon
2025-04-09 8:31 ` Bernd Schubert
0 siblings, 1 reply; 50+ messages in thread
From: Jaco Kroon @ 2025-04-09 7:12 UTC (permalink / raw)
To: Bernd Schubert, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
Hi Bernd,
You sure you're looking at the newest version?
Or where does the below need to get applied?
I'm not seeing that in the patch to which you replied.
Kind regards,
Jaco
On 2025/04/08 16:19, Bernd Schubert wrote:
>
> On 4/1/25 17:33, Miklos Szeredi wrote:
>> On Tue, 1 Apr 2025 at 17:04, Jaco Kroon <jaco@uls.co.za> wrote:
>>
>>> Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
>>> (ia) expects folios and changing that is more than what I'm capable of,
>>> and has larger overall impact.
>> Attaching a minimally tested patch.
> Just tested this and looks good to me. Could we change to
>
> - size_t bufsize = 131072;
> + size_t bufsize = fc->max_pages << PAGE_SHIFT;
>
> ?
>
> I'm testing with that in my branch, going to run xfstests over night.
>
>
> Reviewed-by: Bernd Schubert <bschubert@ddn.com>
>
>
> Thanks,
> Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-09 7:12 ` Jaco Kroon
@ 2025-04-09 8:31 ` Bernd Schubert
2025-04-09 15:03 ` Jaco Kroon
0 siblings, 1 reply; 50+ messages in thread
From: Bernd Schubert @ 2025-04-09 8:31 UTC (permalink / raw)
To: Jaco Kroon, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
Hi Jaco,
On 4/9/25 09:12, Jaco Kroon wrote:
> Hi Bernd,
>
> You sure you're looking at the newest version?
>
> Or where does the below need to get applied?
>
> I'm not seeing that in the patch to which you replied.
I just applied Miklos' patch that uses kvmalloc. It applies
to the repo without your patch.
I also
pushed to a branch
https://github.com/bsbernd/linux/commit/2b5ca68656a4a47e35b8cf1dfcf39c4335066497
In this branch: https://github.com/bsbernd/linux/tree/fuse-large-readdir
Cheers,
Bernd
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size.
2025-04-09 8:31 ` Bernd Schubert
@ 2025-04-09 15:03 ` Jaco Kroon
0 siblings, 0 replies; 50+ messages in thread
From: Jaco Kroon @ 2025-04-09 15:03 UTC (permalink / raw)
To: Bernd Schubert, Miklos Szeredi
Cc: linux-fsdevel, linux-kernel, christophe.jaillet, joannelkoong,
rdunlap, trapexit, david.laight.linux
Hi,
On 2025/04/09 10:31, Bernd Schubert wrote:
> Hi Jaco,
>
> On 4/9/25 09:12, Jaco Kroon wrote:
>> Hi Bernd,
>>
>> You sure you're looking at the newest version?
>>
>> Or where does the below need to get applied?
>>
>> I'm not seeing that in the patch to which you replied.
>
> I just applied Miklos' patch that uses vmalloc. It applies
> to the repo without your patch
Ok, I was looking at the wrong patch.
>
> I also
> pushed to a branch
>
> https://github.com/bsbernd/linux/commit/2b5ca68656a4a47e35b8cf1dfcf39c4335066497
Eyeballing that, it looks good; I'll test in the next few days too.
>
> In this branch: https://github.com/bsbernd/linux/tree/fuse-large-readdir
https://github.com/torvalds/linux/commit/fb34f89d8a8635ae0fed6568269d319b32155fdb
I think that should be authored by/attributed to Miklos, not myself. I
did test and verify it as best I can, so I'm happy to sign off on it,
but I was not the original author here.
Kind regards,
Jaco
^ permalink raw reply [flat|nested] 50+ messages in thread
end of thread, other threads:[~2025-04-09 15:03 UTC | newest]
Thread overview: 50+ messages
2023-07-26 10:59 [PATCH] fuse: enable larger read buffers for readdir Jaco Kroon
2023-07-26 11:43 ` Jaco Kroon
2023-07-26 13:53 ` Bernd Schubert
2023-07-26 15:26 ` Jaco Kroon
2023-07-26 15:30 ` Bernd Schubert
2023-07-26 15:45 ` Bernd Schubert
2023-07-26 17:23 ` Antonio SJ Musumeci
2023-07-26 18:25 ` Jaco Kroon
2023-07-27 15:21 ` Miklos Szeredi
2023-07-27 19:21 ` Miklos Szeredi
2023-07-27 19:43 ` Bernd Schubert
2023-07-28 8:42 ` Miklos Szeredi
2023-07-26 15:19 ` Randy Dunlap
2023-07-27 8:12 ` [PATCH] fuse: enable larger read buffers for readdir [v2] Jaco Kroon
2023-07-27 15:35 ` Miklos Szeredi
2023-07-27 16:58 ` Jaco Kroon
2023-07-27 19:17 ` Miklos Szeredi
2023-07-27 19:16 ` Bernd Schubert
2023-07-27 20:35 ` Bernd Schubert
2023-07-28 5:05 ` Jaco Kroon
2025-03-14 22:16 ` fuse: increase readdir() buffer size Jaco Kroon
2025-03-14 22:16 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
2025-03-29 9:20 ` Christophe JAILLET
2025-03-30 14:27 ` Jaco Kroon
2025-03-14 22:16 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
2025-03-31 16:41 ` Joanne Koong
2025-03-31 20:43 ` Jaco Kroon
2025-03-31 21:48 ` Joanne Koong
2025-03-31 23:01 ` Joanne Koong
2025-03-28 10:15 ` fuse: increase readdir() " Jaco Kroon
2025-03-28 10:16 ` Bernd Schubert
2025-03-28 19:40 ` David Laight
2025-03-29 8:59 ` Jaco Kroon
2025-04-01 14:18 ` fuse: increase readdir() buffer size [v4] Jaco Kroon
2025-04-01 14:18 ` [PATCH 1/2] fs: Supply dir_context.count as readdir buffer size hint Jaco Kroon
2025-04-01 14:18 ` [PATCH 2/2] fuse: Adjust readdir() buffer to requesting buffer size Jaco Kroon
2025-04-01 14:40 ` Miklos Szeredi
2025-04-01 15:03 ` Jaco Kroon
2025-04-01 15:33 ` Miklos Szeredi
2025-04-02 7:54 ` Jaco Kroon
2025-04-02 8:18 ` Miklos Szeredi
2025-04-02 8:52 ` Jaco Kroon
2025-04-02 9:10 ` Bernd Schubert
2025-04-02 11:13 ` Jaco Kroon
2025-04-02 11:35 ` Miklos Szeredi
2025-04-02 11:59 ` Bernd Schubert
2025-04-08 14:19 ` Bernd Schubert
2025-04-09 7:12 ` Jaco Kroon
2025-04-09 8:31 ` Bernd Schubert
2025-04-09 15:03 ` Jaco Kroon