* [RFC] ext4: use kmem_cache for short fname allocation in readdir
@ 2025-05-29 14:42 David Wang
2025-05-29 21:44 ` Andreas Dilger
2025-05-30 4:35 ` Theodore Ts'o
0 siblings, 2 replies; 5+ messages in thread
From: David Wang @ 2025-05-29 14:42 UTC (permalink / raw)
To: tytso, adilger.kernel; +Cc: linux-ext4, linux-kernel, David Wang
When searching files, ext4_readdir() kzalloc()s a fname
object for each directory entry. It would be faster if a dedicated
kmem_cache were used for fname.
But fnames are of variable length.
This patch suggests using a kmem_cache for fnames of short
length, and falling back to kmalloc() when the fname needs a larger
buffer, assuming long file names are not very common.
Profiling a file search in the kernel code base, with the following
command:
# perf record -g -e cpu-clock --freq=max bash -c \
"for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done"
Sample counts are used as an indicator of performance improvement.
(The faster the run, the fewer samples collected. The tests are carried
out while the system is under no memory pressure.)
Before the change:
1232868--ext4_readdir
|
|--839085--ext4_htree_fill_tree
| |
| --829223--htree_dirblock_to_tree
| |
| |--365869--ext4_htree_store_dirent
| | |
| | |--43169--0xffffffffa7f8d094
| | |
| | --21947--0xffffffffa7f8d0f7
| |
| |--213124--ext4fs_dirhash
| | |
| | --86339--str2hashbuf_signed
| |
| |--145839--__ext4_read_dirblock
and with the change, ~3% fewer samples:
1202922--ext4_readdir
|
|--805105--ext4_htree_fill_tree
| |
| --795055--htree_dirblock_to_tree
| |
| |--328876--ext4_htree_store_dirent
| | |
| | |--123207--kmem_cache_alloc_noprof
| | | |
| | | |--26453--__alloc_tagging_slab_alloc_hook
| | | |
| | | --20413--__slab_alloc.isra.0
| | |
| | --31566--rb_insert_color
| |
| |--212915--ext4fs_dirhash
| | |
| | --86004--str2hashbuf_signed
| |
| |--149146--__ext4_read_dirblock
readdir() itself shows a significant improvement, but the overall
improvement for searching files is only ~0.5%; it might be more
significant if the system is under some memory pressure.
The slab stats after the test:
ext4_dir_fname 1242 1242 176 23 1 : tunables 0 0 0 : slabdata 54 54 0
Signed-off-by: David Wang <00107082@163.com>
---
fs/ext4/dir.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
fs/ext4/ext4.h | 4 ++++
fs/ext4/super.c | 3 +++
3 files changed, 51 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d4164c507a90..3adfa0d038cd 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -424,6 +424,48 @@ struct fname {
char name[] __counted_by(name_len);
};
+#define EXT4_DIR_FNAME_SHORT_LENGTH 127
+static struct kmem_cache *ext4_dir_fname_cachep;
+
+void __init ext4_init_dir(void)
+{
+ ext4_dir_fname_cachep = kmem_cache_create_usercopy("ext4_dir_fname",
+ struct_size_t(struct fname, name, EXT4_DIR_FNAME_SHORT_LENGTH + 1),
+ 0, SLAB_RECLAIM_ACCOUNT,
+ offsetof(struct fname, name), EXT4_DIR_FNAME_SHORT_LENGTH + 1,
+ NULL);
+}
+
+void ext4_exit_dir(void)
+{
+ if (ext4_dir_fname_cachep)
+ kmem_cache_destroy(ext4_dir_fname_cachep);
+}
+
+static struct fname *rb_node_fname_zalloc(__u8 name_len)
+{
+ struct fname *p;
+ if (ext4_dir_fname_cachep && name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
+ p = kmem_cache_alloc(ext4_dir_fname_cachep, GFP_KERNEL);
+ else
+ p = kmalloc(struct_size(p, name, name_len + 1), GFP_KERNEL);
+ if (p) {
+ /* no need to fill name with zeroes*/
+ memset(p, 0, offsetof(struct fname, name));
+ p->name[name_len] = 0;
+ }
+ return p;
+}
+
+static void rb_node_fname_free(struct fname *p) {
+ if (!p)
+ return;
+ if (ext4_dir_fname_cachep && p->name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
+ kmem_cache_free(ext4_dir_fname_cachep, p);
+ else
+ kfree(p);
+}
+
/*
* This function implements a non-recursive way of freeing all of the
* nodes in the red-black tree.
@@ -436,7 +478,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (fname) {
struct fname *old = fname;
fname = fname->next;
- kfree(old);
+ rb_node_fname_free(old);
}
*root = RB_ROOT;
@@ -479,8 +521,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
- GFP_KERNEL);
+ new_fn = rb_node_fname_zalloc(ent_name->len);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5a20e9cd7184..33ab97143000 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3795,6 +3795,10 @@ extern void ext4_orphan_file_block_trigger(
struct buffer_head *bh,
void *data, size_t size);
+/* dir.c */
+extern void __init ext4_init_dir(void);
+extern void ext4_exit_dir(void);
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 181934499624..21ce3d78912a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -7457,6 +7457,8 @@ static int __init ext4_init_fs(void)
if (err)
goto out;
+ ext4_init_dir();
+
return 0;
out:
unregister_as_ext2();
@@ -7497,6 +7499,7 @@ static void __exit ext4_exit_fs(void)
ext4_exit_post_read_processing();
ext4_exit_es();
ext4_exit_pending();
+ ext4_exit_dir();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
--
2.39.2
* Re: [RFC] ext4: use kmem_cache for short fname allocation in readdir
2025-05-29 14:42 [RFC] ext4: use kmem_cache for short fname allocation in readdir David Wang
@ 2025-05-29 21:44 ` Andreas Dilger
2025-05-30 2:27 ` David Wang
2025-05-30 4:35 ` Theodore Ts'o
1 sibling, 1 reply; 5+ messages in thread
From: Andreas Dilger @ 2025-05-29 21:44 UTC (permalink / raw)
To: David Wang; +Cc: Theodore Ts'o, Ext4 Developers List, LKML
On May 29, 2025, at 8:42 AM, David Wang <00107082@163.com> wrote:
>
> When searching files, ext4_readdir() kzalloc()s a fname
> object for each directory entry. It would be faster if a dedicated
> kmem_cache were used for fname.
>
> But fnames are of variable length.
>
> This patch suggests using a kmem_cache for fnames of short
> length, and falling back to kmalloc() when the fname needs a larger
> buffer, assuming long file names are not very common.
It may be reasonable to have a cache, but I suspect that 127 bytes
is too large for most common workloads. Statistics that I've seen
show the 99th-percentile filename length is <= 48 bytes on most filesystems,
so allocating 128 bytes is probably sub-optimal (most is wasted).
That said, kmalloc() is mostly a wrapper for the kmalloc-* slabs, so
creating a 128-byte slab for this doesn't seem super useful?
> Profiling a file search in the kernel code base, with the following
> command:
> # perf record -g -e cpu-clock --freq=max bash -c \
> "for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done"
> Sample counts are used as an indicator of performance improvement.
> (The faster the run, the fewer samples collected. The tests are carried
> out while the system is under no memory pressure.)
>
> Before the change:
> 1232868--ext4_readdir
> |
> |--839085--ext4_htree_fill_tree
> | |
> | --829223--htree_dirblock_to_tree
> | |
> | |--365869--ext4_htree_store_dirent
> | | |
> | | |--43169--0xffffffffa7f8d094
> | | |
> | | --21947--0xffffffffa7f8d0f7
> | |
> | |--213124--ext4fs_dirhash
> | | |
> | | --86339--str2hashbuf_signed
> | |
> | |--145839--__ext4_read_dirblock
>
> and with the change, ~3% fewer samples:
> 1202922--ext4_readdir
> |
> |--805105--ext4_htree_fill_tree
> | |
> | --795055--htree_dirblock_to_tree
> | |
> | |--328876--ext4_htree_store_dirent
> | | |
> | | |--123207--kmem_cache_alloc_noprof
> | | | |
> | | | |--26453--__alloc_tagging_slab_alloc_hook
> | | | |
> | | | --20413--__slab_alloc.isra.0
> | | |
> | | --31566--rb_insert_color
> | |
> | |--212915--ext4fs_dirhash
> | | |
> | | --86004--str2hashbuf_signed
> | |
> | |--149146--__ext4_read_dirblock
>
> readdir() itself shows a significant improvement, but the overall
> improvement for searching files is only ~0.5%; it might be more
> significant if the system is under some memory pressure.
>
> The slab stats after the test:
> ext4_dir_fname 1242 1242 176 23 1 : tunables 0 0 0 : slabdata 54 54 0
>
> Signed-off-by: David Wang <00107082@163.com>
> ---
> fs/ext4/dir.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
> fs/ext4/ext4.h | 4 ++++
> fs/ext4/super.c | 3 +++
> 3 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
> index d4164c507a90..3adfa0d038cd 100644
> --- a/fs/ext4/dir.c
> +++ b/fs/ext4/dir.c
> @@ -424,6 +424,48 @@ struct fname {
> char name[] __counted_by(name_len);
> };
>
> +#define EXT4_DIR_FNAME_SHORT_LENGTH 127
> +static struct kmem_cache *ext4_dir_fname_cachep;
> +
> +void __init ext4_init_dir(void)
> +{
> + ext4_dir_fname_cachep = kmem_cache_create_usercopy("ext4_dir_fname",
> + struct_size_t(struct fname, name, EXT4_DIR_FNAME_SHORT_LENGTH + 1),
> + 0, SLAB_RECLAIM_ACCOUNT,
> + offsetof(struct fname, name), EXT4_DIR_FNAME_SHORT_LENGTH + 1,
> + NULL);
> +}
> +
> +void ext4_exit_dir(void)
> +{
> + if (ext4_dir_fname_cachep)
> + kmem_cache_destroy(ext4_dir_fname_cachep);
> +}
> +
> +static struct fname *rb_node_fname_zalloc(__u8 name_len)
> +{
> + struct fname *p;
> + if (ext4_dir_fname_cachep && name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
> + p = kmem_cache_alloc(ext4_dir_fname_cachep, GFP_KERNEL);
> + else
> + p = kmalloc(struct_size(p, name, name_len + 1), GFP_KERNEL);
> + if (p) {
> + /* no need to fill name with zeroes*/
> + memset(p, 0, offsetof(struct fname, name));
> + p->name[name_len] = 0;
> + }
> + return p;
> +}
> +
> +static void rb_node_fname_free(struct fname *p) {
> + if (!p)
> + return;
> + if (ext4_dir_fname_cachep && p->name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
> + kmem_cache_free(ext4_dir_fname_cachep, p);
> + else
> + kfree(p);
> +}
> +
> /*
> * This function implements a non-recursive way of freeing all of the
> * nodes in the red-black tree.
> @@ -436,7 +478,7 @@ static void free_rb_tree_fname(struct rb_root *root)
> while (fname) {
> struct fname *old = fname;
> fname = fname->next;
> - kfree(old);
> + rb_node_fname_free(old);
> }
>
> *root = RB_ROOT;
> @@ -479,8 +521,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
> p = &info->root.rb_node;
>
> /* Create and allocate the fname structure */
> - new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
> - GFP_KERNEL);
> + new_fn = rb_node_fname_zalloc(ent_name->len);
> if (!new_fn)
> return -ENOMEM;
> new_fn->hash = hash;
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5a20e9cd7184..33ab97143000 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3795,6 +3795,10 @@ extern void ext4_orphan_file_block_trigger(
> struct buffer_head *bh,
> void *data, size_t size);
>
> +/* dir.c */
> +extern void __init ext4_init_dir(void);
> +extern void ext4_exit_dir(void);
> +
> /*
> * Add new method to test whether block and inode bitmaps are properly
> * initialized. With uninit_bg reading the block from disk is not enough
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 181934499624..21ce3d78912a 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -7457,6 +7457,8 @@ static int __init ext4_init_fs(void)
> if (err)
> goto out;
>
> + ext4_init_dir();
> +
> return 0;
> out:
> unregister_as_ext2();
> @@ -7497,6 +7499,7 @@ static void __exit ext4_exit_fs(void)
> ext4_exit_post_read_processing();
> ext4_exit_es();
> ext4_exit_pending();
> + ext4_exit_dir();
> }
>
> MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
> --
> 2.39.2
>
Cheers, Andreas
* Re: [RFC] ext4: use kmem_cache for short fname allocation in readdir
2025-05-29 21:44 ` Andreas Dilger
@ 2025-05-30 2:27 ` David Wang
0 siblings, 0 replies; 5+ messages in thread
From: David Wang @ 2025-05-30 2:27 UTC (permalink / raw)
To: Andreas Dilger; +Cc: Theodore Ts'o, Ext4 Developers List, LKML
At 2025-05-30 05:44:26, "Andreas Dilger" <adilger@dilger.ca> wrote:
>On May 29, 2025, at 8:42 AM, David Wang <00107082@163.com> wrote:
>>
>> When searching files, ext4_readdir() kzalloc()s a fname
>> object for each directory entry. It would be faster if a dedicated
>> kmem_cache were used for fname.
>>
>> But fnames are of variable length.
>>
>> This patch suggests using a kmem_cache for fnames of short
>> length, and falling back to kmalloc() when the fname needs a larger
>> buffer, assuming long file names are not very common.
>
>It may be reasonable to have a cache, but I suspect that 127 bytes
>is too large for most common workloads. Statistics that I've seen
>show the 99th-percentile filename length is <= 48 bytes on most filesystems,
>so allocating 128 bytes is probably sub-optimal (most is wasted).
>
>That said, kmalloc() is mostly a wrapper for the kmalloc-* slabs, so
>creating a 128-byte slab for this doesn't seem super useful?
Thanks for the feedback~!
Yes, there would be some memory wasted. I chose 127 just for testing on my
system; it can be adjusted based on empirical data.
kmalloc is globally used, while a dedicated/isolated fname slab is only
accessed by ext4-dir. Adding a new slab can avoid contention between
ext4-dir and other users of kmalloc, especially when the allocation
frequency is high. (When searching a "large" dir, the memory allocation
frequency would be very high; a new slab can reduce the impact on other
users of kmalloc.)
Thanks
David
>
>> Profiling a file search in the kernel code base, with the following
>> command:
>> # perf record -g -e cpu-clock --freq=max bash -c \
>> "for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done"
>> Sample counts are used as an indicator of performance improvement.
>> (The faster the run, the fewer samples collected. The tests are carried
>> out while the system is under no memory pressure.)
>>
>> Before the change:
>> 1232868--ext4_readdir
>> |
>> |--839085--ext4_htree_fill_tree
>> | |
>> | --829223--htree_dirblock_to_tree
>> | |
>> | |--365869--ext4_htree_store_dirent
>> | | |
>> | | |--43169--0xffffffffa7f8d094
>> | | |
>> | | --21947--0xffffffffa7f8d0f7
>> | |
>> | |--213124--ext4fs_dirhash
>> | | |
>> | | --86339--str2hashbuf_signed
>> | |
>> | |--145839--__ext4_read_dirblock
>>
>> and with the change, ~3% fewer samples:
>> 1202922--ext4_readdir
>>
>> |
>> |--805105--ext4_htree_fill_tree
>> | |
>> | --795055--htree_dirblock_to_tree
>> | |
>> | |--328876--ext4_htree_store_dirent
>> | | |
>> | | |--123207--kmem_cache_alloc_noprof
>> | | | |
>> | | | |--26453--__alloc_tagging_slab_alloc_hook
>> | | | |
>> | | | --20413--__slab_alloc.isra.0
>> | | |
>> | | --31566--rb_insert_color
>> | |
>> | |--212915--ext4fs_dirhash
>> | | |
>> | | --86004--str2hashbuf_signed
>> | |
>> | |--149146--__ext4_read_dirblock
>>
>> readdir() itself shows a significant improvement, but the overall
>> improvement for searching files is only ~0.5%; it might be more
>> significant if the system is under some memory pressure.
>>
>> The slab stats after the test:
>> ext4_dir_fname 1242 1242 176 23 1 : tunables 0 0 0 : slabdata 54 54 0
>>
>> Signed-off-by: David Wang <00107082@163.com>
>> ---
>> fs/ext4/dir.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>> fs/ext4/ext4.h | 4 ++++
>> fs/ext4/super.c | 3 +++
>> 3 files changed, 51 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
>> index d4164c507a90..3adfa0d038cd 100644
>> --- a/fs/ext4/dir.c
>> +++ b/fs/ext4/dir.c
>> @@ -424,6 +424,48 @@ struct fname {
>> char name[] __counted_by(name_len);
>> };
>>
>> +#define EXT4_DIR_FNAME_SHORT_LENGTH 127
>> +static struct kmem_cache *ext4_dir_fname_cachep;
>> +
>> +void __init ext4_init_dir(void)
>> +{
>> + ext4_dir_fname_cachep = kmem_cache_create_usercopy("ext4_dir_fname",
>> + struct_size_t(struct fname, name, EXT4_DIR_FNAME_SHORT_LENGTH + 1),
>> + 0, SLAB_RECLAIM_ACCOUNT,
>> + offsetof(struct fname, name), EXT4_DIR_FNAME_SHORT_LENGTH + 1,
>> + NULL);
>> +}
>> +
>> +void ext4_exit_dir(void)
>> +{
>> + if (ext4_dir_fname_cachep)
>> + kmem_cache_destroy(ext4_dir_fname_cachep);
>> +}
>> +
>> +static struct fname *rb_node_fname_zalloc(__u8 name_len)
>> +{
>> + struct fname *p;
>> + if (ext4_dir_fname_cachep && name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
>> + p = kmem_cache_alloc(ext4_dir_fname_cachep, GFP_KERNEL);
>> + else
>> + p = kmalloc(struct_size(p, name, name_len + 1), GFP_KERNEL);
>> + if (p) {
>> + /* no need to fill name with zeroes*/
>> + memset(p, 0, offsetof(struct fname, name));
>> + p->name[name_len] = 0;
>> + }
>> + return p;
>> +}
>> +
>> +static void rb_node_fname_free(struct fname *p) {
>> + if (!p)
>> + return;
>> + if (ext4_dir_fname_cachep && p->name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
>> + kmem_cache_free(ext4_dir_fname_cachep, p);
>> + else
>> + kfree(p);
>> +}
>> +
>> /*
>> * This function implements a non-recursive way of freeing all of the
>> * nodes in the red-black tree.
>> @@ -436,7 +478,7 @@ static void free_rb_tree_fname(struct rb_root *root)
>> while (fname) {
>> struct fname *old = fname;
>> fname = fname->next;
>> - kfree(old);
>> + rb_node_fname_free(old);
>> }
>>
>> *root = RB_ROOT;
>> @@ -479,8 +521,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
>> p = &info->root.rb_node;
>>
>> /* Create and allocate the fname structure */
>> - new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
>> - GFP_KERNEL);
>> + new_fn = rb_node_fname_zalloc(ent_name->len);
>> if (!new_fn)
>> return -ENOMEM;
>> new_fn->hash = hash;
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 5a20e9cd7184..33ab97143000 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3795,6 +3795,10 @@ extern void ext4_orphan_file_block_trigger(
>> struct buffer_head *bh,
>> void *data, size_t size);
>>
>> +/* dir.c */
>> +extern void __init ext4_init_dir(void);
>> +extern void ext4_exit_dir(void);
>> +
>> /*
>> * Add new method to test whether block and inode bitmaps are properly
>> * initialized. With uninit_bg reading the block from disk is not enough
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 181934499624..21ce3d78912a 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -7457,6 +7457,8 @@ static int __init ext4_init_fs(void)
>> if (err)
>> goto out;
>>
>> + ext4_init_dir();
>> +
>> return 0;
>> out:
>> unregister_as_ext2();
>> @@ -7497,6 +7499,7 @@ static void __exit ext4_exit_fs(void)
>> ext4_exit_post_read_processing();
>> ext4_exit_es();
>> ext4_exit_pending();
>> + ext4_exit_dir();
>> }
>>
>> MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
>> --
>> 2.39.2
>>
>
>
>Cheers, Andreas
>
>
>
>
>
* Re: [RFC] ext4: use kmem_cache for short fname allocation in readdir
2025-05-29 14:42 [RFC] ext4: use kmem_cache for short fname allocation in readdir David Wang
2025-05-29 21:44 ` Andreas Dilger
@ 2025-05-30 4:35 ` Theodore Ts'o
2025-05-30 6:12 ` David Wang
1 sibling, 1 reply; 5+ messages in thread
From: Theodore Ts'o @ 2025-05-30 4:35 UTC (permalink / raw)
To: David Wang; +Cc: adilger.kernel, linux-ext4, linux-kernel
On Thu, May 29, 2025 at 10:42:56PM +0800, David Wang wrote:
> When searching files, ext4_readdir() kzalloc()s a fname
> object for each directory entry. It would be faster if a dedicated
> kmem_cache were used for fname.
>
> But fnames are of variable length.
>
> This patch suggests using a kmem_cache for fnames of short
> length, and falling back to kmalloc() when the fname needs a larger
> buffer, assuming long file names are not very common.
>
> Profiling a file search in the kernel code base, with the following
> command:
> # perf record -g -e cpu-clock --freq=max bash -c \
> "for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done"
> Sample counts are used as an indicator of performance improvement.
I would think a better indicator of performance improvement would be
to measure the system time when running the find commands (i.e.,
using either getrusage() with RUSAGE_CHILDREN, or wait3() or wait4()).
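A minimal user-space sketch of that kind of measurement (the find
invocation is just the one from the test above; this helper is only
illustrative, not part of the patch):

#include <stdio.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct rusage ru;
	int status;
	pid_t pid = fork();

	if (pid == 0) {
		/* child: run the same find invocation as in the test */
		execlp("find", "find", "./linux", "-name", "notfoundatall",
		       (char *)NULL);
		_exit(127);
	}
	/* parent: wait4() also reports the child's resource usage */
	if (wait4(pid, &status, 0, &ru) < 0) {
		perror("wait4");
		return 1;
	}
	printf("system time: %ld.%06ld s\n",
	       (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec);
	return 0;
}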
We're trading off some extra memory usage and code complexity for
less CPU time, because entries in the kmem_cache might be more TLB
friendly. But this is only really going to be applicable if the
directory is large enough that the cycles spent in readdir are
significant compared to the rest of the userspace program, *and* you
are reading the directory multiple times (e.g., calling find on a
directory hierarchy many, many times) such that the disk blocks are
cached and you don't need to read them from the storage device.
Otherwise the I/O costs will completely dominate and swamp the
marginal TLB cache savings.
Given that it's really rare for readdir() to be the bottleneck of many
workloads, the question is, is it worth it?
- Ted
* Re: [RFC] ext4: use kmem_cache for short fname allocation in readdir
2025-05-30 4:35 ` Theodore Ts'o
@ 2025-05-30 6:12 ` David Wang
0 siblings, 0 replies; 5+ messages in thread
From: David Wang @ 2025-05-30 6:12 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: adilger.kernel, linux-ext4, linux-kernel
At 2025-05-30 12:35:16, "Theodore Ts'o" <tytso@mit.edu> wrote:
>On Thu, May 29, 2025 at 10:42:56PM +0800, David Wang wrote:
>> When searching files, ext4_readdir() kzalloc()s a fname
>> object for each directory entry. It would be faster if a dedicated
>> kmem_cache were used for fname.
>>
>> But fnames are of variable length.
>>
>> This patch suggests using a kmem_cache for fnames of short
>> length, and falling back to kmalloc() when the fname needs a larger
>> buffer, assuming long file names are not very common.
>>
>> Profiling a file search in the kernel code base, with the following
>> command:
>> # perf record -g -e cpu-clock --freq=max bash -c \
>> "for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done"
>> Sample counts are used as an indicator of performance improvement.
>
>I would think a better indicator of performance improvement would be
>to measure the system time when running the find commands (i.e.,
>using either getrusage() with RUSAGE_CHILDREN, or wait3() or wait4()).
I did use `time` to compare system time when searching files with find,
and I did see a slight improvement.
The standard deviation across runs of the whole `find` process is quite
high, though.
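A rough sketch (assumed, not part of the measurements above) of how the
run could be repeated to report the mean and standard deviation of the
children's system time, using getrusage(RUSAGE_CHILDREN) deltas around
each run of the same find command:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

static double child_stime(void)
{
	struct rusage ru;

	getrusage(RUSAGE_CHILDREN, &ru);
	return ru.ru_stime.tv_sec + ru.ru_stime.tv_usec / 1e6;
}

int main(void)
{
	enum { RUNS = 100 };
	double sum = 0.0, sumsq = 0.0;

	for (int i = 0; i < RUNS; i++) {
		double before = child_stime();
		int ret = system("find ./linux -name notfoundatall > /dev/null");
		double dt = child_stime() - before;

		if (ret != 0)
			fprintf(stderr, "run %d returned %d\n", i, ret);
		sum += dt;
		sumsq += dt * dt;
	}
	printf("sys time: mean %.4fs stddev %.4fs\n", sum / RUNS,
	       sqrt(sumsq / RUNS - (sum / RUNS) * (sum / RUNS)));
	return 0;
}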
>
>We're trading off some extra memory usage and code complexity for
>less CPU time, because entries in the kmem_cache might be more TLB
>friendly. But this is only really going to be applicable if the
>directory is large enough that the cycles spent in readdir are
>significant compared to the rest of the userspace program, *and* you
>are reading the directory multiple times (e.g., calling find on a
>directory hierarchy many, many times) such that the disk blocks are
>cached and you don't need to read them from the storage device.
>Otherwise the I/O costs will completely dominate and swamp the
>marginal TLB cache savings.
Yes, the test was run with the cache hot.
But repeatedly searching for files is not uncommon practice; `find` would
run with the cache hot on every round except the first.
>
>Given that it's really rare for readdir() to be the bottleneck of many
>workloads, the question is, is it worth it?
That's the question I have been thinking about.
Besides the marginal improvement to readdir() itself, I would also point to
the impact on other parts of the system when searching files. Even with a
hot cache, searching a large dir involves a burst of high-frequency
allocations over a short interval. This might have a transient negative
impact on other, lower-frequency users of kmalloc(). But I don't have a
convincing example for this; it's all theoretical.
Thanks
David
>
> - Ted