* [RFC PATCH v5 0/4] erofs: page cache share feature
@ 2025-01-05 15:12 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel
Hi everyone,
The patch in this version has made the following changes compared to
the previous version (patch v4):
- adjusted the code style;
- introduced erofs_pcshr_{read,readahead}_{begin,end}() to switch
between anonymous inodes and real inodes;
- cleanup work for erofs_pcshr_fadvise();
- adjusted some variable names, etc.
The experiments were repeated, and the results were almost consistent with the previous ones.
v4: https://lore.kernel.org/all/20240902110620.2202586-1-hongzhen@linux.alibaba.com/
v3: https://lore.kernel.org/all/20240828111959.3677011-1-hongzhen@linux.alibaba.com/
v2: https://lore.kernel.org/all/20240731080704.678259-1-hongzhen@linux.alibaba.com/
v1: https://lore.kernel.org/all/20240722065355.1396365-1-hongzhen@linux.alibaba.com/
Hongzhen Luo (4):
erofs: move `struct erofs_anon_fs_type` to super.c
erofs: introduce the page cache share feature
erofs: apply the page cache share feature
erofs: introduce .fadvise for page cache share
fs/erofs/Kconfig | 10 +
fs/erofs/Makefile | 1 +
fs/erofs/data.c | 15 +-
fs/erofs/fscache.c | 13 --
fs/erofs/inode.c | 5 +-
fs/erofs/internal.h | 9 +
fs/erofs/pagecache_share.c | 430 +++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 39 ++++
fs/erofs/super.c | 42 ++++
fs/erofs/zdata.c | 10 +-
10 files changed, 556 insertions(+), 18 deletions(-)
create mode 100644 fs/erofs/pagecache_share.c
create mode 100644 fs/erofs/pagecache_share.h
--
2.43.5
^ permalink raw reply [flat|nested] 20+ messages in thread
* [RFC PATCH v5 0/4] erofs: page cache share feature
@ 2025-01-05 15:12 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel, Hongzhen Luo
Hi everyone,
The patch in this version has made the following changes compared to
the previous version (patch v4):
- adjusted the code style;
- introduced erofs_pcshr_{read,readahead}_{begin,end}() to switch
between anonymous inodes and real inodes;
- cleanup work for erofs_pcshr_fadvise();
- adjusted some variable names, etc.
The experiments were repeated, and the results were almost consistent with the previous ones.
v4: https://lore.kernel.org/all/20240902110620.2202586-1-hongzhen@linux.alibaba.com/
v3: https://lore.kernel.org/all/20240828111959.3677011-1-hongzhen@linux.alibaba.com/
v2: https://lore.kernel.org/all/20240731080704.678259-1-hongzhen@linux.alibaba.com/
v1: https://lore.kernel.org/all/20240722065355.1396365-1-hongzhen@linux.alibaba.com/
Hongzhen Luo (4):
erofs: move `struct erofs_anon_fs_type` to super.c
erofs: introduce the page cache share feature
erofs: apply the page cache share feature
erofs: introduce .fadvise for page cache share
fs/erofs/Kconfig | 10 +
fs/erofs/Makefile | 1 +
fs/erofs/data.c | 15 +-
fs/erofs/fscache.c | 13 --
fs/erofs/inode.c | 5 +-
fs/erofs/internal.h | 9 +
fs/erofs/pagecache_share.c | 430 +++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 39 ++++
fs/erofs/super.c | 42 ++++
fs/erofs/zdata.c | 10 +-
10 files changed, 556 insertions(+), 18 deletions(-)
create mode 100644 fs/erofs/pagecache_share.c
create mode 100644 fs/erofs/pagecache_share.h
--
2.43.5
^ permalink raw reply [flat|nested] 20+ messages in thread
* [RFC PATCH v5 1/4] erofs: move `struct erofs_anon_fs_type` to super.c
2025-01-05 15:12 ` Hongzhen Luo
@ 2025-01-05 15:12 ` Hongzhen Luo
-1 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel
Move `struct erofs_anon_fs_type` to super.c and
expose it in preparation for the upcoming page cache share
feature.
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/fscache.c | 13 -------------
fs/erofs/internal.h | 2 ++
fs/erofs/super.c | 13 +++++++++++++
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index ce3d8737df85..ae7bd9ebff38 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -3,7 +3,6 @@
* Copyright (C) 2022, Alibaba Cloud
* Copyright (C) 2022, Bytedance Inc. All rights reserved.
*/
-#include <linux/pseudo_fs.h>
#include <linux/fscache.h>
#include "internal.h"
@@ -13,18 +12,6 @@ static LIST_HEAD(erofs_domain_list);
static LIST_HEAD(erofs_domain_cookies_list);
static struct vfsmount *erofs_pseudo_mnt;
-static int erofs_anon_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type erofs_anon_fs_type = {
- .owner = THIS_MODULE,
- .name = "pseudo_erofs",
- .init_fs_context = erofs_anon_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
struct erofs_fscache_io {
struct netfs_cache_resources cres;
struct iov_iter iter;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 686d835eb533..47004eb89838 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -379,6 +379,8 @@ extern const struct file_operations erofs_dir_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
+extern struct file_system_type erofs_anon_fs_type;
+
/* flags for erofs_fscache_register_cookie() */
#define EROFS_REG_COOKIE_SHARE 0x0001
#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1fc5623c3a4d..25d2c2b44d0a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
#include <linux/fs_parser.h>
#include <linux/exportfs.h>
#include <linux/backing-dev.h>
+#include <linux/pseudo_fs.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -852,6 +853,18 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
+static int erofs_anon_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+}
+
+struct file_system_type erofs_anon_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "pseudo_erofs",
+ .init_fs_context = erofs_anon_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
static int __init erofs_module_init(void)
{
int err;
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 1/4] erofs: move `struct erofs_anon_fs_type` to super.c
@ 2025-01-05 15:12 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel, Hongzhen Luo
Move `struct erofs_anon_fs_type` to super.c and
expose it in preparation for the upcoming page cache share
feature.
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/fscache.c | 13 -------------
fs/erofs/internal.h | 2 ++
fs/erofs/super.c | 13 +++++++++++++
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index ce3d8737df85..ae7bd9ebff38 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -3,7 +3,6 @@
* Copyright (C) 2022, Alibaba Cloud
* Copyright (C) 2022, Bytedance Inc. All rights reserved.
*/
-#include <linux/pseudo_fs.h>
#include <linux/fscache.h>
#include "internal.h"
@@ -13,18 +12,6 @@ static LIST_HEAD(erofs_domain_list);
static LIST_HEAD(erofs_domain_cookies_list);
static struct vfsmount *erofs_pseudo_mnt;
-static int erofs_anon_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type erofs_anon_fs_type = {
- .owner = THIS_MODULE,
- .name = "pseudo_erofs",
- .init_fs_context = erofs_anon_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
struct erofs_fscache_io {
struct netfs_cache_resources cres;
struct iov_iter iter;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 686d835eb533..47004eb89838 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -379,6 +379,8 @@ extern const struct file_operations erofs_dir_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
+extern struct file_system_type erofs_anon_fs_type;
+
/* flags for erofs_fscache_register_cookie() */
#define EROFS_REG_COOKIE_SHARE 0x0001
#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1fc5623c3a4d..25d2c2b44d0a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
#include <linux/fs_parser.h>
#include <linux/exportfs.h>
#include <linux/backing-dev.h>
+#include <linux/pseudo_fs.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -852,6 +853,18 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
+static int erofs_anon_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+}
+
+struct file_system_type erofs_anon_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "pseudo_erofs",
+ .init_fs_context = erofs_anon_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
static int __init erofs_module_init(void)
{
int err;
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 2/4] erofs: introduce the page cache share feature
2025-01-05 15:12 ` Hongzhen Luo
@ 2025-01-05 15:12 ` Hongzhen Luo
-1 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel
Currently, reading files with different paths (or names) but the same
content will consume multiple copies of the page cache, even if the
content of these page caches is the same. For example, reading identical
files (e.g., *.so files) from two different minor versions of container
images will cost multiple copies of the same page cache, since different
containers have different mount points. Therefore, sharing the page cache
for files with the same content can save memory.
This introduces the page cache share feature in erofs. During the mkfs
phase, the file content is hashed and the hash value is stored in the
`trusted.erofs.fingerprint` extended attribute. Inodes of files with the
same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
(indicated by the `ano_inode` field). When a read request occurs, the
anonymous inode serves as a "container" whose page cache is shared. The
actual operations involving the iomap are carried out by the original
inode which is mapped to the anonymous inode.
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/Kconfig | 10 ++
fs/erofs/Makefile | 1 +
fs/erofs/internal.h | 4 +
fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 26 +++++
fs/erofs/super.c | 24 +++-
6 files changed, 292 insertions(+), 1 deletion(-)
create mode 100644 fs/erofs/pagecache_share.c
create mode 100644 fs/erofs/pagecache_share.h
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6ea60661fa55..3aa5f946b5f1 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
at higher priority.
If unsure, say N.
+
+config EROFS_FS_PAGE_CACHE_SHARE
+ bool "EROFS page cache share support"
+ depends on EROFS_FS
+ default n
+ help
+ This permits EROFS to share page cache for files with same
+ fingerprints.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 4331d53c7109..d035c9063ef8 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
+erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
\ No newline at end of file
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 47004eb89838..6c87621d86ba 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -280,6 +280,9 @@ struct erofs_inode {
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ struct inode *ano_inode;
+#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
};
@@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
extern const struct file_operations erofs_file_fops;
extern const struct file_operations erofs_dir_fops;
+extern const struct file_operations erofs_pcshr_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
new file mode 100644
index 000000000000..703fd17c002c
--- /dev/null
+++ b/fs/erofs/pagecache_share.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include <linux/xxhash.h>
+#include <linux/refcount.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include "pagecache_share.h"
+#include "internal.h"
+#include "xattr.h"
+
+#define PCSHR_FPRT_IDX 4
+#define PCSHR_FPRT_NAME "erofs.fingerprint"
+#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
+
+struct erofs_pcshr_counter {
+ struct mutex mutex;
+ struct kref ref;
+ struct vfsmount *mnt;
+};
+
+struct erofs_pcshr_private {
+ char fprt[PCSHR_FPRT_MAXLEN];
+};
+
+static struct erofs_pcshr_counter mnt_counter = {
+ .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
+ .mnt = NULL,
+};
+
+static void erofs_pcshr_counter_release(struct kref *ref)
+{
+ struct erofs_pcshr_counter *counter = container_of(ref,
+ struct erofs_pcshr_counter, ref);
+
+ DBG_BUGON(!counter->mnt);
+ kern_unmount(counter->mnt);
+ counter->mnt = NULL;
+}
+
+int erofs_pcshr_init_mnt(void)
+{
+ int ret;
+ struct vfsmount *tmp;
+
+ mutex_lock(&mnt_counter.mutex);
+ if (!mnt_counter.mnt) {
+ tmp = kern_mount(&erofs_anon_fs_type);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto out;
+ }
+ mnt_counter.mnt = tmp;
+ kref_init(&mnt_counter.ref);
+ } else
+ kref_get(&mnt_counter.ref);
+ ret = 0;
+out:
+ mutex_unlock(&mnt_counter.mutex);
+ return ret;
+}
+
+void erofs_pcshr_free_mnt(void)
+{
+ mutex_lock(&mnt_counter.mutex);
+ kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
+ mutex_unlock(&mnt_counter.mutex);
+}
+
+static int erofs_fprt_eq(struct inode *inode, void *data)
+{
+ struct erofs_pcshr_private *ano_private = inode->i_private;
+
+ return ano_private && memcmp(ano_private->fprt, data,
+ sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
+}
+
+static int erofs_fprt_set(struct inode *inode, void *data)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
+ if (!ano_private)
+ return -ENOMEM;
+ memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+ inode->i_private = ano_private;
+ return 0;
+}
+
+int erofs_pcshr_fill_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ /* | fingerprint length | fingerprint content | */
+ char fprt[PCSHR_FPRT_MAXLEN];
+ struct inode *ano_inode;
+ unsigned long fprt_hash;
+ size_t fprt_len;
+ int ret = -1;
+
+ vi->ano_inode = NULL;
+ memset(fprt, 0, sizeof(fprt));
+ fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
+ fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
+ if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
+ *(size_t *)fprt = fprt_len;
+ fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
+ ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
+ erofs_fprt_eq, erofs_fprt_set, fprt);
+ DBG_BUGON(!ano_inode);
+ vi->ano_inode = ano_inode;
+ if (ano_inode->i_state & I_NEW) {
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ ano_inode->i_mapping->a_ops = &z_erofs_aops;
+ else
+ ano_inode->i_mapping->a_ops = &erofs_aops;
+ ano_inode->i_size = inode->i_size;
+ unlock_new_inode(ano_inode);
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+void erofs_pcshr_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ if (S_ISREG(inode->i_mode) && vi->ano_inode) {
+ iput(vi->ano_inode);
+ vi->ano_inode = NULL;
+ }
+}
+
+static struct file *erofs_pcshr_alloc_file(struct file *file,
+ struct inode *ano_inode)
+{
+ struct file *ano_file;
+
+ ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
+ "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
+ if (IS_ERR(ano_file))
+ return ano_file;
+
+ file_ra_state_init(&ano_file->f_ra, file->f_mapping);
+ ano_file->private_data = EROFS_I(file_inode(file));
+ return ano_file;
+}
+
+static int erofs_pcshr_file_open(struct inode *inode, struct file *file)
+{
+ struct file *ano_file;
+ struct inode *ano_inode;
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ ano_inode = vi->ano_inode;
+ if (!ano_inode)
+ return -EINVAL;
+
+ ano_file = erofs_pcshr_alloc_file(file, ano_inode);
+ if (IS_ERR(ano_file))
+ return PTR_ERR(ano_file);
+
+ ihold(ano_inode);
+ file->private_data = (void *)ano_file;
+ return 0;
+}
+
+static int erofs_pcshr_file_release(struct inode *inode, struct file *file)
+{
+ if (!file->private_data)
+ return -EINVAL;
+
+ fput((struct file *)file->private_data);
+ file->private_data = NULL;
+ return 0;
+}
+
+static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
+ struct file *file, *ano_file;
+ struct kiocb ano_iocb;
+ ssize_t res;
+
+ if (!iov_iter_count(to))
+ return 0;
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return erofs_file_fops.read_iter(iocb, to);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return erofs_file_fops.read_iter(iocb, to);
+
+ memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
+ file = iocb->ki_filp;
+ ano_file = file->private_data;
+ if (!ano_file)
+ return -EINVAL;
+ ano_iocb.ki_filp = ano_file;
+ res = filemap_read(&ano_iocb, to, 0);
+ memcpy(iocb, &ano_iocb, sizeof(struct kiocb));
+ iocb->ki_filp = file;
+ file_accessed(file);
+ return res;
+}
+
+extern const struct vm_operations_struct generic_file_vm_ops;
+
+static int erofs_pcshr_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *ano_file = file->private_data;
+
+ vma_set_file(vma, ano_file);
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+const struct file_operations erofs_pcshr_fops = {
+ .open = erofs_pcshr_file_open,
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_pcshr_file_read_iter,
+ .mmap = erofs_pcshr_mmap,
+ .release = erofs_pcshr_file_release,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .splice_read = filemap_splice_read,
+};
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
new file mode 100644
index 000000000000..f3889d6889e5
--- /dev/null
+++ b/fs/erofs/pagecache_share.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#ifndef __EROFS_PAGECACHE_SHARE_H
+#define __EROFS_PAGECACHE_SHARE_H
+
+#include <linux/fs.h>
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+
+int erofs_pcshr_init_mnt(void);
+void erofs_pcshr_free_mnt(void);
+int erofs_pcshr_fill_inode(struct inode *inode);
+void erofs_pcshr_free_inode(struct inode *inode);
+
+#else
+
+static inline int erofs_pcshr_init_mnt(void) { return 0; }
+static inline void erofs_pcshr_free_mnt(void) {}
+static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
+static inline void erofs_pcshr_free_inode(struct inode *inode) {}
+
+#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+
+#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 25d2c2b44d0a..b4ce07dc931c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -853,9 +853,31 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+static void erofs_free_anon_inode(struct inode *inode)
+{
+ kfree(inode->i_private);
+ inode->i_private = NULL;
+}
+#else
+#define erofs_free_anon_inode NULL
+#endif
+
+static const struct super_operations erofs_anon_sops = {
+ .statfs = simple_statfs,
+ .free_inode = erofs_free_anon_inode,
+};
+
+
static int erofs_anon_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, EROFS_SUPER_MAGIC);
+ if (ctx)
+ ctx->ops = &erofs_anon_sops;
+
+ return ctx ? 0 : -ENOMEM;
}
struct file_system_type erofs_anon_fs_type = {
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 2/4] erofs: introduce the page cache share feature
@ 2025-01-05 15:12 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel, Hongzhen Luo
Currently, reading files with different paths (or names) but the same
content will consume multiple copies of the page cache, even if the
content of these page caches is the same. For example, reading identical
files (e.g., *.so files) from two different minor versions of container
images will cost multiple copies of the same page cache, since different
containers have different mount points. Therefore, sharing the page cache
for files with the same content can save memory.
This introduces the page cache share feature in erofs. During the mkfs
phase, the file content is hashed and the hash value is stored in the
`trusted.erofs.fingerprint` extended attribute. Inodes of files with the
same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
(indicated by the `ano_inode` field). When a read request occurs, the
anonymous inode serves as a "container" whose page cache is shared. The
actual operations involving the iomap are carried out by the original
inode which is mapped to the anonymous inode.
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/Kconfig | 10 ++
fs/erofs/Makefile | 1 +
fs/erofs/internal.h | 4 +
fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 26 +++++
fs/erofs/super.c | 24 +++-
6 files changed, 292 insertions(+), 1 deletion(-)
create mode 100644 fs/erofs/pagecache_share.c
create mode 100644 fs/erofs/pagecache_share.h
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6ea60661fa55..3aa5f946b5f1 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
at higher priority.
If unsure, say N.
+
+config EROFS_FS_PAGE_CACHE_SHARE
+ bool "EROFS page cache share support"
+ depends on EROFS_FS
+ default n
+ help
+ This permits EROFS to share page cache for files with same
+ fingerprints.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 4331d53c7109..d035c9063ef8 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
+erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
\ No newline at end of file
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 47004eb89838..6c87621d86ba 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -280,6 +280,9 @@ struct erofs_inode {
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ struct inode *ano_inode;
+#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
};
@@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
extern const struct file_operations erofs_file_fops;
extern const struct file_operations erofs_dir_fops;
+extern const struct file_operations erofs_pcshr_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
new file mode 100644
index 000000000000..703fd17c002c
--- /dev/null
+++ b/fs/erofs/pagecache_share.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include <linux/xxhash.h>
+#include <linux/refcount.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include "pagecache_share.h"
+#include "internal.h"
+#include "xattr.h"
+
+#define PCSHR_FPRT_IDX 4
+#define PCSHR_FPRT_NAME "erofs.fingerprint"
+#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
+
+struct erofs_pcshr_counter {
+ struct mutex mutex;
+ struct kref ref;
+ struct vfsmount *mnt;
+};
+
+struct erofs_pcshr_private {
+ char fprt[PCSHR_FPRT_MAXLEN];
+};
+
+static struct erofs_pcshr_counter mnt_counter = {
+ .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
+ .mnt = NULL,
+};
+
+static void erofs_pcshr_counter_release(struct kref *ref)
+{
+ struct erofs_pcshr_counter *counter = container_of(ref,
+ struct erofs_pcshr_counter, ref);
+
+ DBG_BUGON(!counter->mnt);
+ kern_unmount(counter->mnt);
+ counter->mnt = NULL;
+}
+
+int erofs_pcshr_init_mnt(void)
+{
+ int ret;
+ struct vfsmount *tmp;
+
+ mutex_lock(&mnt_counter.mutex);
+ if (!mnt_counter.mnt) {
+ tmp = kern_mount(&erofs_anon_fs_type);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto out;
+ }
+ mnt_counter.mnt = tmp;
+ kref_init(&mnt_counter.ref);
+ } else
+ kref_get(&mnt_counter.ref);
+ ret = 0;
+out:
+ mutex_unlock(&mnt_counter.mutex);
+ return ret;
+}
+
+void erofs_pcshr_free_mnt(void)
+{
+ mutex_lock(&mnt_counter.mutex);
+ kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
+ mutex_unlock(&mnt_counter.mutex);
+}
+
+static int erofs_fprt_eq(struct inode *inode, void *data)
+{
+ struct erofs_pcshr_private *ano_private = inode->i_private;
+
+ return ano_private && memcmp(ano_private->fprt, data,
+ sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
+}
+
+static int erofs_fprt_set(struct inode *inode, void *data)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
+ if (!ano_private)
+ return -ENOMEM;
+ memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+ inode->i_private = ano_private;
+ return 0;
+}
+
+int erofs_pcshr_fill_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ /* | fingerprint length | fingerprint content | */
+ char fprt[PCSHR_FPRT_MAXLEN];
+ struct inode *ano_inode;
+ unsigned long fprt_hash;
+ size_t fprt_len;
+ int ret = -1;
+
+ vi->ano_inode = NULL;
+ memset(fprt, 0, sizeof(fprt));
+ fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
+ fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
+ if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
+ *(size_t *)fprt = fprt_len;
+ fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
+ ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
+ erofs_fprt_eq, erofs_fprt_set, fprt);
+ DBG_BUGON(!ano_inode);
+ vi->ano_inode = ano_inode;
+ if (ano_inode->i_state & I_NEW) {
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ ano_inode->i_mapping->a_ops = &z_erofs_aops;
+ else
+ ano_inode->i_mapping->a_ops = &erofs_aops;
+ ano_inode->i_size = inode->i_size;
+ unlock_new_inode(ano_inode);
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+void erofs_pcshr_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ if (S_ISREG(inode->i_mode) && vi->ano_inode) {
+ iput(vi->ano_inode);
+ vi->ano_inode = NULL;
+ }
+}
+
+static struct file *erofs_pcshr_alloc_file(struct file *file,
+ struct inode *ano_inode)
+{
+ struct file *ano_file;
+
+ ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
+ "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
+ if (IS_ERR(ano_file))
+ return ano_file;
+
+ file_ra_state_init(&ano_file->f_ra, file->f_mapping);
+ ano_file->private_data = EROFS_I(file_inode(file));
+ return ano_file;
+}
+
+static int erofs_pcshr_file_open(struct inode *inode, struct file *file)
+{
+ struct file *ano_file;
+ struct inode *ano_inode;
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ ano_inode = vi->ano_inode;
+ if (!ano_inode)
+ return -EINVAL;
+
+ ano_file = erofs_pcshr_alloc_file(file, ano_inode);
+ if (IS_ERR(ano_file))
+ return PTR_ERR(ano_file);
+
+ ihold(ano_inode);
+ file->private_data = (void *)ano_file;
+ return 0;
+}
+
+static int erofs_pcshr_file_release(struct inode *inode, struct file *file)
+{
+ if (!file->private_data)
+ return -EINVAL;
+
+ fput((struct file *)file->private_data);
+ file->private_data = NULL;
+ return 0;
+}
+
+static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
+ struct file *file, *ano_file;
+ struct kiocb ano_iocb;
+ ssize_t res;
+
+ if (!iov_iter_count(to))
+ return 0;
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return erofs_file_fops.read_iter(iocb, to);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return erofs_file_fops.read_iter(iocb, to);
+
+ memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
+ file = iocb->ki_filp;
+ ano_file = file->private_data;
+ if (!ano_file)
+ return -EINVAL;
+ ano_iocb.ki_filp = ano_file;
+ res = filemap_read(&ano_iocb, to, 0);
+ memcpy(iocb, &ano_iocb, sizeof(struct kiocb));
+ iocb->ki_filp = file;
+ file_accessed(file);
+ return res;
+}
+
+extern const struct vm_operations_struct generic_file_vm_ops;
+
+static int erofs_pcshr_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *ano_file = file->private_data;
+
+ vma_set_file(vma, ano_file);
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+const struct file_operations erofs_pcshr_fops = {
+ .open = erofs_pcshr_file_open,
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_pcshr_file_read_iter,
+ .mmap = erofs_pcshr_mmap,
+ .release = erofs_pcshr_file_release,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .splice_read = filemap_splice_read,
+};
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
new file mode 100644
index 000000000000..f3889d6889e5
--- /dev/null
+++ b/fs/erofs/pagecache_share.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#ifndef __EROFS_PAGECACHE_SHARE_H
+#define __EROFS_PAGECACHE_SHARE_H
+
+#include <linux/fs.h>
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+
+int erofs_pcshr_init_mnt(void);
+void erofs_pcshr_free_mnt(void);
+int erofs_pcshr_fill_inode(struct inode *inode);
+void erofs_pcshr_free_inode(struct inode *inode);
+
+#else
+
+static inline int erofs_pcshr_init_mnt(void) { return 0; }
+static inline void erofs_pcshr_free_mnt(void) {}
+static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
+static inline void erofs_pcshr_free_inode(struct inode *inode) {}
+
+#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+
+#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 25d2c2b44d0a..b4ce07dc931c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -853,9 +853,31 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+static void erofs_free_anon_inode(struct inode *inode)
+{
+ kfree(inode->i_private);
+ inode->i_private = NULL;
+}
+#else
+#define erofs_free_anon_inode NULL
+#endif
+
+static const struct super_operations erofs_anon_sops = {
+ .statfs = simple_statfs,
+ .free_inode = erofs_free_anon_inode,
+};
+
+
static int erofs_anon_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, EROFS_SUPER_MAGIC);
+ if (ctx)
+ ctx->ops = &erofs_anon_sops;
+
+ return ctx ? 0 : -ENOMEM;
}
struct file_system_type erofs_anon_fs_type = {
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 3/4] erofs: apply the page cache share feature
2025-01-05 15:12 ` Hongzhen Luo
@ 2025-01-05 15:12 ` Hongzhen Luo
-1 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel
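This patch applies the page cache share feature by hooking it into the
read_folio and readahead paths (for both uncompressed and compressed
data), regular-file inode setup, and mount/unmount handling.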
Below is the memory usage for reading all files in two different minor
versions of container images:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 241 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 163 | 33% |
+-------------------+------------------+-------------+---------------+
| | No | 872 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 630 | 28% |
+-------------------+------------------+-------------+---------------+
| | No | 2771 | - |
| tensorflow +------------------+-------------+---------------+
| 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
+-------------------+------------------+-------------+---------------+
| | No | 926 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 735 | 21% |
+-------------------+------------------+-------------+---------------+
| | No | 390 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 219 | 44% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 924 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 474 | 49% |
+-------------------+------------------+-------------+---------------+
Additionally, the table below shows the runtime memory usage of the
container:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 35 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 28 | 20% |
+-------------------+------------------+-------------+---------------+
| | No | 149 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 95 | 37% |
+-------------------+------------------+-------------+---------------+
| | No | 1028 | - |
| tensorflow +------------------+-------------+---------------+
| 1.11.0 & 2.11.1 | Yes | 930 | 10% |
+-------------------+------------------+-------------+---------------+
| | No | 155 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 132 | 15% |
+-------------------+------------------+-------------+---------------+
| | No | 25 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 20 | 20% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 186 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 98 | 48% |
+-------------------+------------------+-------------+---------------+
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/data.c | 14 +++++++--
fs/erofs/inode.c | 5 ++-
fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 11 +++++++
fs/erofs/super.c | 7 +++++
fs/erofs/zdata.c | 9 ++++--
6 files changed, 104 insertions(+), 5 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0cd6b5c4df98..fb08acbeaab6 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
+#include "pagecache_share.h"
#include <linux/sched/mm.h>
#include <trace/events/erofs.h>
@@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ int ret, pcshr;
+
+ pcshr = erofs_pcshr_read_begin(file, folio);
+ ret = iomap_read_folio(folio, &erofs_iomap_ops);
+ erofs_pcshr_read_end(file, folio, pcshr);
+ return ret;
}
static void erofs_readahead(struct readahead_control *rac)
{
- return iomap_readahead(rac, &erofs_iomap_ops);
+ int pcshr;
+
+ pcshr = erofs_pcshr_readahead_begin(rac);
+ iomap_readahead(rac, &erofs_iomap_ops);
+ erofs_pcshr_readahead_end(rac, pcshr);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d4b89407822a..0b070f4b46b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
+#include "pagecache_share.h"
#include <trace/events/erofs.h>
static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
+ if (erofs_pcshr_fill_inode(inode) == 0)
+ inode->i_fop = &erofs_pcshr_fops;
+ else if (erofs_inode_is_data_compressed(vi->datalayout))
inode->i_fop = &generic_ro_fops;
else
inode->i_fop = &erofs_file_fops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 703fd17c002c..22172b5e21c7 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
struct erofs_pcshr_private {
char fprt[PCSHR_FPRT_MAXLEN];
+ struct mutex mutex;
};
static struct erofs_pcshr_counter mnt_counter = {
@@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
if (!ano_private)
return -ENOMEM;
memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+ mutex_init(&ano_private->mutex);
inode->i_private = ano_private;
return 0;
}
@@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
+
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
+{
+ struct erofs_inode *vi;
+ struct erofs_pcshr_private *ano_private;
+
+ if (!(file && file->private_data))
+ return 0;
+
+ vi = file->private_data;
+ if (vi->ano_inode != file_inode(file))
+ return 0;
+
+ ano_private = vi->ano_inode->i_private;
+ mutex_lock(&ano_private->mutex);
+ folio->mapping->host = &vi->vfs_inode;
+ return 1;
+}
+
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ if (pcshr == 0)
+ return;
+
+ ano_private = file_inode(file)->i_private;
+ folio->mapping->host = file_inode(file);
+ mutex_unlock(&ano_private->mutex);
+}
+
+int erofs_pcshr_readahead_begin(struct readahead_control *rac)
+{
+ struct erofs_inode *vi;
+ struct file *file = rac->file;
+ struct erofs_pcshr_private *ano_private;
+
+ if (!(file && file->private_data))
+ return 0;
+
+ vi = file->private_data;
+ if (vi->ano_inode != file_inode(file))
+ return 0;
+
+ ano_private = file_inode(file)->i_private;
+ mutex_lock(&ano_private->mutex);
+ rac->mapping->host = &vi->vfs_inode;
+ return 1;
+}
+
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ if (pcshr == 0)
+ return;
+
+ ano_private = file_inode(rac->file)->i_private;
+ rac->mapping->host = file_inode(rac->file);
+ mutex_unlock(&ano_private->mutex);
+}
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index f3889d6889e5..abda2a60278b 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
int erofs_pcshr_fill_inode(struct inode *inode);
void erofs_pcshr_free_inode(struct inode *inode);
+/* switch between the anonymous inode and the real inode */
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
+int erofs_pcshr_readahead_begin(struct readahead_control *rac);
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
+
#else
static inline int erofs_pcshr_init_mnt(void) { return 0; }
@@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
static inline void erofs_pcshr_free_inode(struct inode *inode) {}
+static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
+static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
+static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
+static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
+
#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b4ce07dc931c..1b690eb6c1f1 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
#include <linux/backing-dev.h>
#include <linux/pseudo_fs.h>
#include "xattr.h"
+#include "pagecache_share.h"
#define CREATE_TRACE_POINTS
#include <trace/events/erofs.h>
@@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
+ erofs_pcshr_free_inode(inode);
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
@@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ err = erofs_pcshr_init_mnt();
+ if (err)
+ return err;
+
erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
kill_anon_super(sb);
else
kill_block_super(sb);
+ erofs_pcshr_free_mnt();
fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
erofs_sb_free(sbi);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 19ef4ff2a134..fc2ed01eaabe 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -5,6 +5,7 @@
* Copyright (C) 2022 Alibaba Cloud
*/
#include "compress.h"
+#include "pagecache_share.h"
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
@@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- int err;
+ int err, pcshr;
trace_erofs_read_folio(folio, false);
+ pcshr = erofs_pcshr_read_begin(file, folio);
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
@@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
+ erofs_pcshr_read_end(file, folio, pcshr);
return err;
}
@@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct folio *head = NULL, *folio;
unsigned int nr_folios;
- int err;
+ int err, pcshr;
+ pcshr = erofs_pcshr_readahead_begin(rac);
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
@@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_runqueue(&f, nr_folios);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
+ erofs_pcshr_readahead_end(rac, pcshr);
}
const struct address_space_operations z_erofs_aops = {
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 3/4] erofs: apply the page cache share feature
@ 2025-01-05 15:12 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel, Hongzhen Luo
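This patch applies the page cache share feature by hooking it into the
read_folio and readahead paths (for both uncompressed and compressed
data), regular-file inode setup, and mount/unmount handling.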
Below is the memory usage for reading all files in two different minor
versions of container images:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 241 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 163 | 33% |
+-------------------+------------------+-------------+---------------+
| | No | 872 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 630 | 28% |
+-------------------+------------------+-------------+---------------+
| | No | 2771 | - |
| tensorflow +------------------+-------------+---------------+
| 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
+-------------------+------------------+-------------+---------------+
| | No | 926 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 735 | 21% |
+-------------------+------------------+-------------+---------------+
| | No | 390 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 219 | 44% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 924 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 474 | 49% |
+-------------------+------------------+-------------+---------------+
Additionally, the table below shows the runtime memory usage of the
container:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 35 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 28 | 20% |
+-------------------+------------------+-------------+---------------+
| | No | 149 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 95 | 37% |
+-------------------+------------------+-------------+---------------+
| | No | 1028 | - |
| tensorflow +------------------+-------------+---------------+
| 1.11.0 & 2.11.1 | Yes | 930 | 10% |
+-------------------+------------------+-------------+---------------+
| | No | 155 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 132 | 15% |
+-------------------+------------------+-------------+---------------+
| | No | 25 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 20 | 20% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 186 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 98 | 48% |
+-------------------+------------------+-------------+---------------+
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/data.c | 14 +++++++--
fs/erofs/inode.c | 5 ++-
fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 11 +++++++
fs/erofs/super.c | 7 +++++
fs/erofs/zdata.c | 9 ++++--
6 files changed, 104 insertions(+), 5 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0cd6b5c4df98..fb08acbeaab6 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
+#include "pagecache_share.h"
#include <linux/sched/mm.h>
#include <trace/events/erofs.h>
@@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ int ret, pcshr;
+
+ pcshr = erofs_pcshr_read_begin(file, folio);
+ ret = iomap_read_folio(folio, &erofs_iomap_ops);
+ erofs_pcshr_read_end(file, folio, pcshr);
+ return ret;
}
static void erofs_readahead(struct readahead_control *rac)
{
- return iomap_readahead(rac, &erofs_iomap_ops);
+ int pcshr;
+
+ pcshr = erofs_pcshr_readahead_begin(rac);
+ iomap_readahead(rac, &erofs_iomap_ops);
+ erofs_pcshr_readahead_end(rac, pcshr);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d4b89407822a..0b070f4b46b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
+#include "pagecache_share.h"
#include <trace/events/erofs.h>
static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
+ if (erofs_pcshr_fill_inode(inode) == 0)
+ inode->i_fop = &erofs_pcshr_fops;
+ else if (erofs_inode_is_data_compressed(vi->datalayout))
inode->i_fop = &generic_ro_fops;
else
inode->i_fop = &erofs_file_fops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 703fd17c002c..22172b5e21c7 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
struct erofs_pcshr_private {
char fprt[PCSHR_FPRT_MAXLEN];
+ struct mutex mutex;
};
static struct erofs_pcshr_counter mnt_counter = {
@@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
if (!ano_private)
return -ENOMEM;
memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+ mutex_init(&ano_private->mutex);
inode->i_private = ano_private;
return 0;
}
@@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
+
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
+{
+ struct erofs_inode *vi;
+ struct erofs_pcshr_private *ano_private;
+
+ if (!(file && file->private_data))
+ return 0;
+
+ vi = file->private_data;
+ if (vi->ano_inode != file_inode(file))
+ return 0;
+
+ ano_private = vi->ano_inode->i_private;
+ mutex_lock(&ano_private->mutex);
+ folio->mapping->host = &vi->vfs_inode;
+ return 1;
+}
+
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ if (pcshr == 0)
+ return;
+
+ ano_private = file_inode(file)->i_private;
+ folio->mapping->host = file_inode(file);
+ mutex_unlock(&ano_private->mutex);
+}
+
+int erofs_pcshr_readahead_begin(struct readahead_control *rac)
+{
+ struct erofs_inode *vi;
+ struct file *file = rac->file;
+ struct erofs_pcshr_private *ano_private;
+
+ if (!(file && file->private_data))
+ return 0;
+
+ vi = file->private_data;
+ if (vi->ano_inode != file_inode(file))
+ return 0;
+
+ ano_private = file_inode(file)->i_private;
+ mutex_lock(&ano_private->mutex);
+ rac->mapping->host = &vi->vfs_inode;
+ return 1;
+}
+
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ if (pcshr == 0)
+ return;
+
+ ano_private = file_inode(rac->file)->i_private;
+ rac->mapping->host = file_inode(rac->file);
+ mutex_unlock(&ano_private->mutex);
+}
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index f3889d6889e5..abda2a60278b 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
int erofs_pcshr_fill_inode(struct inode *inode);
void erofs_pcshr_free_inode(struct inode *inode);
+/* switch between the anonymous inode and the real inode */
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
+int erofs_pcshr_readahead_begin(struct readahead_control *rac);
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
+
#else
static inline int erofs_pcshr_init_mnt(void) { return 0; }
@@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
static inline void erofs_pcshr_free_inode(struct inode *inode) {}
+static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
+static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
+static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
+static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
+
#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b4ce07dc931c..1b690eb6c1f1 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
#include <linux/backing-dev.h>
#include <linux/pseudo_fs.h>
#include "xattr.h"
+#include "pagecache_share.h"
#define CREATE_TRACE_POINTS
#include <trace/events/erofs.h>
@@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
+ erofs_pcshr_free_inode(inode);
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
@@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ err = erofs_pcshr_init_mnt();
+ if (err)
+ return err;
+
erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
kill_anon_super(sb);
else
kill_block_super(sb);
+ erofs_pcshr_free_mnt();
fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
erofs_sb_free(sbi);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 19ef4ff2a134..fc2ed01eaabe 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -5,6 +5,7 @@
* Copyright (C) 2022 Alibaba Cloud
*/
#include "compress.h"
+#include "pagecache_share.h"
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
@@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- int err;
+ int err, pcshr;
trace_erofs_read_folio(folio, false);
+ pcshr = erofs_pcshr_read_begin(file, folio);
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
@@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
+ erofs_pcshr_read_end(file, folio, pcshr);
return err;
}
@@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct folio *head = NULL, *folio;
unsigned int nr_folios;
- int err;
+ int err, pcshr;
+ pcshr = erofs_pcshr_readahead_begin(rac);
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
@@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_runqueue(&f, nr_folios);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
+ erofs_pcshr_readahead_end(rac, pcshr);
}
const struct address_space_operations z_erofs_aops = {
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
2025-01-05 15:12 ` Hongzhen Luo
@ 2025-01-05 15:12 ` Hongzhen Luo
-1 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel
When .fadvise is used to release a file's page cache (POSIX_FADV_DONTNEED),
only the page cache that was first read through this file is freed. To
achieve this, an interval tree is added to the file's inode to track the
segments that were first read by that inode.
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/data.c | 5 +-
fs/erofs/internal.h | 3 +
fs/erofs/pagecache_share.c | 151 +++++++++++++++++++++++++++++++++++--
fs/erofs/pagecache_share.h | 10 ++-
fs/erofs/zdata.c | 5 +-
5 files changed, 160 insertions(+), 14 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fb08acbeaab6..ebb9a79e5f0e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -382,10 +382,11 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
static void erofs_readahead(struct readahead_control *rac)
{
int pcshr;
+ unsigned long start;
- pcshr = erofs_pcshr_readahead_begin(rac);
+ pcshr = erofs_pcshr_readahead_begin(rac, &start);
iomap_readahead(rac, &erofs_iomap_ops);
- erofs_pcshr_readahead_end(rac, pcshr);
+ erofs_pcshr_readahead_end(rac, pcshr, start);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6c87621d86ba..593c79abfb79 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -282,6 +282,9 @@ struct erofs_inode {
};
#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
struct inode *ano_inode;
+ /* first-read segments */
+ struct rb_root_cached segs;
+ struct mutex segs_mutex;
#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 22172b5e21c7..46b022de5f17 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -6,6 +6,9 @@
#include <linux/refcount.h>
#include <linux/mount.h>
#include <linux/mutex.h>
+#include <uapi/linux/fadvise.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
#include "pagecache_share.h"
#include "internal.h"
#include "xattr.h"
@@ -18,6 +21,8 @@ struct erofs_pcshr_counter {
struct mutex mutex;
struct kref ref;
struct vfsmount *mnt;
+ /* kmem cache for each inode's first-read segments */
+ struct kmem_cache *segsp;
};
struct erofs_pcshr_private {
@@ -38,6 +43,8 @@ static void erofs_pcshr_counter_release(struct kref *ref)
DBG_BUGON(!counter->mnt);
kern_unmount(counter->mnt);
counter->mnt = NULL;
+ kmem_cache_destroy(counter->segsp);
+ counter->segsp = NULL;
}
int erofs_pcshr_init_mnt(void)
@@ -54,6 +61,14 @@ int erofs_pcshr_init_mnt(void)
}
mnt_counter.mnt = tmp;
kref_init(&mnt_counter.ref);
+
+ mnt_counter.segsp = kmem_cache_create("erofs_segs",
+ sizeof(struct interval_tree_node), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL);
+ if (!mnt_counter.segsp) {
+ ret = -ENOMEM;
+ goto out;
+ }
} else
kref_get(&mnt_counter.ref);
ret = 0;
@@ -69,6 +84,16 @@ void erofs_pcshr_free_mnt(void)
mutex_unlock(&mnt_counter.mutex);
}
+static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
+{
+ return kmem_cache_alloc(mnt_counter.segsp, GFP_KERNEL);
+}
+
+static void erofs_pcshr_free_seg(struct interval_tree_node *seg)
+{
+ kmem_cache_free(mnt_counter.segsp, seg);
+}
+
static int erofs_fprt_eq(struct inode *inode, void *data)
{
struct erofs_pcshr_private *ano_private = inode->i_private;
@@ -111,6 +136,8 @@ int erofs_pcshr_fill_inode(struct inode *inode)
erofs_fprt_eq, erofs_fprt_set, fprt);
DBG_BUGON(!ano_inode);
vi->ano_inode = ano_inode;
+ vi->segs = RB_ROOT_CACHED;
+ mutex_init(&vi->segs_mutex);
if (ano_inode->i_state & I_NEW) {
if (erofs_inode_is_data_compressed(vi->datalayout))
ano_inode->i_mapping->a_ops = &z_erofs_aops;
@@ -126,12 +153,20 @@ int erofs_pcshr_fill_inode(struct inode *inode)
void erofs_pcshr_free_inode(struct inode *inode)
{
+ struct interval_tree_node *seg, *next_seg;
struct erofs_inode *vi = EROFS_I(inode);
if (S_ISREG(inode->i_mode) && vi->ano_inode) {
iput(vi->ano_inode);
vi->ano_inode = NULL;
}
+ seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
+ while (seg) {
+ next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
+ interval_tree_remove(seg, &vi->segs);
+ erofs_pcshr_free_seg(seg);
+ seg = next_seg;
+ }
}
static struct file *erofs_pcshr_alloc_file(struct file *file,
@@ -219,6 +254,65 @@ static int erofs_pcshr_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int erofs_pcshr_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ struct erofs_inode *vi = EROFS_I(file_inode(file));
+ struct interval_tree_node *seg, *next_seg, *new_seg;
+ struct file *ano_file = file->private_data;
+ struct erofs_pcshr_private *ano_private;
+ erofs_off_t start, end, l, r;
+ int err = 0;
+
+ if (advice != POSIX_FADV_DONTNEED)
+ return generic_fadvise(ano_file, offset, len, advice);
+
+ ano_private = file_inode(ano_file)->i_private;
+
+ start = offset >> PAGE_SHIFT;
+ /* len = 0 means EOF */
+ end = ((!len ? LLONG_MAX : offset + len) >> PAGE_SHIFT) + 1;
+
+ mutex_lock(&vi->segs_mutex);
+ seg = interval_tree_iter_first(&vi->segs, start, end);
+ while (seg) {
+ next_seg = interval_tree_iter_next(seg, start, end);
+ /*
+ * calculate the overlap between [start, end)
+ * and [seg->start, seg->last)
+ */
+ l = max_t(u64, seg->start | 0ULL, start);
+ r = min_t(u64, seg->last | 0ULL, end);
+ if (l >= r)
+ continue;
+
+ /* a new smaller interval on the left side */
+ if (seg->start < l) {
+ new_seg = erofs_pcshr_alloc_seg();
+ new_seg->start = seg->start;
+ new_seg->last = l;
+ interval_tree_insert(new_seg, &vi->segs);
+ }
+
+ /* a new smaller interval on the right side */
+ if (r < seg->last) {
+ new_seg = erofs_pcshr_alloc_seg();
+ new_seg->start = r;
+ new_seg->last = seg->last;
+ interval_tree_insert(new_seg, &vi->segs);
+ }
+ mutex_lock(&ano_private->mutex);
+ truncate_inode_pages_range(file_inode(ano_file)->i_mapping,
+ l << PAGE_SHIFT,
+ (r - 1) << PAGE_SHIFT);
+ mutex_unlock(&ano_private->mutex);
+ interval_tree_remove(seg, &vi->segs);
+ erofs_pcshr_free_seg(seg);
+ seg = next_seg;
+ }
+ mutex_unlock(&vi->segs_mutex);
+ return err;
+}
+
const struct file_operations erofs_pcshr_fops = {
.open = erofs_pcshr_file_open,
.llseek = generic_file_llseek,
@@ -227,6 +321,7 @@ const struct file_operations erofs_pcshr_fops = {
.release = erofs_pcshr_file_release,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
+ .fadvise = erofs_pcshr_fadvise,
};
int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
@@ -240,9 +335,11 @@ int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
vi = file->private_data;
if (vi->ano_inode != file_inode(file))
return 0;
-
ano_private = vi->ano_inode->i_private;
+
+ mutex_lock(&vi->segs_mutex);
mutex_lock(&ano_private->mutex);
+
folio->mapping->host = &vi->vfs_inode;
return 1;
}
@@ -250,16 +347,36 @@ int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
{
struct erofs_pcshr_private *ano_private;
+ struct interval_tree_node *seg;
+ struct erofs_inode *vi;
if (pcshr == 0)
return;
-
+ vi = file->private_data;
ano_private = file_inode(file)->i_private;
+
+ /* switch host inode */
folio->mapping->host = file_inode(file);
+
+ /* record first-read segment */
+ seg = erofs_pcshr_alloc_seg();
+ if (!seg) {
+ DBG_BUGON(1);
+ goto unlock;
+ }
+ seg->start = folio_index(folio);
+ seg->last = seg->start + (folio_size(folio) >> PAGE_SHIFT);
+ if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
+ seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
+ DBG_BUGON(seg->last < seg->start);
+ interval_tree_insert(seg, &vi->segs);
+unlock:
mutex_unlock(&ano_private->mutex);
+ mutex_unlock(&vi->segs_mutex);
}
-int erofs_pcshr_readahead_begin(struct readahead_control *rac)
+int erofs_pcshr_readahead_begin(struct readahead_control *rac,
+ unsigned long *start)
{
struct erofs_inode *vi;
struct file *file = rac->file;
@@ -271,21 +388,43 @@ int erofs_pcshr_readahead_begin(struct readahead_control *rac)
vi = file->private_data;
if (vi->ano_inode != file_inode(file))
return 0;
-
ano_private = file_inode(file)->i_private;
+
+ mutex_lock(&vi->segs_mutex);
mutex_lock(&ano_private->mutex);
+
rac->mapping->host = &vi->vfs_inode;
+ *start = readahead_pos(rac) >> PAGE_SHIFT;
return 1;
}
-void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr,
+ unsigned long start)
{
struct erofs_pcshr_private *ano_private;
+ struct interval_tree_node *seg;
+ struct erofs_inode *vi;
if (pcshr == 0)
return;
-
+ vi = rac->file->private_data;
ano_private = file_inode(rac->file)->i_private;
+
+ /* switch host inode */
rac->mapping->host = file_inode(rac->file);
+
+ /* record first-read segments */
+ seg = erofs_pcshr_alloc_seg();
+ if (!seg) {
+ DBG_BUGON(1);
+ goto unlock;
+ }
+ seg->start = start;
+ seg->last = readahead_pos(rac) >> PAGE_SHIFT;
+ if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
+ seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
+ interval_tree_insert(seg, &vi->segs);
+unlock:
mutex_unlock(&ano_private->mutex);
+ mutex_unlock(&vi->segs_mutex);
}
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index abda2a60278b..2c4ac7e45227 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -17,8 +17,10 @@ void erofs_pcshr_free_inode(struct inode *inode);
/* switch between the anonymous inode and the real inode */
int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
-int erofs_pcshr_readahead_begin(struct readahead_control *rac);
-void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
+int erofs_pcshr_readahead_begin(struct readahead_control *rac,
+ unsigned long *start);
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr,
+ unsigned long start);
#else
@@ -29,8 +31,8 @@ static inline void erofs_pcshr_free_inode(struct inode *inode) {}
static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
-static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
-static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
+static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac, unsigned long *start) { return 0; }
+static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr, unsigned long start) {}
#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fc2ed01eaabe..f646ec70cd7a 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1921,9 +1921,10 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct folio *head = NULL, *folio;
unsigned int nr_folios;
+ unsigned long start;
int err, pcshr;
- pcshr = erofs_pcshr_readahead_begin(rac);
+ pcshr = erofs_pcshr_readahead_begin(rac, &start);
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
@@ -1951,7 +1952,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_runqueue(&f, nr_folios);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
- erofs_pcshr_readahead_end(rac, pcshr);
+ erofs_pcshr_readahead_end(rac, pcshr, start);
}
const struct address_space_operations z_erofs_aops = {
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
@ 2025-01-05 15:12 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-05 15:12 UTC (permalink / raw)
To: linux-erofs; +Cc: linux-kernel, Hongzhen Luo
When .fadvise is used to release a file's page cache, it frees
those cached pages that were first read by this file. To achieve
this, an interval tree is added to the inode of that file to track
the segments first read by that inode.
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/data.c | 5 +-
fs/erofs/internal.h | 3 +
fs/erofs/pagecache_share.c | 151 +++++++++++++++++++++++++++++++++++--
fs/erofs/pagecache_share.h | 10 ++-
fs/erofs/zdata.c | 5 +-
5 files changed, 160 insertions(+), 14 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fb08acbeaab6..ebb9a79e5f0e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -382,10 +382,11 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
static void erofs_readahead(struct readahead_control *rac)
{
int pcshr;
+ unsigned long start;
- pcshr = erofs_pcshr_readahead_begin(rac);
+ pcshr = erofs_pcshr_readahead_begin(rac, &start);
iomap_readahead(rac, &erofs_iomap_ops);
- erofs_pcshr_readahead_end(rac, pcshr);
+ erofs_pcshr_readahead_end(rac, pcshr, start);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6c87621d86ba..593c79abfb79 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -282,6 +282,9 @@ struct erofs_inode {
};
#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
struct inode *ano_inode;
+ /* first-read segments */
+ struct rb_root_cached segs;
+ struct mutex segs_mutex;
#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 22172b5e21c7..46b022de5f17 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -6,6 +6,9 @@
#include <linux/refcount.h>
#include <linux/mount.h>
#include <linux/mutex.h>
+#include <uapi/linux/fadvise.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
#include "pagecache_share.h"
#include "internal.h"
#include "xattr.h"
@@ -18,6 +21,8 @@ struct erofs_pcshr_counter {
struct mutex mutex;
struct kref ref;
struct vfsmount *mnt;
+ /* kmem cache for each inode's first-read segments */
+ struct kmem_cache *segsp;
};
struct erofs_pcshr_private {
@@ -38,6 +43,8 @@ static void erofs_pcshr_counter_release(struct kref *ref)
DBG_BUGON(!counter->mnt);
kern_unmount(counter->mnt);
counter->mnt = NULL;
+ kmem_cache_destroy(counter->segsp);
+ counter->segsp = NULL;
}
int erofs_pcshr_init_mnt(void)
@@ -54,6 +61,14 @@ int erofs_pcshr_init_mnt(void)
}
mnt_counter.mnt = tmp;
kref_init(&mnt_counter.ref);
+
+ mnt_counter.segsp = kmem_cache_create("erofs_segs",
+ sizeof(struct interval_tree_node), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL);
+ if (!mnt_counter.segsp) {
+ ret = -ENOMEM;
+ goto out;
+ }
} else
kref_get(&mnt_counter.ref);
ret = 0;
@@ -69,6 +84,16 @@ void erofs_pcshr_free_mnt(void)
mutex_unlock(&mnt_counter.mutex);
}
+static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
+{
+ return kmem_cache_alloc(mnt_counter.segsp, GFP_KERNEL);
+}
+
+static void erofs_pcshr_free_seg(struct interval_tree_node *seg)
+{
+ kmem_cache_free(mnt_counter.segsp, seg);
+}
+
static int erofs_fprt_eq(struct inode *inode, void *data)
{
struct erofs_pcshr_private *ano_private = inode->i_private;
@@ -111,6 +136,8 @@ int erofs_pcshr_fill_inode(struct inode *inode)
erofs_fprt_eq, erofs_fprt_set, fprt);
DBG_BUGON(!ano_inode);
vi->ano_inode = ano_inode;
+ vi->segs = RB_ROOT_CACHED;
+ mutex_init(&vi->segs_mutex);
if (ano_inode->i_state & I_NEW) {
if (erofs_inode_is_data_compressed(vi->datalayout))
ano_inode->i_mapping->a_ops = &z_erofs_aops;
@@ -126,12 +153,20 @@ int erofs_pcshr_fill_inode(struct inode *inode)
void erofs_pcshr_free_inode(struct inode *inode)
{
+ struct interval_tree_node *seg, *next_seg;
struct erofs_inode *vi = EROFS_I(inode);
if (S_ISREG(inode->i_mode) && vi->ano_inode) {
iput(vi->ano_inode);
vi->ano_inode = NULL;
}
+ seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
+ while (seg) {
+ next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
+ interval_tree_remove(seg, &vi->segs);
+ erofs_pcshr_free_seg(seg);
+ seg = next_seg;
+ }
}
static struct file *erofs_pcshr_alloc_file(struct file *file,
@@ -219,6 +254,65 @@ static int erofs_pcshr_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int erofs_pcshr_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ struct erofs_inode *vi = EROFS_I(file_inode(file));
+ struct interval_tree_node *seg, *next_seg, *new_seg;
+ struct file *ano_file = file->private_data;
+ struct erofs_pcshr_private *ano_private;
+ erofs_off_t start, end, l, r;
+ int err = 0;
+
+ if (advice != POSIX_FADV_DONTNEED)
+ return generic_fadvise(ano_file, offset, len, advice);
+
+ ano_private = file_inode(ano_file)->i_private;
+
+ start = offset >> PAGE_SHIFT;
+ /* len = 0 means EOF */
+ end = ((!len ? LLONG_MAX : offset + len) >> PAGE_SHIFT) + 1;
+
+ mutex_lock(&vi->segs_mutex);
+ seg = interval_tree_iter_first(&vi->segs, start, end);
+ while (seg) {
+ next_seg = interval_tree_iter_next(seg, start, end);
+ /*
+ * calculate the overlap between [start, end)
+ * and [seg->start, seg->last)
+ */
+ l = max_t(u64, seg->start | 0ULL, start);
+ r = min_t(u64, seg->last | 0ULL, end);
+ if (l >= r)
+ continue;
+
+ /* a new smaller interval on the left side */
+ if (seg->start < l) {
+ new_seg = erofs_pcshr_alloc_seg();
+ new_seg->start = seg->start;
+ new_seg->last = l;
+ interval_tree_insert(new_seg, &vi->segs);
+ }
+
+ /* a new smaller interval on the right side */
+ if (r < seg->last) {
+ new_seg = erofs_pcshr_alloc_seg();
+ new_seg->start = r;
+ new_seg->last = seg->last;
+ interval_tree_insert(new_seg, &vi->segs);
+ }
+ mutex_lock(&ano_private->mutex);
+ truncate_inode_pages_range(file_inode(ano_file)->i_mapping,
+ l << PAGE_SHIFT,
+ (r - 1) << PAGE_SHIFT);
+ mutex_unlock(&ano_private->mutex);
+ interval_tree_remove(seg, &vi->segs);
+ erofs_pcshr_free_seg(seg);
+ seg = next_seg;
+ }
+ mutex_unlock(&vi->segs_mutex);
+ return err;
+}
+
const struct file_operations erofs_pcshr_fops = {
.open = erofs_pcshr_file_open,
.llseek = generic_file_llseek,
@@ -227,6 +321,7 @@ const struct file_operations erofs_pcshr_fops = {
.release = erofs_pcshr_file_release,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
+ .fadvise = erofs_pcshr_fadvise,
};
int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
@@ -240,9 +335,11 @@ int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
vi = file->private_data;
if (vi->ano_inode != file_inode(file))
return 0;
-
ano_private = vi->ano_inode->i_private;
+
+ mutex_lock(&vi->segs_mutex);
mutex_lock(&ano_private->mutex);
+
folio->mapping->host = &vi->vfs_inode;
return 1;
}
@@ -250,16 +347,36 @@ int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
{
struct erofs_pcshr_private *ano_private;
+ struct interval_tree_node *seg;
+ struct erofs_inode *vi;
if (pcshr == 0)
return;
-
+ vi = file->private_data;
ano_private = file_inode(file)->i_private;
+
+ /* switch host inode */
folio->mapping->host = file_inode(file);
+
+ /* record first-read segment */
+ seg = erofs_pcshr_alloc_seg();
+ if (!seg) {
+ DBG_BUGON(1);
+ goto unlock;
+ }
+ seg->start = folio_index(folio);
+ seg->last = seg->start + (folio_size(folio) >> PAGE_SHIFT);
+ if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
+ seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
+ DBG_BUGON(seg->last < seg->start);
+ interval_tree_insert(seg, &vi->segs);
+unlock:
mutex_unlock(&ano_private->mutex);
+ mutex_unlock(&vi->segs_mutex);
}
-int erofs_pcshr_readahead_begin(struct readahead_control *rac)
+int erofs_pcshr_readahead_begin(struct readahead_control *rac,
+ unsigned long *start)
{
struct erofs_inode *vi;
struct file *file = rac->file;
@@ -271,21 +388,43 @@ int erofs_pcshr_readahead_begin(struct readahead_control *rac)
vi = file->private_data;
if (vi->ano_inode != file_inode(file))
return 0;
-
ano_private = file_inode(file)->i_private;
+
+ mutex_lock(&vi->segs_mutex);
mutex_lock(&ano_private->mutex);
+
rac->mapping->host = &vi->vfs_inode;
+ *start = readahead_pos(rac) >> PAGE_SHIFT;
return 1;
}
-void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr,
+ unsigned long start)
{
struct erofs_pcshr_private *ano_private;
+ struct interval_tree_node *seg;
+ struct erofs_inode *vi;
if (pcshr == 0)
return;
-
+ vi = rac->file->private_data;
ano_private = file_inode(rac->file)->i_private;
+
+ /* switch host inode */
rac->mapping->host = file_inode(rac->file);
+
+ /* record first-read segments */
+ seg = erofs_pcshr_alloc_seg();
+ if (!seg) {
+ DBG_BUGON(1);
+ goto unlock;
+ }
+ seg->start = start;
+ seg->last = readahead_pos(rac) >> PAGE_SHIFT;
+ if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
+ seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
+ interval_tree_insert(seg, &vi->segs);
+unlock:
mutex_unlock(&ano_private->mutex);
+ mutex_unlock(&vi->segs_mutex);
}
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index abda2a60278b..2c4ac7e45227 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -17,8 +17,10 @@ void erofs_pcshr_free_inode(struct inode *inode);
/* switch between the anonymous inode and the real inode */
int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
-int erofs_pcshr_readahead_begin(struct readahead_control *rac);
-void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
+int erofs_pcshr_readahead_begin(struct readahead_control *rac,
+ unsigned long *start);
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr,
+ unsigned long start);
#else
@@ -29,8 +31,8 @@ static inline void erofs_pcshr_free_inode(struct inode *inode) {}
static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
-static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
-static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
+static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac, unsigned long *start) { return 0; }
+static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr, unsigned long start) {}
#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fc2ed01eaabe..f646ec70cd7a 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1921,9 +1921,10 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct folio *head = NULL, *folio;
unsigned int nr_folios;
+ unsigned long start;
int err, pcshr;
- pcshr = erofs_pcshr_readahead_begin(rac);
+ pcshr = erofs_pcshr_readahead_begin(rac, &start);
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
@@ -1951,7 +1952,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_runqueue(&f, nr_folios);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
- erofs_pcshr_readahead_end(rac, pcshr);
+ erofs_pcshr_readahead_end(rac, pcshr, start);
}
const struct address_space_operations z_erofs_aops = {
--
2.43.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
2025-01-05 15:12 ` Hongzhen Luo
(?)
@ 2025-01-05 18:52 ` kernel test robot
-1 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-01-05 18:52 UTC (permalink / raw)
To: Hongzhen Luo; +Cc: oe-kbuild-all
Hi Hongzhen,
[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:
[auto build test ERROR on xiang-erofs/dev-test]
[also build test ERROR on xiang-erofs/dev xiang-erofs/fixes linus/master v6.13-rc5 next-20241220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hongzhen-Luo/erofs-move-struct-erofs_anon_fs_type-to-super-c/20250105-231438
base: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link: https://lore.kernel.org/r/20250105151208.3797385-5-hongzhen%40linux.alibaba.com
patch subject: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
config: loongarch-randconfig-001-20250106 (https://download.01.org/0day-ci/archive/20250106/202501060221.LPQZaeA4-lkp@intel.com/config)
compiler: loongarch64-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250106/202501060221.LPQZaeA4-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202501060221.LPQZaeA4-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from fs/erofs/pagecache_share.c:10:
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_init_mnt':
>> fs/erofs/pagecache_share.c:66:32: error: invalid application of 'sizeof' to incomplete type 'struct interval_tree_node'
66 | sizeof(struct interval_tree_node), 0,
| ^~~~~~
include/linux/slab.h:430:55: note: in definition of macro 'kmem_cache_create'
430 | default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__)
| ^~~~~~~~~~~~~
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_free_inode':
>> fs/erofs/pagecache_share.c:163:15: error: implicit declaration of function 'interval_tree_iter_first'; did you mean 'vma_interval_tree_iter_first'? [-Wimplicit-function-declaration]
163 | seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
| ^~~~~~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_iter_first
>> fs/erofs/pagecache_share.c:163:13: error: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
163 | seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
| ^
>> fs/erofs/pagecache_share.c:165:28: error: implicit declaration of function 'interval_tree_iter_next'; did you mean 'vma_interval_tree_iter_next'? [-Wimplicit-function-declaration]
165 | next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
| ^~~~~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_iter_next
fs/erofs/pagecache_share.c:165:26: error: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
165 | next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
| ^
>> fs/erofs/pagecache_share.c:166:17: error: implicit declaration of function 'interval_tree_remove'; did you mean 'vma_interval_tree_remove'? [-Wimplicit-function-declaration]
166 | interval_tree_remove(seg, &vi->segs);
| ^~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_remove
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_fadvise':
fs/erofs/pagecache_share.c:276:13: error: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
276 | seg = interval_tree_iter_first(&vi->segs, start, end);
| ^
fs/erofs/pagecache_share.c:278:26: error: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
278 | next_seg = interval_tree_iter_next(seg, start, end);
| ^
In file included from include/linux/kernel.h:28,
from include/linux/cpumask.h:11,
from arch/loongarch/include/asm/processor.h:9,
from arch/loongarch/include/asm/thread_info.h:15,
from include/linux/thread_info.h:60,
from include/asm-generic/current.h:6,
from ./arch/loongarch/include/generated/asm/current.h:1,
from include/linux/mutex.h:14,
from fs/erofs/pagecache_share.c:8:
>> fs/erofs/pagecache_share.c:283:35: error: invalid use of undefined type 'struct interval_tree_node'
283 | l = max_t(u64, seg->start | 0ULL, start);
| ^~
include/linux/minmax.h:93:23: note: in definition of macro '__cmp_once_unique'
93 | ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
| ^
include/linux/minmax.h:221:27: note: in expansion of macro '__cmp_once'
221 | #define max_t(type, x, y) __cmp_once(max, type, x, y)
| ^~~~~~~~~~
fs/erofs/pagecache_share.c:283:21: note: in expansion of macro 'max_t'
283 | l = max_t(u64, seg->start | 0ULL, start);
| ^~~~~
fs/erofs/pagecache_share.c:284:35: error: invalid use of undefined type 'struct interval_tree_node'
284 | r = min_t(u64, seg->last | 0ULL, end);
| ^~
include/linux/minmax.h:93:23: note: in definition of macro '__cmp_once_unique'
93 | ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
| ^
include/linux/minmax.h:213:27: note: in expansion of macro '__cmp_once'
213 | #define min_t(type, x, y) __cmp_once(min, type, x, y)
| ^~~~~~~~~~
fs/erofs/pagecache_share.c:284:21: note: in expansion of macro 'min_t'
284 | r = min_t(u64, seg->last | 0ULL, end);
| ^~~~~
fs/erofs/pagecache_share.c:289:24: error: invalid use of undefined type 'struct interval_tree_node'
289 | if (seg->start < l) {
| ^~
fs/erofs/pagecache_share.c:291:32: error: invalid use of undefined type 'struct interval_tree_node'
291 | new_seg->start = seg->start;
| ^~
fs/erofs/pagecache_share.c:291:45: error: invalid use of undefined type 'struct interval_tree_node'
291 | new_seg->start = seg->start;
| ^~
fs/erofs/pagecache_share.c:292:32: error: invalid use of undefined type 'struct interval_tree_node'
292 | new_seg->last = l;
| ^~
>> fs/erofs/pagecache_share.c:293:25: error: implicit declaration of function 'interval_tree_insert'; did you mean 'vma_interval_tree_insert'? [-Wimplicit-function-declaration]
293 | interval_tree_insert(new_seg, &vi->segs);
| ^~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_insert
fs/erofs/pagecache_share.c:297:28: error: invalid use of undefined type 'struct interval_tree_node'
297 | if (r < seg->last) {
| ^~
fs/erofs/pagecache_share.c:299:32: error: invalid use of undefined type 'struct interval_tree_node'
299 | new_seg->start = r;
| ^~
fs/erofs/pagecache_share.c:300:32: error: invalid use of undefined type 'struct interval_tree_node'
300 | new_seg->last = seg->last;
| ^~
fs/erofs/pagecache_share.c:300:44: error: invalid use of undefined type 'struct interval_tree_node'
300 | new_seg->last = seg->last;
| ^~
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_read_end':
fs/erofs/pagecache_share.c:367:12: error: invalid use of undefined type 'struct interval_tree_node'
367 | seg->start = folio_index(folio);
| ^~
fs/erofs/pagecache_share.c:368:12: error: invalid use of undefined type 'struct interval_tree_node'
368 | seg->last = seg->start + (folio_size(folio) >> PAGE_SHIFT);
| ^~
fs/erofs/pagecache_share.c:368:24: error: invalid use of undefined type 'struct interval_tree_node'
368 | seg->last = seg->start + (folio_size(folio) >> PAGE_SHIFT);
| ^~
fs/erofs/pagecache_share.c:369:16: error: invalid use of undefined type 'struct interval_tree_node'
369 | if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
| ^~
fs/erofs/pagecache_share.c:370:20: error: invalid use of undefined type 'struct interval_tree_node'
370 | seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
| ^~
In file included from fs/erofs/pagecache_share.c:13:
fs/erofs/pagecache_share.c:371:22: error: invalid use of undefined type 'struct interval_tree_node'
371 | DBG_BUGON(seg->last < seg->start);
| ^~
fs/erofs/internal.h:32:41: note: in definition of macro 'DBG_BUGON'
32 | #define DBG_BUGON(x) ((void)(x))
| ^
fs/erofs/pagecache_share.c:371:34: error: invalid use of undefined type 'struct interval_tree_node'
371 | DBG_BUGON(seg->last < seg->start);
| ^~
fs/erofs/internal.h:32:41: note: in definition of macro 'DBG_BUGON'
32 | #define DBG_BUGON(x) ((void)(x))
| ^
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_readahead_end':
fs/erofs/pagecache_share.c:422:12: error: invalid use of undefined type 'struct interval_tree_node'
422 | seg->start = start;
| ^~
fs/erofs/pagecache_share.c:423:12: error: invalid use of undefined type 'struct interval_tree_node'
423 | seg->last = readahead_pos(rac) >> PAGE_SHIFT;
| ^~
fs/erofs/pagecache_share.c:424:16: error: invalid use of undefined type 'struct interval_tree_node'
424 | if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
| ^~
fs/erofs/pagecache_share.c:425:20: error: invalid use of undefined type 'struct interval_tree_node'
425 | seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
| ^~
vim +66 fs/erofs/pagecache_share.c
> 8 #include <linux/mutex.h>
9 #include <uapi/linux/fadvise.h>
> 10 #include <linux/slab.h>
11 #include <linux/pagemap.h>
12 #include "pagecache_share.h"
13 #include "internal.h"
14 #include "xattr.h"
15
16 #define PCSHR_FPRT_IDX 4
17 #define PCSHR_FPRT_NAME "erofs.fingerprint"
18 #define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
19
20 struct erofs_pcshr_counter {
21 struct mutex mutex;
22 struct kref ref;
23 struct vfsmount *mnt;
24 /* kmem cache for each inode's first-read segments */
25 struct kmem_cache *segsp;
26 };
27
28 struct erofs_pcshr_private {
29 char fprt[PCSHR_FPRT_MAXLEN];
30 struct mutex mutex;
31 };
32
33 static struct erofs_pcshr_counter mnt_counter = {
34 .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
35 .mnt = NULL,
36 };
37
38 static void erofs_pcshr_counter_release(struct kref *ref)
39 {
40 struct erofs_pcshr_counter *counter = container_of(ref,
41 struct erofs_pcshr_counter, ref);
42
43 DBG_BUGON(!counter->mnt);
44 kern_unmount(counter->mnt);
45 counter->mnt = NULL;
46 kmem_cache_destroy(counter->segsp);
47 counter->segsp = NULL;
48 }
49
50 int erofs_pcshr_init_mnt(void)
51 {
52 int ret;
53 struct vfsmount *tmp;
54
55 mutex_lock(&mnt_counter.mutex);
56 if (!mnt_counter.mnt) {
57 tmp = kern_mount(&erofs_anon_fs_type);
58 if (IS_ERR(tmp)) {
59 ret = PTR_ERR(tmp);
60 goto out;
61 }
62 mnt_counter.mnt = tmp;
63 kref_init(&mnt_counter.ref);
64
65 mnt_counter.segsp = kmem_cache_create("erofs_segs",
> 66 sizeof(struct interval_tree_node), 0,
67 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL);
68 if (!mnt_counter.segsp) {
69 ret = -ENOMEM;
70 goto out;
71 }
72 } else
73 kref_get(&mnt_counter.ref);
74 ret = 0;
75 out:
76 mutex_unlock(&mnt_counter.mutex);
77 return ret;
78 }
79
80 void erofs_pcshr_free_mnt(void)
81 {
82 mutex_lock(&mnt_counter.mutex);
83 kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
84 mutex_unlock(&mnt_counter.mutex);
85 }
86
87 static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
88 {
89 return kmem_cache_alloc(mnt_counter.segsp, GFP_KERNEL);
90 }
91
92 static void erofs_pcshr_free_seg(struct interval_tree_node *seg)
93 {
94 kmem_cache_free(mnt_counter.segsp, seg);
95 }
96
97 static int erofs_fprt_eq(struct inode *inode, void *data)
98 {
99 struct erofs_pcshr_private *ano_private = inode->i_private;
100
101 return ano_private && memcmp(ano_private->fprt, data,
102 sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
103 }
104
105 static int erofs_fprt_set(struct inode *inode, void *data)
106 {
107 struct erofs_pcshr_private *ano_private;
108
109 ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
110 if (!ano_private)
111 return -ENOMEM;
112 memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
113 mutex_init(&ano_private->mutex);
114 inode->i_private = ano_private;
115 return 0;
116 }
117
118 int erofs_pcshr_fill_inode(struct inode *inode)
119 {
120 struct erofs_inode *vi = EROFS_I(inode);
121 /* | fingerprint length | fingerprint content | */
122 char fprt[PCSHR_FPRT_MAXLEN];
123 struct inode *ano_inode;
124 unsigned long fprt_hash;
125 size_t fprt_len;
126 int ret = -1;
127
128 vi->ano_inode = NULL;
129 memset(fprt, 0, sizeof(fprt));
130 fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
131 fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
132 if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
133 *(size_t *)fprt = fprt_len;
134 fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
135 ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
136 erofs_fprt_eq, erofs_fprt_set, fprt);
137 DBG_BUGON(!ano_inode);
138 vi->ano_inode = ano_inode;
139 vi->segs = RB_ROOT_CACHED;
140 mutex_init(&vi->segs_mutex);
141 if (ano_inode->i_state & I_NEW) {
142 if (erofs_inode_is_data_compressed(vi->datalayout))
143 ano_inode->i_mapping->a_ops = &z_erofs_aops;
144 else
145 ano_inode->i_mapping->a_ops = &erofs_aops;
146 ano_inode->i_size = inode->i_size;
147 unlock_new_inode(ano_inode);
148 }
149 ret = 0;
150 }
151 return ret;
152 }
153
154 void erofs_pcshr_free_inode(struct inode *inode)
155 {
156 struct interval_tree_node *seg, *next_seg;
157 struct erofs_inode *vi = EROFS_I(inode);
158
159 if (S_ISREG(inode->i_mode) && vi->ano_inode) {
160 iput(vi->ano_inode);
161 vi->ano_inode = NULL;
162 }
> 163 seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
164 while (seg) {
> 165 next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
> 166 interval_tree_remove(seg, &vi->segs);
167 erofs_pcshr_free_seg(seg);
168 seg = next_seg;
169 }
170 }
171
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 3/4] erofs: apply the page cache share feature
2025-01-05 15:12 ` Hongzhen Luo
(?)
@ 2025-01-06 2:15 ` Gao Xiang
-1 siblings, 0 replies; 20+ messages in thread
From: Gao Xiang @ 2025-01-06 2:15 UTC (permalink / raw)
To: Hongzhen Luo, linux-erofs; +Cc: linux-kernel
On 2025/1/5 23:12, Hongzhen Luo wrote:
...
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0cd6b5c4df98..fb08acbeaab6 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "internal.h"
> +#include "pagecache_share.h"
> #include <linux/sched/mm.h>
> #include <trace/events/erofs.h>
>
> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> */
> static int erofs_read_folio(struct file *file, struct folio *folio)
> {
> - return iomap_read_folio(folio, &erofs_iomap_ops);
> + int ret, pcshr;
> +
> + pcshr = erofs_pcshr_read_begin(file, folio);
> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
> + erofs_pcshr_read_end(file, folio, pcshr);
> + return ret;
> }
>
> static void erofs_readahead(struct readahead_control *rac)
> {
> - return iomap_readahead(rac, &erofs_iomap_ops);
> + int pcshr;
> +
> + pcshr = erofs_pcshr_readahead_begin(rac);
> + iomap_readahead(rac, &erofs_iomap_ops);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index d4b89407822a..0b070f4b46b8 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "xattr.h"
> +#include "pagecache_share.h"
> #include <trace/events/erofs.h>
>
> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
> switch (inode->i_mode & S_IFMT) {
> case S_IFREG:
> inode->i_op = &erofs_generic_iops;
> - if (erofs_inode_is_data_compressed(vi->datalayout))
> + if (erofs_pcshr_fill_inode(inode) == 0)
> + inode->i_fop = &erofs_pcshr_fops;
> + else if (erofs_inode_is_data_compressed(vi->datalayout))
> inode->i_fop = &generic_ro_fops;
> else
> inode->i_fop = &erofs_file_fops;
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> index 703fd17c002c..22172b5e21c7 100644
> --- a/fs/erofs/pagecache_share.c
> +++ b/fs/erofs/pagecache_share.c
> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>
> struct erofs_pcshr_private {
> char fprt[PCSHR_FPRT_MAXLEN];
> + struct mutex mutex;
> };
>
> static struct erofs_pcshr_counter mnt_counter = {
> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
> if (!ano_private)
> return -ENOMEM;
> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + mutex_init(&ano_private->mutex);
> inode->i_private = ano_private;
> return 0;
> }
> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
> .get_unmapped_area = thp_get_unmapped_area,
> .splice_read = filemap_splice_read,
> };
> +
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
> +{
> + struct erofs_inode *vi;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = vi->ano_inode->i_private;
> + mutex_lock(&ano_private->mutex);
> + folio->mapping->host = &vi->vfs_inode;
you shouldn't change `folio->mapping->host` directly.
> + return 1;
> +}
> +
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(file)->i_private;
> + folio->mapping->host = file_inode(file);
you shouldn't change `folio->mapping->host` directly
and then switch back. It's too hacky.
> + mutex_unlock(&ano_private->mutex);
> +}
> +
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
> +{
> + struct erofs_inode *vi;
> + struct file *file = rac->file;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = file_inode(file)->i_private;
> + mutex_lock(&ano_private->mutex);
> + rac->mapping->host = &vi->vfs_inode;
Same here.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature
2025-01-05 15:12 ` Hongzhen Luo
(?)
@ 2025-01-06 2:27 ` Gao Xiang
2025-01-06 3:03 ` Hongzhen Luo
-1 siblings, 1 reply; 20+ messages in thread
From: Gao Xiang @ 2025-01-06 2:27 UTC (permalink / raw)
To: Hongzhen Luo, linux-erofs; +Cc: linux-kernel
On 2025/1/5 23:12, Hongzhen Luo wrote:
> Currently, reading files with different paths (or names) but the same
> content will consume multiple copies of the page cache, even if the
> content of these page caches is the same. For example, reading identical
> files (e.g., *.so files) from two different minor versions of container
> images will cost multiple copies of the same page cache, since different
> containers have different mount points. Therefore, sharing the page cache
> for files with the same content can save memory.
>
> This introduces the page cache share feature in erofs. During the mkfs
> phase, the file content is hashed and the hash value is stored in the
> `trusted.erofs.fingerprint` extended attribute. Inodes of files with the
> same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
> (indicated by the `ano_inode` field). When a read request occurs, the
> anonymous inode serves as a "container" whose page cache is shared. The
> actual operations involving the iomap are carried out by the original
> inode which is mapped to the anonymous inode.
>
> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
> ---
> fs/erofs/Kconfig | 10 ++
> fs/erofs/Makefile | 1 +
> fs/erofs/internal.h | 4 +
> fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
> fs/erofs/pagecache_share.h | 26 +++++
> fs/erofs/super.c | 24 +++-
> 6 files changed, 292 insertions(+), 1 deletion(-)
> create mode 100644 fs/erofs/pagecache_share.c
> create mode 100644 fs/erofs/pagecache_share.h
>
> diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
> index 6ea60661fa55..3aa5f946b5f1 100644
> --- a/fs/erofs/Kconfig
> +++ b/fs/erofs/Kconfig
> @@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
> at higher priority.
>
> If unsure, say N.
> +
> +config EROFS_FS_PAGE_CACHE_SHARE
> + bool "EROFS page cache share support"
> + depends on EROFS_FS
> + default n
> + help
> + This permits EROFS to share page cache for files with same
> + fingerprints.
> +
> + If unsure, say N.
> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
> index 4331d53c7109..d035c9063ef8 100644
> --- a/fs/erofs/Makefile
> +++ b/fs/erofs/Makefile
> @@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
> erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
> erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
> erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
> +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
> \ No newline at end of file
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 47004eb89838..6c87621d86ba 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -280,6 +280,9 @@ struct erofs_inode {
> };
> #endif /* CONFIG_EROFS_FS_ZIP */
> };
> +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
> + struct inode *ano_inode;
`ano_inode` carries no real meaning; we'd better come up with a more meaningful name.
> +#endif
> /* the corresponding vfs inode */
> struct inode vfs_inode;
> };
> @@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
>
> extern const struct file_operations erofs_file_fops;
> extern const struct file_operations erofs_dir_fops;
> +extern const struct file_operations erofs_pcshr_fops;
>
> extern const struct iomap_ops z_erofs_iomap_report_ops;
>
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> new file mode 100644
> index 000000000000..703fd17c002c
> --- /dev/null
> +++ b/fs/erofs/pagecache_share.c
> @@ -0,0 +1,228 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2024, Alibaba Cloud
> + */
> +#include <linux/xxhash.h>
> +#include <linux/refcount.h>
> +#include <linux/mount.h>
> +#include <linux/mutex.h>
> +#include "pagecache_share.h"
> +#include "internal.h"
> +#include "xattr.h"
> +
> +#define PCSHR_FPRT_IDX 4
> +#define PCSHR_FPRT_NAME "erofs.fingerprint"
> +#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
> +
> +struct erofs_pcshr_counter {
> + struct mutex mutex;
> + struct kref ref;
> + struct vfsmount *mnt;
> +};
> +
> +struct erofs_pcshr_private {
> + char fprt[PCSHR_FPRT_MAXLEN];
> +};
> +
> +static struct erofs_pcshr_counter mnt_counter = {
> + .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
> + .mnt = NULL,
> +};
> +
> +static void erofs_pcshr_counter_release(struct kref *ref)
> +{
> + struct erofs_pcshr_counter *counter = container_of(ref,
> + struct erofs_pcshr_counter, ref);
> +
> + DBG_BUGON(!counter->mnt);
> + kern_unmount(counter->mnt);
> + counter->mnt = NULL;
> +}
> +
> +int erofs_pcshr_init_mnt(void)
> +{
> + int ret;
> + struct vfsmount *tmp;
> +
> + mutex_lock(&mnt_counter.mutex);
> + if (!mnt_counter.mnt) {
> + tmp = kern_mount(&erofs_anon_fs_type);
> + if (IS_ERR(tmp)) {
> + ret = PTR_ERR(tmp);
> + goto out;
> + }
> + mnt_counter.mnt = tmp;
> + kref_init(&mnt_counter.ref);
> + } else
> + kref_get(&mnt_counter.ref);
> + ret = 0;
> +out:
> + mutex_unlock(&mnt_counter.mutex);
> + return ret;
> +}
> +
> +void erofs_pcshr_free_mnt(void)
> +{
> + mutex_lock(&mnt_counter.mutex);
> + kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
> + mutex_unlock(&mnt_counter.mutex);
> +}
> +
> +static int erofs_fprt_eq(struct inode *inode, void *data)
> +{
> + struct erofs_pcshr_private *ano_private = inode->i_private;
> +
> + return ano_private && memcmp(ano_private->fprt, data,
> + sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
> +}
> +
> +static int erofs_fprt_set(struct inode *inode, void *data)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
> + if (!ano_private)
> + return -ENOMEM;
> + memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + inode->i_private = ano_private;
> + return 0;
> +}
> +
> +int erofs_pcshr_fill_inode(struct inode *inode)
> +{
> + struct erofs_inode *vi = EROFS_I(inode);
> + /* | fingerprint length | fingerprint content | */
> + char fprt[PCSHR_FPRT_MAXLEN];
We shouldn't allocate such a large buffer on the stack.
> + struct inode *ano_inode;
> + unsigned long fprt_hash;
> + size_t fprt_len;
> + int ret = -1;
> +
> + vi->ano_inode = NULL;
> + memset(fprt, 0, sizeof(fprt));
> + fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
> + fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
Now I think it would be better if users had a way to configure
the xattr key name. That way, we could likewise reuse an fsverity-like
root hash digest.
> + if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
> + *(size_t *)fprt = fprt_len;
> + fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
> + ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
> + erofs_fprt_eq, erofs_fprt_set, fprt);
> + DBG_BUGON(!ano_inode);
Why won't iget5_locked() return NULL?
> + vi->ano_inode = ano_inode;
> + if (ano_inode->i_state & I_NEW) {
> + if (erofs_inode_is_data_compressed(vi->datalayout))
> + ano_inode->i_mapping->a_ops = &z_erofs_aops;
> + else
> + ano_inode->i_mapping->a_ops = &erofs_aops;
> + ano_inode->i_size = inode->i_size;
> + unlock_new_inode(ano_inode);
> + }
> + ret = 0;
> + }
> + return ret;
> +}
> +
> +void erofs_pcshr_free_inode(struct inode *inode)
> +{
> + struct erofs_inode *vi = EROFS_I(inode);
> +
> + if (S_ISREG(inode->i_mode) && vi->ano_inode) {
redundant space.
> + iput(vi->ano_inode);
> + vi->ano_inode = NULL;
> + }
> +}
> +
> +static struct file *erofs_pcshr_alloc_file(struct file *file,
> + struct inode *ano_inode)
> +{
> + struct file *ano_file;
> +
> + ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
> + "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
> + if (IS_ERR(ano_file))
> + return ano_file;
> +
> + file_ra_state_init(&ano_file->f_ra, file->f_mapping);
> + ano_file->private_data = EROFS_I(file_inode(file));
> + return ano_file;
> +}
> +
> +static int erofs_pcshr_file_open(struct inode *inode, struct file *file)
> +{
> + struct file *ano_file;
> + struct inode *ano_inode;
> + struct erofs_inode *vi = EROFS_I(inode);
> +
> + ano_inode = vi->ano_inode;
> + if (!ano_inode)
> + return -EINVAL;
> +
> + ano_file = erofs_pcshr_alloc_file(file, ano_inode);
> + if (IS_ERR(ano_file))
> + return PTR_ERR(ano_file);
> +
> + ihold(ano_inode);
> + file->private_data = (void *)ano_file;
> + return 0;
> +}
> +
> +static int erofs_pcshr_file_release(struct inode *inode, struct file *file)
> +{
> + if (!file->private_data)
> + return -EINVAL;
> +
> + fput((struct file *)file->private_data);
> + file->private_data = NULL;
> + return 0;
> +}
> +
> +static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
> + struct iov_iter *to)
> +{
> + struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
> + struct file *file, *ano_file;
> + struct kiocb ano_iocb;
> + ssize_t res;
> +
> + if (!iov_iter_count(to))
> + return 0;
> +#ifdef CONFIG_FS_DAX
> + if (IS_DAX(inode))
> + return erofs_file_fops.read_iter(iocb, to);
> +#endif
> + if (iocb->ki_flags & IOCB_DIRECT)
> + return erofs_file_fops.read_iter(iocb, to);
> +
> + memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
> + file = iocb->ki_filp;
> + ano_file = file->private_data;
> + if (!ano_file)
> + return -EINVAL;
> + ano_iocb.ki_filp = ano_file;
> + res = filemap_read(&ano_iocb, to, 0);
Why do we need to use this? What is "erofs_pcshr_file_read_iter" used for?
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature
2025-01-06 2:27 ` Gao Xiang
@ 2025-01-06 3:03 ` Hongzhen Luo
0 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-06 3:03 UTC (permalink / raw)
To: Gao Xiang, linux-erofs; +Cc: linux-kernel
On 2025/1/6 10:27, Gao Xiang wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> Currently, reading files with different paths (or names) but the same
>> content will consume multiple copies of the page cache, even if the
>> content of these page caches is the same. For example, reading identical
>> files (e.g., *.so files) from two different minor versions of container
>> images will cost multiple copies of the same page cache, since different
>> containers have different mount points. Therefore, sharing the page
>> cache
>> for files with the same content can save memory.
>>
>> This introduces the page cache share feature in erofs. During the mkfs
>> phase, the file content is hashed and the hash value is stored in the
>> `trusted.erofs.fingerprint` extended attribute. Inodes of files with the
>> same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
>> (indicated by the `ano_inode` field). When a read request occurs, the
>> anonymous inode serves as a "container" whose page cache is shared. The
>> actual operations involving the iomap are carried out by the original
>> inode which is mapped to the anonymous inode.
>>
>> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
>> ---
>> fs/erofs/Kconfig | 10 ++
>> fs/erofs/Makefile | 1 +
>> fs/erofs/internal.h | 4 +
>> fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
>> fs/erofs/pagecache_share.h | 26 +++++
>> fs/erofs/super.c | 24 +++-
>> 6 files changed, 292 insertions(+), 1 deletion(-)
>> create mode 100644 fs/erofs/pagecache_share.c
>> create mode 100644 fs/erofs/pagecache_share.h
>>
>> diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
>> index 6ea60661fa55..3aa5f946b5f1 100644
>> --- a/fs/erofs/Kconfig
>> +++ b/fs/erofs/Kconfig
>> @@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
>> at higher priority.
>> If unsure, say N.
>> +
>> +config EROFS_FS_PAGE_CACHE_SHARE
>> + bool "EROFS page cache share support"
>> + depends on EROFS_FS
>> + default n
>> + help
>> + This permits EROFS to share page cache for files with same
>> + fingerprints.
>> +
>> + If unsure, say N.
>> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
>> index 4331d53c7109..d035c9063ef8 100644
>> --- a/fs/erofs/Makefile
>> +++ b/fs/erofs/Makefile
>> @@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) +=
>> decompressor_deflate.o
>> erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
>> erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
>> erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
>> +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
>> \ No newline at end of file
>> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
>> index 47004eb89838..6c87621d86ba 100644
>> --- a/fs/erofs/internal.h
>> +++ b/fs/erofs/internal.h
>> @@ -280,6 +280,9 @@ struct erofs_inode {
>> };
>> #endif /* CONFIG_EROFS_FS_ZIP */
>> };
>> +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>> + struct inode *ano_inode;
>
> ano_inode has no extra meaning, we'd better to think out a meaningful
> name.
>
>> +#endif
>> /* the corresponding vfs inode */
>> struct inode vfs_inode;
>> };
>> @@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
>> extern const struct file_operations erofs_file_fops;
>> extern const struct file_operations erofs_dir_fops;
>> +extern const struct file_operations erofs_pcshr_fops;
>> extern const struct iomap_ops z_erofs_iomap_report_ops;
>> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> new file mode 100644
>> index 000000000000..703fd17c002c
>> --- /dev/null
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -0,0 +1,228 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * Copyright (C) 2024, Alibaba Cloud
>> + */
>> +#include <linux/xxhash.h>
>> +#include <linux/refcount.h>
>> +#include <linux/mount.h>
>> +#include <linux/mutex.h>
>> +#include "pagecache_share.h"
>> +#include "internal.h"
>> +#include "xattr.h"
>> +
>> +#define PCSHR_FPRT_IDX 4
>> +#define PCSHR_FPRT_NAME "erofs.fingerprint"
>> +#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
>> +
>> +struct erofs_pcshr_counter {
>> + struct mutex mutex;
>> + struct kref ref;
>> + struct vfsmount *mnt;
>> +};
>> +
>> +struct erofs_pcshr_private {
>> + char fprt[PCSHR_FPRT_MAXLEN];
>> +};
>> +
>> +static struct erofs_pcshr_counter mnt_counter = {
>> + .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
>> + .mnt = NULL,
>> +};
>> +
>> +static void erofs_pcshr_counter_release(struct kref *ref)
>> +{
>> + struct erofs_pcshr_counter *counter = container_of(ref,
>> + struct erofs_pcshr_counter, ref);
>> +
>> + DBG_BUGON(!counter->mnt);
>> + kern_unmount(counter->mnt);
>> + counter->mnt = NULL;
>> +}
>> +
>> +int erofs_pcshr_init_mnt(void)
>> +{
>> + int ret;
>> + struct vfsmount *tmp;
>> +
>> + mutex_lock(&mnt_counter.mutex);
>> + if (!mnt_counter.mnt) {
>> + tmp = kern_mount(&erofs_anon_fs_type);
>> + if (IS_ERR(tmp)) {
>> + ret = PTR_ERR(tmp);
>> + goto out;
>> + }
>> + mnt_counter.mnt = tmp;
>> + kref_init(&mnt_counter.ref);
>> + } else
>> + kref_get(&mnt_counter.ref);
>> + ret = 0;
>> +out:
>> + mutex_unlock(&mnt_counter.mutex);
>> + return ret;
>> +}
>> +
>> +void erofs_pcshr_free_mnt(void)
>> +{
>> + mutex_lock(&mnt_counter.mutex);
>> + kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
>> + mutex_unlock(&mnt_counter.mutex);
>> +}
>> +
>> +static int erofs_fprt_eq(struct inode *inode, void *data)
>> +{
>> + struct erofs_pcshr_private *ano_private = inode->i_private;
>> +
>> + return ano_private && memcmp(ano_private->fprt, data,
>> + sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
>> +}
>> +
>> +static int erofs_fprt_set(struct inode *inode, void *data)
>> +{
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + ano_private = kmalloc(sizeof(struct erofs_pcshr_private),
>> GFP_KERNEL);
>> + if (!ano_private)
>> + return -ENOMEM;
>> + memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> + inode->i_private = ano_private;
>> + return 0;
>> +}
>> +
>> +int erofs_pcshr_fill_inode(struct inode *inode)
>> +{
>> + struct erofs_inode *vi = EROFS_I(inode);
>> + /* | fingerprint length | fingerprint content | */
>> + char fprt[PCSHR_FPRT_MAXLEN];
>
> we shouldn't allocate too large space on stack.
>
>> + struct inode *ano_inode;
>> + unsigned long fprt_hash;
>> + size_t fprt_len;
>> + int ret = -1;
>> +
>> + vi->ano_inode = NULL;
>> + memset(fprt, 0, sizeof(fprt));
>> + fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
>> + fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
>
> Now, I think it'd be better that users could have a way to configure
> the xattr key name. Since in that way, we could reuse fsverity-like
> root hash digest likewise.
>
>> + if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
>> + *(size_t *)fprt = fprt_len;
>> + fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
>> + ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
>> + erofs_fprt_eq, erofs_fprt_set, fprt);
>> + DBG_BUGON(!ano_inode);
>
> Why iget5_locked() won't return NULL?
>
>> + vi->ano_inode = ano_inode;
>> + if (ano_inode->i_state & I_NEW) {
>> + if (erofs_inode_is_data_compressed(vi->datalayout))
>> + ano_inode->i_mapping->a_ops = &z_erofs_aops;
>> + else
>> + ano_inode->i_mapping->a_ops = &erofs_aops;
>> + ano_inode->i_size = inode->i_size;
>> + unlock_new_inode(ano_inode);
>> + }
>> + ret = 0;
>> + }
>> + return ret;
>> +}
>> +
>> +void erofs_pcshr_free_inode(struct inode *inode)
>> +{redundant
>> + struct erofs_inode *vi = EROFS_I(inode);
>> +
>> + if (S_ISREG(inode->i_mode) && vi->ano_inode) {
>
> redundant space.
>
>> + iput(vi->ano_inode);
>> + vi->ano_inode = NULL;
>> + }
>> +}
>> +
>> +static struct file *erofs_pcshr_alloc_file(struct file *file,
>> + struct inode *ano_inode)
>> +{
>> + struct file *ano_file;
>> +
>> + ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
>> + "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
>> + if (IS_ERR(ano_file))
>> + return ano_file;
>> +
>> + file_ra_state_init(&ano_file->f_ra, file->f_mapping);
>> + ano_file->private_data = EROFS_I(file_inode(file));
>> + return ano_file;
>> +}
>> +
>> +static int erofs_pcshr_file_open(struct inode *inode, struct file
>> *file)
>> +{
>> + struct file *ano_file;
>> + struct inode *ano_inode;
>> + struct erofs_inode *vi = EROFS_I(inode);
>> +
>> + ano_inode = vi->ano_inode;
>> + if (!ano_inode)
>> + return -EINVAL;
>> +
>> + ano_file = erofs_pcshr_alloc_file(file, ano_inode);
>> + if (IS_ERR(ano_file))
>> + return PTR_ERR(ano_file);
>> +
>> + ihold(ano_inode);
>> + file->private_data = (void *)ano_file;
>> + return 0;
>> +}
>> +
>> +static int erofs_pcshr_file_release(struct inode *inode, struct file
>> *file)
>> +{
>> + if (!file->private_data)
>> + return -EINVAL;
>> +
>> + fput((struct file *)file->private_data);
>> + file->private_data = NULL;
>> + return 0;
>> +}
>> +
>> +static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
>> + struct iov_iter *to)
>> +{
>> + struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
>> + struct file *file, *ano_file;
>> + struct kiocb ano_iocb;
>> + ssize_t res;
>> +
>> + if (!iov_iter_count(to))
>> + return 0;
>> +#ifdef CONFIG_FS_DAX
>> + if (IS_DAX(inode))
>> + return erofs_file_fops.read_iter(iocb, to);
>> +#endif
>> + if (iocb->ki_flags & IOCB_DIRECT)
>> + return erofs_file_fops.read_iter(iocb, to);
>> +
>> + memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
>> + file = iocb->ki_filp;
>> + ano_file = file->private_data;
>> + if (!ano_file)
>> + return -EINVAL;
>> + ano_iocb.ki_filp = ano_file;
>> + res = filemap_read(&ano_iocb, to, 0);
>
> why we need to use this? what is "erofs_pcshr_file_read_iter" used for?
>
We need this because, at this point, we have to switch to the anonymous
file so that `filemap_read` uses the page cache of the anonymous inode.
In this way, files with the same fingerprint share a single page cache,
namely that of the anonymous inode.
Thanks,
Hongzhen Luo
> Thanks,
> Gao Xiang
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
2025-01-05 15:12 ` Hongzhen Luo
(?)
(?)
@ 2025-01-06 3:40 ` kernel test robot
-1 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-01-06 3:40 UTC (permalink / raw)
To: Hongzhen Luo; +Cc: llvm, oe-kbuild-all
Hi Hongzhen,
[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:
[auto build test ERROR on xiang-erofs/dev-test]
[also build test ERROR on xiang-erofs/dev xiang-erofs/fixes linus/master v6.13-rc5 next-20241220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hongzhen-Luo/erofs-move-struct-erofs_anon_fs_type-to-super-c/20250105-231438
base: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link: https://lore.kernel.org/r/20250105151208.3797385-5-hongzhen%40linux.alibaba.com
patch subject: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
config: s390-allmodconfig (https://download.01.org/0day-ci/archive/20250106/202501061117.0i32iTXy-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250106/202501061117.0i32iTXy-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202501061117.0i32iTXy-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from fs/erofs/pagecache_share.c:11:
In file included from include/linux/pagemap.h:8:
In file included from include/linux/mm.h:2224:
include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
504 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
505 | item];
| ~~~~
include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
511 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
512 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
524 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
525 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
>> fs/erofs/pagecache_share.c:66:4: error: invalid application of 'sizeof' to an incomplete type 'struct interval_tree_node'
66 | sizeof(struct interval_tree_node), 0,
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/slab.h:430:41: note: expanded from macro 'kmem_cache_create'
430 | default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__)
| ^~~~~~~~~~~~~
fs/erofs/pagecache_share.c:66:18: note: forward declaration of 'struct interval_tree_node'
66 | sizeof(struct interval_tree_node), 0,
| ^
>> fs/erofs/pagecache_share.c:163:8: error: call to undeclared function 'interval_tree_iter_first'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
163 | seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
| ^
fs/erofs/pagecache_share.c:163:8: note: did you mean 'vma_interval_tree_iter_first'?
include/linux/mm.h:3295:24: note: 'vma_interval_tree_iter_first' declared here
3295 | struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
| ^
>> fs/erofs/pagecache_share.c:163:6: error: incompatible integer to pointer conversion assigning to 'struct interval_tree_node *' from 'int' [-Wint-conversion]
163 | seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> fs/erofs/pagecache_share.c:165:14: error: call to undeclared function 'interval_tree_iter_next'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
165 | next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
| ^
fs/erofs/pagecache_share.c:165:12: error: incompatible integer to pointer conversion assigning to 'struct interval_tree_node *' from 'int' [-Wint-conversion]
165 | next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> fs/erofs/pagecache_share.c:166:3: error: call to undeclared function 'interval_tree_remove'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
166 | interval_tree_remove(seg, &vi->segs);
| ^
fs/erofs/pagecache_share.c:166:3: note: did you mean 'vma_interval_tree_remove'?
include/linux/mm.h:3293:6: note: 'vma_interval_tree_remove' declared here
3293 | void vma_interval_tree_remove(struct vm_area_struct *node,
| ^
fs/erofs/pagecache_share.c:276:8: error: call to undeclared function 'interval_tree_iter_first'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
276 | seg = interval_tree_iter_first(&vi->segs, start, end);
| ^
fs/erofs/pagecache_share.c:276:6: error: incompatible integer to pointer conversion assigning to 'struct interval_tree_node *' from 'int' [-Wint-conversion]
276 | seg = interval_tree_iter_first(&vi->segs, start, end);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fs/erofs/pagecache_share.c:278:14: error: call to undeclared function 'interval_tree_iter_next'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
278 | next_seg = interval_tree_iter_next(seg, start, end);
| ^
fs/erofs/pagecache_share.c:278:12: error: incompatible integer to pointer conversion assigning to 'struct interval_tree_node *' from 'int' [-Wint-conversion]
278 | next_seg = interval_tree_iter_next(seg, start, end);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> fs/erofs/pagecache_share.c:283:21: error: incomplete definition of type 'struct interval_tree_node'
283 | l = max_t(u64, seg->start | 0ULL, start);
| ~~~^
include/linux/minmax.h:221:49: note: expanded from macro 'max_t'
221 | #define max_t(type, x, y) __cmp_once(max, type, x, y)
| ^
include/linux/minmax.h:96:30: note: expanded from macro '__cmp_once'
96 | __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
| ^
include/linux/minmax.h:93:16: note: expanded from macro '__cmp_once_unique'
93 | ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
| ^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fs/erofs/pagecache_share.c:284:21: error: incomplete definition of type 'struct interval_tree_node'
284 | r = min_t(u64, seg->last | 0ULL, end);
| ~~~^
include/linux/minmax.h:213:49: note: expanded from macro 'min_t'
213 | #define min_t(type, x, y) __cmp_once(min, type, x, y)
| ^
include/linux/minmax.h:96:30: note: expanded from macro '__cmp_once'
96 | __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
| ^
include/linux/minmax.h:93:16: note: expanded from macro '__cmp_once_unique'
93 | ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
| ^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fs/erofs/pagecache_share.c:289:10: error: incomplete definition of type 'struct interval_tree_node'
289 | if (seg->start < l) {
| ~~~^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fs/erofs/pagecache_share.c:291:11: error: incomplete definition of type 'struct interval_tree_node'
291 | new_seg->start = seg->start;
| ~~~~~~~^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fs/erofs/pagecache_share.c:291:24: error: incomplete definition of type 'struct interval_tree_node'
291 | new_seg->start = seg->start;
| ~~~^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fs/erofs/pagecache_share.c:292:11: error: incomplete definition of type 'struct interval_tree_node'
292 | new_seg->last = l;
| ~~~~~~~^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
>> fs/erofs/pagecache_share.c:293:4: error: call to undeclared function 'interval_tree_insert'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
293 | interval_tree_insert(new_seg, &vi->segs);
| ^
fs/erofs/pagecache_share.c:293:4: note: did you mean 'vma_interval_tree_insert'?
include/linux/mm.h:3288:6: note: 'vma_interval_tree_insert' declared here
3288 | void vma_interval_tree_insert(struct vm_area_struct *node,
| ^
fs/erofs/pagecache_share.c:297:14: error: incomplete definition of type 'struct interval_tree_node'
297 | if (r < seg->last) {
| ~~~^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fs/erofs/pagecache_share.c:299:11: error: incomplete definition of type 'struct interval_tree_node'
299 | new_seg->start = r;
| ~~~~~~~^
fs/erofs/pagecache_share.c:87:15: note: forward declaration of 'struct interval_tree_node'
87 | static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
| ^
fatal error: too many errors emitted, stopping now [-ferror-limit=]
3 warnings and 20 errors generated.
vim +66 fs/erofs/pagecache_share.c
49
50 int erofs_pcshr_init_mnt(void)
51 {
52 int ret;
53 struct vfsmount *tmp;
54
55 mutex_lock(&mnt_counter.mutex);
56 if (!mnt_counter.mnt) {
57 tmp = kern_mount(&erofs_anon_fs_type);
58 if (IS_ERR(tmp)) {
59 ret = PTR_ERR(tmp);
60 goto out;
61 }
62 mnt_counter.mnt = tmp;
63 kref_init(&mnt_counter.ref);
64
65 mnt_counter.segsp = kmem_cache_create("erofs_segs",
> 66 sizeof(struct interval_tree_node), 0,
67 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL);
68 if (!mnt_counter.segsp) {
69 ret = -ENOMEM;
70 goto out;
71 }
72 } else
73 kref_get(&mnt_counter.ref);
74 ret = 0;
75 out:
76 mutex_unlock(&mnt_counter.mutex);
77 return ret;
78 }
79
80 void erofs_pcshr_free_mnt(void)
81 {
82 mutex_lock(&mnt_counter.mutex);
83 kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
84 mutex_unlock(&mnt_counter.mutex);
85 }
86
> 87 static struct interval_tree_node *erofs_pcshr_alloc_seg(void)
88 {
89 return kmem_cache_alloc(mnt_counter.segsp, GFP_KERNEL);
90 }
91
92 static void erofs_pcshr_free_seg(struct interval_tree_node *seg)
93 {
94 kmem_cache_free(mnt_counter.segsp, seg);
95 }
96
97 static int erofs_fprt_eq(struct inode *inode, void *data)
98 {
99 struct erofs_pcshr_private *ano_private = inode->i_private;
100
101 return ano_private && memcmp(ano_private->fprt, data,
102 sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
103 }
104
105 static int erofs_fprt_set(struct inode *inode, void *data)
106 {
107 struct erofs_pcshr_private *ano_private;
108
109 ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
110 if (!ano_private)
111 return -ENOMEM;
112 memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
113 mutex_init(&ano_private->mutex);
114 inode->i_private = ano_private;
115 return 0;
116 }
117
118 int erofs_pcshr_fill_inode(struct inode *inode)
119 {
120 struct erofs_inode *vi = EROFS_I(inode);
121 /* | fingerprint length | fingerprint content | */
122 char fprt[PCSHR_FPRT_MAXLEN];
123 struct inode *ano_inode;
124 unsigned long fprt_hash;
125 size_t fprt_len;
126 int ret = -1;
127
128 vi->ano_inode = NULL;
129 memset(fprt, 0, sizeof(fprt));
130 fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
131 fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
132 if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
133 *(size_t *)fprt = fprt_len;
134 fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
135 ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
136 erofs_fprt_eq, erofs_fprt_set, fprt);
137 DBG_BUGON(!ano_inode);
138 vi->ano_inode = ano_inode;
139 vi->segs = RB_ROOT_CACHED;
140 mutex_init(&vi->segs_mutex);
141 if (ano_inode->i_state & I_NEW) {
142 if (erofs_inode_is_data_compressed(vi->datalayout))
143 ano_inode->i_mapping->a_ops = &z_erofs_aops;
144 else
145 ano_inode->i_mapping->a_ops = &erofs_aops;
146 ano_inode->i_size = inode->i_size;
147 unlock_new_inode(ano_inode);
148 }
149 ret = 0;
150 }
151 return ret;
152 }
153
154 void erofs_pcshr_free_inode(struct inode *inode)
155 {
156 struct interval_tree_node *seg, *next_seg;
157 struct erofs_inode *vi = EROFS_I(inode);
158
159 if (S_ISREG(inode->i_mode) && vi->ano_inode) {
160 iput(vi->ano_inode);
161 vi->ano_inode = NULL;
162 }
> 163 seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
164 while (seg) {
> 165 next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
> 166 interval_tree_remove(seg, &vi->segs);
167 erofs_pcshr_free_seg(seg);
168 seg = next_seg;
169 }
170 }
171
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
2025-01-05 15:12 ` Hongzhen Luo
` (2 preceding siblings ...)
(?)
@ 2025-01-06 11:52 ` kernel test robot
-1 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2025-01-06 11:52 UTC (permalink / raw)
To: Hongzhen Luo; +Cc: oe-kbuild-all
Hi Hongzhen,
[This is a private test report for your RFC patch.]
kernel test robot noticed the following build warnings:
[auto build test WARNING on xiang-erofs/dev-test]
[also build test WARNING on xiang-erofs/dev xiang-erofs/fixes linus/master v6.13-rc6 next-20250106]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hongzhen-Luo/erofs-move-struct-erofs_anon_fs_type-to-super-c/20250105-231438
base: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link: https://lore.kernel.org/r/20250105151208.3797385-5-hongzhen%40linux.alibaba.com
patch subject: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share
config: arc-allmodconfig (https://download.01.org/0day-ci/archive/20250106/202501061957.awS07N4f-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250106/202501061957.awS07N4f-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202501061957.awS07N4f-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from fs/erofs/pagecache_share.c:10:
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_init_mnt':
fs/erofs/pagecache_share.c:66:32: error: invalid application of 'sizeof' to incomplete type 'struct interval_tree_node'
66 | sizeof(struct interval_tree_node), 0,
| ^~~~~~
include/linux/slab.h:430:55: note: in definition of macro 'kmem_cache_create'
430 | default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__)
| ^~~~~~~~~~~~~
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_free_inode':
fs/erofs/pagecache_share.c:163:15: error: implicit declaration of function 'interval_tree_iter_first'; did you mean 'vma_interval_tree_iter_first'? [-Werror=implicit-function-declaration]
163 | seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
| ^~~~~~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_iter_first
>> fs/erofs/pagecache_share.c:163:13: warning: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
163 | seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
| ^
fs/erofs/pagecache_share.c:165:28: error: implicit declaration of function 'interval_tree_iter_next'; did you mean 'vma_interval_tree_iter_next'? [-Werror=implicit-function-declaration]
165 | next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
| ^~~~~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_iter_next
fs/erofs/pagecache_share.c:165:26: warning: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
165 | next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
| ^
fs/erofs/pagecache_share.c:166:17: error: implicit declaration of function 'interval_tree_remove'; did you mean 'vma_interval_tree_remove'? [-Werror=implicit-function-declaration]
166 | interval_tree_remove(seg, &vi->segs);
| ^~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_remove
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_fadvise':
fs/erofs/pagecache_share.c:276:13: warning: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
276 | seg = interval_tree_iter_first(&vi->segs, start, end);
| ^
fs/erofs/pagecache_share.c:278:26: warning: assignment to 'struct interval_tree_node *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
278 | next_seg = interval_tree_iter_next(seg, start, end);
| ^
In file included from include/linux/kernel.h:28,
from include/linux/cpumask.h:11,
from include/linux/smp.h:13,
from include/linux/lockdep.h:14,
from include/linux/mutex.h:17,
from fs/erofs/pagecache_share.c:8:
fs/erofs/pagecache_share.c:283:35: error: invalid use of undefined type 'struct interval_tree_node'
283 | l = max_t(u64, seg->start | 0ULL, start);
| ^~
include/linux/minmax.h:93:23: note: in definition of macro '__cmp_once_unique'
93 | ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
| ^
include/linux/minmax.h:221:27: note: in expansion of macro '__cmp_once'
221 | #define max_t(type, x, y) __cmp_once(max, type, x, y)
| ^~~~~~~~~~
fs/erofs/pagecache_share.c:283:21: note: in expansion of macro 'max_t'
283 | l = max_t(u64, seg->start | 0ULL, start);
| ^~~~~
fs/erofs/pagecache_share.c:284:35: error: invalid use of undefined type 'struct interval_tree_node'
284 | r = min_t(u64, seg->last | 0ULL, end);
| ^~
include/linux/minmax.h:93:23: note: in definition of macro '__cmp_once_unique'
93 | ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
| ^
include/linux/minmax.h:213:27: note: in expansion of macro '__cmp_once'
213 | #define min_t(type, x, y) __cmp_once(min, type, x, y)
| ^~~~~~~~~~
fs/erofs/pagecache_share.c:284:21: note: in expansion of macro 'min_t'
284 | r = min_t(u64, seg->last | 0ULL, end);
| ^~~~~
fs/erofs/pagecache_share.c:289:24: error: invalid use of undefined type 'struct interval_tree_node'
289 | if (seg->start < l) {
| ^~
fs/erofs/pagecache_share.c:291:32: error: invalid use of undefined type 'struct interval_tree_node'
291 | new_seg->start = seg->start;
| ^~
fs/erofs/pagecache_share.c:291:45: error: invalid use of undefined type 'struct interval_tree_node'
291 | new_seg->start = seg->start;
| ^~
fs/erofs/pagecache_share.c:292:32: error: invalid use of undefined type 'struct interval_tree_node'
292 | new_seg->last = l;
| ^~
fs/erofs/pagecache_share.c:293:25: error: implicit declaration of function 'interval_tree_insert'; did you mean 'vma_interval_tree_insert'? [-Werror=implicit-function-declaration]
293 | interval_tree_insert(new_seg, &vi->segs);
| ^~~~~~~~~~~~~~~~~~~~
| vma_interval_tree_insert
fs/erofs/pagecache_share.c:297:28: error: invalid use of undefined type 'struct interval_tree_node'
297 | if (r < seg->last) {
| ^~
fs/erofs/pagecache_share.c:299:32: error: invalid use of undefined type 'struct interval_tree_node'
299 | new_seg->start = r;
| ^~
fs/erofs/pagecache_share.c:300:32: error: invalid use of undefined type 'struct interval_tree_node'
300 | new_seg->last = seg->last;
| ^~
fs/erofs/pagecache_share.c:300:44: error: invalid use of undefined type 'struct interval_tree_node'
300 | new_seg->last = seg->last;
| ^~
fs/erofs/pagecache_share.c: In function 'erofs_pcshr_read_end':
fs/erofs/pagecache_share.c:367:12: error: invalid use of undefined type 'struct interval_tree_node'
367 | seg->start = folio_index(folio);
| ^~
fs/erofs/pagecache_share.c:368:12: error: invalid use of undefined type 'struct interval_tree_node'
368 | seg->last = seg->start + (folio_size(folio) >> PAGE_SHIFT);
| ^~
fs/erofs/pagecache_share.c:368:24: error: invalid use of undefined type 'struct interval_tree_node'
368 | seg->last = seg->start + (folio_size(folio) >> PAGE_SHIFT);
| ^~
fs/erofs/pagecache_share.c:369:16: error: invalid use of undefined type 'struct interval_tree_node'
369 | if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT))
| ^~
fs/erofs/pagecache_share.c:370:20: error: invalid use of undefined type 'struct interval_tree_node'
370 | seg->last = vi->vfs_inode.i_size >> PAGE_SHIFT;
| ^~
In file included from arch/arc/include/asm/atomic.h:12,
from include/linux/atomic.h:7,
from include/linux/refcount.h:95,
from fs/erofs/pagecache_share.c:6:
fs/erofs/pagecache_share.c:371:22: error: invalid use of undefined type 'struct interval_tree_node'
371 | DBG_BUGON(seg->last < seg->start);
vim +163 fs/erofs/pagecache_share.c
153
154 void erofs_pcshr_free_inode(struct inode *inode)
155 {
156 struct interval_tree_node *seg, *next_seg;
157 struct erofs_inode *vi = EROFS_I(inode);
158
159 if (S_ISREG(inode->i_mode) && vi->ano_inode) {
160 iput(vi->ano_inode);
161 vi->ano_inode = NULL;
162 }
> 163 seg = interval_tree_iter_first(&vi->segs, 0, LLONG_MAX);
164 while (seg) {
165 next_seg = interval_tree_iter_next(seg, 0, LLONG_MAX);
166 interval_tree_remove(seg, &vi->segs);
167 erofs_pcshr_free_seg(seg);
168 seg = next_seg;
169 }
170 }
171
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 3/4] erofs: apply the page cache share feature
2025-01-05 15:12 ` Hongzhen Luo
@ 2025-01-21 11:59 ` Hongbo Li
-1 siblings, 0 replies; 20+ messages in thread
From: Hongbo Li via Linux-erofs @ 2025-01-21 11:59 UTC (permalink / raw)
To: Hongzhen Luo, linux-erofs; +Cc: linux-kernel
On 2025/1/5 23:12, Hongzhen Luo wrote:
> This modifies relevant functions to apply the page cache
> share feature.
>
> Below is the memory usage for reading all files in two different minor
> versions of container images:
>
> +-------------------+------------------+-------------+---------------+
> | Image | Page Cache Share | Memory (MB) | Memory |
> | | | | Reduction (%) |
> +-------------------+------------------+-------------+---------------+
> | | No | 241 | - |
> | redis +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
> +-------------------+------------------+-------------+---------------+
> | | No | 872 | - |
> | postgres +------------------+-------------+---------------+
> | 16.1 & 16.2 | Yes | 630 | 28% |
> +-------------------+------------------+-------------+---------------+
> | | No | 2771 | - |
> | tensorflow +------------------+-------------+---------------+
> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
> +-------------------+------------------+-------------+---------------+
> | | No | 926 | - |
> | mysql +------------------+-------------+---------------+
> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
> +-------------------+------------------+-------------+---------------+
> | | No | 390 | - |
> | nginx +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
> +-------------------+------------------+-------------+---------------+
> | tomcat | No | 924 | - |
> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
> | | Yes | 474 | 49% |
> +-------------------+------------------+-------------+---------------+
>
> Additionally, the table below shows the runtime memory usage of the
> container:
>
> +-------------------+------------------+-------------+---------------+
> | Image | Page Cache Share | Memory (MB) | Memory |
> | | | | Reduction (%) |
> +-------------------+------------------+-------------+---------------+
> | | No | 35 | - |
> | redis +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
> +-------------------+------------------+-------------+---------------+
> | | No | 149 | - |
> | postgres +------------------+-------------+---------------+
> | 16.1 & 16.2 | Yes | 95 | 37% |
> +-------------------+------------------+-------------+---------------+
> | | No | 1028 | - |
> | tensorflow +------------------+-------------+---------------+
> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
> +-------------------+------------------+-------------+---------------+
> | | No | 155 | - |
> | mysql +------------------+-------------+---------------+
> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
> +-------------------+------------------+-------------+---------------+
> | | No | 25 | - |
> | nginx +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
> +-------------------+------------------+-------------+---------------+
> | tomcat | No | 186 | - |
> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
> | | Yes | 98 | 48% |
> +-------------------+------------------+-------------+---------------+
>
> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
> ---
> fs/erofs/data.c | 14 +++++++--
> fs/erofs/inode.c | 5 ++-
> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
> fs/erofs/pagecache_share.h | 11 +++++++
> fs/erofs/super.c | 7 +++++
> fs/erofs/zdata.c | 9 ++++--
> 6 files changed, 104 insertions(+), 5 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0cd6b5c4df98..fb08acbeaab6 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "internal.h"
> +#include "pagecache_share.h"
> #include <linux/sched/mm.h>
> #include <trace/events/erofs.h>
>
> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> */
> static int erofs_read_folio(struct file *file, struct folio *folio)
> {
> - return iomap_read_folio(folio, &erofs_iomap_ops);
> + int ret, pcshr;
> +
> + pcshr = erofs_pcshr_read_begin(file, folio);
> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
> + erofs_pcshr_read_end(file, folio, pcshr);
> + return ret;
> }
>
> static void erofs_readahead(struct readahead_control *rac)
> {
> - return iomap_readahead(rac, &erofs_iomap_ops);
> + int pcshr;
> +
> + pcshr = erofs_pcshr_readahead_begin(rac);
> + iomap_readahead(rac, &erofs_iomap_ops);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index d4b89407822a..0b070f4b46b8 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "xattr.h"
> +#include "pagecache_share.h"
> #include <trace/events/erofs.h>
>
> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
> switch (inode->i_mode & S_IFMT) {
> case S_IFREG:
> inode->i_op = &erofs_generic_iops;
> - if (erofs_inode_is_data_compressed(vi->datalayout))
> + if (erofs_pcshr_fill_inode(inode) == 0)
> + inode->i_fop = &erofs_pcshr_fops;
> + else if (erofs_inode_is_data_compressed(vi->datalayout))
> inode->i_fop = &generic_ro_fops;
> else
> inode->i_fop = &erofs_file_fops;
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> index 703fd17c002c..22172b5e21c7 100644
> --- a/fs/erofs/pagecache_share.c
> +++ b/fs/erofs/pagecache_share.c
> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>
> struct erofs_pcshr_private {
> char fprt[PCSHR_FPRT_MAXLEN];
> + struct mutex mutex;
> };
>
> static struct erofs_pcshr_counter mnt_counter = {
> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
> if (!ano_private)
> return -ENOMEM;
> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + mutex_init(&ano_private->mutex);
> inode->i_private = ano_private;
> return 0;
> }
> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
> .get_unmapped_area = thp_get_unmapped_area,
> .splice_read = filemap_splice_read,
> };
> +
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
> +{
> + struct erofs_inode *vi;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = vi->ano_inode->i_private;
> + mutex_lock(&ano_private->mutex);
Can we lock at folio granularity? The erofs_pcshr_private mutex may
limit read concurrency.
> + folio->mapping->host = &vi->vfs_inode;
> + return 1;
> +}
> +
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(file)->i_private;
> + folio->mapping->host = file_inode(file);
> + mutex_unlock(&ano_private->mutex);
> +}
> +
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
> +{
Maybe the begin/end helpers for read and readahead could share a common
implementation, since they perform similar logic.
> + struct erofs_inode *vi;
> + struct file *file = rac->file;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = file_inode(file)->i_private;
> + mutex_lock(&ano_private->mutex);
> + rac->mapping->host = &vi->vfs_inode;
> + return 1;
> +}
> +
> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(rac->file)->i_private;
> + rac->mapping->host = file_inode(rac->file);
> + mutex_unlock(&ano_private->mutex);
> +}
> diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
> index f3889d6889e5..abda2a60278b 100644
> --- a/fs/erofs/pagecache_share.h
> +++ b/fs/erofs/pagecache_share.h
> @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
> int erofs_pcshr_fill_inode(struct inode *inode);
> void erofs_pcshr_free_inode(struct inode *inode);
>
> +/* switch between the anonymous inode and the real inode */
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac);
> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
> +
> #else
>
> static inline int erofs_pcshr_init_mnt(void) { return 0; }
> @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
> static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
> static inline void erofs_pcshr_free_inode(struct inode *inode) {}
>
> +static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
> +static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
> +static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
> +static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
> +
> #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>
> #endif
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index b4ce07dc931c..1b690eb6c1f1 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -13,6 +13,7 @@
> #include <linux/backing-dev.h>
> #include <linux/pseudo_fs.h>
> #include "xattr.h"
> +#include "pagecache_share.h"
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/erofs.h>
> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
> {
> struct erofs_inode *vi = EROFS_I(inode);
>
> + erofs_pcshr_free_inode(inode);
> if (inode->i_op == &erofs_fast_symlink_iops)
> kfree(inode->i_link);
> kfree(vi->xattr_shared_xattrs);
> @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
> if (err)
> return err;
>
> + err = erofs_pcshr_init_mnt();
> + if (err)
> + return err;
> +
> erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
> return 0;
> }
> @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
> kill_anon_super(sb);
> else
> kill_block_super(sb);
> + erofs_pcshr_free_mnt();
> fs_put_dax(sbi->dif0.dax_dev, NULL);
> erofs_fscache_unregister_fs(sb);
> erofs_sb_free(sbi);
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 19ef4ff2a134..fc2ed01eaabe 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2022 Alibaba Cloud
> */
> #include "compress.h"
> +#include "pagecache_share.h"
> #include <linux/psi.h>
> #include <linux/cpuhotplug.h>
> #include <trace/events/erofs.h>
> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
> {
> struct inode *const inode = folio->mapping->host;
> struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
> - int err;
> + int err, pcshr;
>
> trace_erofs_read_folio(folio, false);
> + pcshr = erofs_pcshr_read_begin(file, folio);
> f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
>
> z_erofs_pcluster_readmore(&f, NULL, true);
> @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
>
> erofs_put_metabuf(&f.map.buf);
> erofs_release_pages(&f.pagepool);
> + erofs_pcshr_read_end(file, folio, pcshr);
> return err;
> }
>
> @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
> struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
> struct folio *head = NULL, *folio;
> unsigned int nr_folios;
> - int err;
> + int err, pcshr;
>
> + pcshr = erofs_pcshr_readahead_begin(rac);
> f.headoffset = readahead_pos(rac);
>
> z_erofs_pcluster_readmore(&f, rac, true);
> @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
> (void)z_erofs_runqueue(&f, nr_folios);
> erofs_put_metabuf(&f.map.buf);
> erofs_release_pages(&f.pagepool);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> const struct address_space_operations z_erofs_aops = {
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 3/4] erofs: apply the page cache share feature
@ 2025-01-21 11:59 ` Hongbo Li
0 siblings, 0 replies; 20+ messages in thread
From: Hongbo Li @ 2025-01-21 11:59 UTC (permalink / raw)
To: Hongzhen Luo, linux-erofs; +Cc: linux-kernel
On 2025/1/5 23:12, Hongzhen Luo wrote:
> This modifies relevant functions to apply the page cache
> share feature.
>
> Below is the memory usage for reading all files in two different minor
> versions of container images:
>
> +-------------------+------------------+-------------+---------------+
> | Image | Page Cache Share | Memory (MB) | Memory |
> | | | | Reduction (%) |
> +-------------------+------------------+-------------+---------------+
> | | No | 241 | - |
> | redis +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
> +-------------------+------------------+-------------+---------------+
> | | No | 872 | - |
> | postgres +------------------+-------------+---------------+
> | 16.1 & 16.2 | Yes | 630 | 28% |
> +-------------------+------------------+-------------+---------------+
> | | No | 2771 | - |
> | tensorflow +------------------+-------------+---------------+
> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
> +-------------------+------------------+-------------+---------------+
> | | No | 926 | - |
> | mysql +------------------+-------------+---------------+
> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
> +-------------------+------------------+-------------+---------------+
> | | No | 390 | - |
> | nginx +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
> +-------------------+------------------+-------------+---------------+
> | tomcat | No | 924 | - |
> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
> | | Yes | 474 | 49% |
> +-------------------+------------------+-------------+---------------+
>
> Additionally, the table below shows the runtime memory usage of the
> container:
>
> +-------------------+------------------+-------------+---------------+
> | Image | Page Cache Share | Memory (MB) | Memory |
> | | | | Reduction (%) |
> +-------------------+------------------+-------------+---------------+
> | | No | 35 | - |
> | redis +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
> +-------------------+------------------+-------------+---------------+
> | | No | 149 | - |
> | postgres +------------------+-------------+---------------+
> | 16.1 & 16.2 | Yes | 95 | 37% |
> +-------------------+------------------+-------------+---------------+
> | | No | 1028 | - |
> | tensorflow +------------------+-------------+---------------+
> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
> +-------------------+------------------+-------------+---------------+
> | | No | 155 | - |
> | mysql +------------------+-------------+---------------+
> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
> +-------------------+------------------+-------------+---------------+
> | | No | 25 | - |
> | nginx +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
> +-------------------+------------------+-------------+---------------+
> | tomcat | No | 186 | - |
> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
> | | Yes | 98 | 48% |
> +-------------------+------------------+-------------+---------------+
>
> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
> ---
> fs/erofs/data.c | 14 +++++++--
> fs/erofs/inode.c | 5 ++-
> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
> fs/erofs/pagecache_share.h | 11 +++++++
> fs/erofs/super.c | 7 +++++
> fs/erofs/zdata.c | 9 ++++--
> 6 files changed, 104 insertions(+), 5 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0cd6b5c4df98..fb08acbeaab6 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "internal.h"
> +#include "pagecache_share.h"
> #include <linux/sched/mm.h>
> #include <trace/events/erofs.h>
>
> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> */
> static int erofs_read_folio(struct file *file, struct folio *folio)
> {
> - return iomap_read_folio(folio, &erofs_iomap_ops);
> + int ret, pcshr;
> +
> + pcshr = erofs_pcshr_read_begin(file, folio);
> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
> + erofs_pcshr_read_end(file, folio, pcshr);
> + return ret;
> }
>
> static void erofs_readahead(struct readahead_control *rac)
> {
> - return iomap_readahead(rac, &erofs_iomap_ops);
> + int pcshr;
> +
> + pcshr = erofs_pcshr_readahead_begin(rac);
> + iomap_readahead(rac, &erofs_iomap_ops);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index d4b89407822a..0b070f4b46b8 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "xattr.h"
> +#include "pagecache_share.h"
> #include <trace/events/erofs.h>
>
> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
> switch (inode->i_mode & S_IFMT) {
> case S_IFREG:
> inode->i_op = &erofs_generic_iops;
> - if (erofs_inode_is_data_compressed(vi->datalayout))
> + if (erofs_pcshr_fill_inode(inode) == 0)
> + inode->i_fop = &erofs_pcshr_fops;
> + else if (erofs_inode_is_data_compressed(vi->datalayout))
> inode->i_fop = &generic_ro_fops;
> else
> inode->i_fop = &erofs_file_fops;
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> index 703fd17c002c..22172b5e21c7 100644
> --- a/fs/erofs/pagecache_share.c
> +++ b/fs/erofs/pagecache_share.c
> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>
> struct erofs_pcshr_private {
> char fprt[PCSHR_FPRT_MAXLEN];
> + struct mutex mutex;
> };
>
> static struct erofs_pcshr_counter mnt_counter = {
> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
> if (!ano_private)
> return -ENOMEM;
> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + mutex_init(&ano_private->mutex);
> inode->i_private = ano_private;
> return 0;
> }
> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
> .get_unmapped_area = thp_get_unmapped_area,
> .splice_read = filemap_splice_read,
> };
> +
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
> +{
> + struct erofs_inode *vi;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = vi->ano_inode->i_private;
> + mutex_lock(&ano_private->mutex);
Can we lock at folio granularity? The erofs_pcshr_private mutex may
limit read concurrency.
> + folio->mapping->host = &vi->vfs_inode;
> + return 1;
> +}
> +
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(file)->i_private;
> + folio->mapping->host = file_inode(file);
> + mutex_unlock(&ano_private->mutex);
> +}
> +
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
> +{
Maybe the begin/end helpers for read and readahead could share a common
implementation, since they perform similar logic.
> + struct erofs_inode *vi;
> + struct file *file = rac->file;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = file_inode(file)->i_private;
> + mutex_lock(&ano_private->mutex);
> + rac->mapping->host = &vi->vfs_inode;
> + return 1;
> +}
> +
> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(rac->file)->i_private;
> + rac->mapping->host = file_inode(rac->file);
> + mutex_unlock(&ano_private->mutex);
> +}
> diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
> index f3889d6889e5..abda2a60278b 100644
> --- a/fs/erofs/pagecache_share.h
> +++ b/fs/erofs/pagecache_share.h
> @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
> int erofs_pcshr_fill_inode(struct inode *inode);
> void erofs_pcshr_free_inode(struct inode *inode);
>
> +/* switch between the anonymous inode and the real inode */
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac);
> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
> +
> #else
>
> static inline int erofs_pcshr_init_mnt(void) { return 0; }
> @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
> static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
> static inline void erofs_pcshr_free_inode(struct inode *inode) {}
>
> +static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
> +static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
> +static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
> +static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
> +
> #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>
> #endif
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index b4ce07dc931c..1b690eb6c1f1 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -13,6 +13,7 @@
> #include <linux/backing-dev.h>
> #include <linux/pseudo_fs.h>
> #include "xattr.h"
> +#include "pagecache_share.h"
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/erofs.h>
> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
> {
> struct erofs_inode *vi = EROFS_I(inode);
>
> + erofs_pcshr_free_inode(inode);
> if (inode->i_op == &erofs_fast_symlink_iops)
> kfree(inode->i_link);
> kfree(vi->xattr_shared_xattrs);
> @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
> if (err)
> return err;
>
> + err = erofs_pcshr_init_mnt();
> + if (err)
> + return err;
> +
> erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
> return 0;
> }
> @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
> kill_anon_super(sb);
> else
> kill_block_super(sb);
> + erofs_pcshr_free_mnt();
> fs_put_dax(sbi->dif0.dax_dev, NULL);
> erofs_fscache_unregister_fs(sb);
> erofs_sb_free(sbi);
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 19ef4ff2a134..fc2ed01eaabe 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2022 Alibaba Cloud
> */
> #include "compress.h"
> +#include "pagecache_share.h"
> #include <linux/psi.h>
> #include <linux/cpuhotplug.h>
> #include <trace/events/erofs.h>
> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
> {
> struct inode *const inode = folio->mapping->host;
> struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
> - int err;
> + int err, pcshr;
>
> trace_erofs_read_folio(folio, false);
> + pcshr = erofs_pcshr_read_begin(file, folio);
> f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
>
> z_erofs_pcluster_readmore(&f, NULL, true);
> @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
>
> erofs_put_metabuf(&f.map.buf);
> erofs_release_pages(&f.pagepool);
> + erofs_pcshr_read_end(file, folio, pcshr);
> return err;
> }
>
> @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
> struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
> struct folio *head = NULL, *folio;
> unsigned int nr_folios;
> - int err;
> + int err, pcshr;
>
> + pcshr = erofs_pcshr_readahead_begin(rac);
> f.headoffset = readahead_pos(rac);
>
> z_erofs_pcluster_readmore(&f, rac, true);
> @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
> (void)z_erofs_runqueue(&f, nr_folios);
> erofs_put_metabuf(&f.map.buf);
> erofs_release_pages(&f.pagepool);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> const struct address_space_operations z_erofs_aops = {
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 3/4] erofs: apply the page cache share feature
2025-01-21 11:59 ` Hongbo Li
(?)
@ 2025-01-21 12:16 ` Gao Xiang
-1 siblings, 0 replies; 20+ messages in thread
From: Gao Xiang @ 2025-01-21 12:16 UTC (permalink / raw)
To: Hongbo Li, Hongzhen Luo, linux-erofs; +Cc: linux-kernel
On 2025/1/21 19:59, Hongbo Li via Linux-erofs wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> This modifies relevant functions to apply the page cache
>> share feature.
>>
>> Below is the memory usage for reading all files in two different minor
>> versions of container images:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 241 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 872 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 630 | 28% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 2771 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 926 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 390 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 924 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 474 | 49% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Additionally, the table below shows the runtime memory usage of the
>> container:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 35 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 149 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 95 | 37% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 1028 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 155 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 25 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 186 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 98 | 48% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
>> ---
>> fs/erofs/data.c | 14 +++++++--
>> fs/erofs/inode.c | 5 ++-
>> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
>> fs/erofs/pagecache_share.h | 11 +++++++
>> fs/erofs/super.c | 7 +++++
>> fs/erofs/zdata.c | 9 ++++--
>> 6 files changed, 104 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
>> index 0cd6b5c4df98..fb08acbeaab6 100644
>> --- a/fs/erofs/data.c
>> +++ b/fs/erofs/data.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "internal.h"
>> +#include "pagecache_share.h"
>> #include <linux/sched/mm.h>
>> #include <trace/events/erofs.h>
>> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>> */
>> static int erofs_read_folio(struct file *file, struct folio *folio)
>> {
>> - return iomap_read_folio(folio, &erofs_iomap_ops);
>> + int ret, pcshr;
>> +
>> + pcshr = erofs_pcshr_read_begin(file, folio);
>> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
>> + erofs_pcshr_read_end(file, folio, pcshr);
>> + return ret;
>> }
>> static void erofs_readahead(struct readahead_control *rac)
>> {
>> - return iomap_readahead(rac, &erofs_iomap_ops);
>> + int pcshr;
>> +
>> + pcshr = erofs_pcshr_readahead_begin(rac);
>> + iomap_readahead(rac, &erofs_iomap_ops);
>> + erofs_pcshr_readahead_end(rac, pcshr);
>> }
>> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
>> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
>> index d4b89407822a..0b070f4b46b8 100644
>> --- a/fs/erofs/inode.c
>> +++ b/fs/erofs/inode.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "xattr.h"
>> +#include "pagecache_share.h"
>> #include <trace/events/erofs.h>
>> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
>> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
>> switch (inode->i_mode & S_IFMT) {
>> case S_IFREG:
>> inode->i_op = &erofs_generic_iops;
>> - if (erofs_inode_is_data_compressed(vi->datalayout))
>> + if (erofs_pcshr_fill_inode(inode) == 0)
>> + inode->i_fop = &erofs_pcshr_fops;
>> + else if (erofs_inode_is_data_compressed(vi->datalayout))
>> inode->i_fop = &generic_ro_fops;
>> else
>> inode->i_fop = &erofs_file_fops;
>> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> index 703fd17c002c..22172b5e21c7 100644
>> --- a/fs/erofs/pagecache_share.c
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>> struct erofs_pcshr_private {
>> char fprt[PCSHR_FPRT_MAXLEN];
>> + struct mutex mutex;
>> };
>> static struct erofs_pcshr_counter mnt_counter = {
>> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
>> if (!ano_private)
>> return -ENOMEM;
>> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> + mutex_init(&ano_private->mutex);
>> inode->i_private = ano_private;
>> return 0;
>> }
>> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
>> .get_unmapped_area = thp_get_unmapped_area,
>> .splice_read = filemap_splice_read,
>> };
>> +
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
>> +{
>> + struct erofs_inode *vi;
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (!(file && file->private_data))
>> + return 0;
>> +
>> + vi = file->private_data;
>> + if (vi->ano_inode != file_inode(file))
>> + return 0;
>> +
>> + ano_private = vi->ano_inode->i_private;
>> + mutex_lock(&ano_private->mutex);
> Can we lock at folio granularity? The erofs_pcshr_private mutex may limit read concurrency.
I've asked Hongzhen to prepare a new, more reasonable version.
In that version there shouldn't be such a mutex locking the
whole submit process; it should just keep all inodes stable.
Please just ignore this whole series.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH v5 3/4] erofs: apply the page cache share feature
2025-01-21 11:59 ` Hongbo Li
(?)
(?)
@ 2025-01-21 14:48 ` Hongzhen Luo
-1 siblings, 0 replies; 20+ messages in thread
From: Hongzhen Luo @ 2025-01-21 14:48 UTC (permalink / raw)
To: Hongbo Li, linux-erofs; +Cc: linux-kernel
On 2025/1/21 19:59, Hongbo Li wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> This modifies relevant functions to apply the page cache
>> share feature.
>>
>> Below is the memory usage for reading all files in two different minor
>> versions of container images:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 241 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 872 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 630 | 28% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 2771 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 926 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 390 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 924 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 474 | 49% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Additionally, the table below shows the runtime memory usage of the
>> container:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 35 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 149 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 95 | 37% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 1028 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 155 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 25 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 186 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 98 | 48% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
>> ---
>> fs/erofs/data.c | 14 +++++++--
>> fs/erofs/inode.c | 5 ++-
>> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
>> fs/erofs/pagecache_share.h | 11 +++++++
>> fs/erofs/super.c | 7 +++++
>> fs/erofs/zdata.c | 9 ++++--
>> 6 files changed, 104 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
>> index 0cd6b5c4df98..fb08acbeaab6 100644
>> --- a/fs/erofs/data.c
>> +++ b/fs/erofs/data.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "internal.h"
>> +#include "pagecache_share.h"
>> #include <linux/sched/mm.h>
>> #include <trace/events/erofs.h>
>> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct
>> fiemap_extent_info *fieinfo,
>> */
>> static int erofs_read_folio(struct file *file, struct folio *folio)
>> {
>> - return iomap_read_folio(folio, &erofs_iomap_ops);
>> + int ret, pcshr;
>> +
>> + pcshr = erofs_pcshr_read_begin(file, folio);
>> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
>> + erofs_pcshr_read_end(file, folio, pcshr);
>> + return ret;
>> }
>> static void erofs_readahead(struct readahead_control *rac)
>> {
>> - return iomap_readahead(rac, &erofs_iomap_ops);
>> + int pcshr;
>> +
>> + pcshr = erofs_pcshr_readahead_begin(rac);
>> + iomap_readahead(rac, &erofs_iomap_ops);
>> + erofs_pcshr_readahead_end(rac, pcshr);
>> }
>> static sector_t erofs_bmap(struct address_space *mapping,
>> sector_t block)
>> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
>> index d4b89407822a..0b070f4b46b8 100644
>> --- a/fs/erofs/inode.c
>> +++ b/fs/erofs/inode.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "xattr.h"
>> +#include "pagecache_share.h"
>> #include <trace/events/erofs.h>
>> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
>> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
>> switch (inode->i_mode & S_IFMT) {
>> case S_IFREG:
>> inode->i_op = &erofs_generic_iops;
>> - if (erofs_inode_is_data_compressed(vi->datalayout))
>> + if (erofs_pcshr_fill_inode(inode) == 0)
>> + inode->i_fop = &erofs_pcshr_fops;
>> + else if (erofs_inode_is_data_compressed(vi->datalayout))
>> inode->i_fop = &generic_ro_fops;
>> else
>> inode->i_fop = &erofs_file_fops;
>> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> index 703fd17c002c..22172b5e21c7 100644
>> --- a/fs/erofs/pagecache_share.c
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>> struct erofs_pcshr_private {
>> char fprt[PCSHR_FPRT_MAXLEN];
>> + struct mutex mutex;
>> };
>> static struct erofs_pcshr_counter mnt_counter = {
>> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void
>> *data)
>> if (!ano_private)
>> return -ENOMEM;
>> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> + mutex_init(&ano_private->mutex);
>> inode->i_private = ano_private;
>> return 0;
>> }
>> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
>> .get_unmapped_area = thp_get_unmapped_area,
>> .splice_read = filemap_splice_read,
>> };
>> +
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
>> +{
>> + struct erofs_inode *vi;
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (!(file && file->private_data))
>> + return 0;
>> +
>> + vi = file->private_data;
>> + if (vi->ano_inode != file_inode(file))
>> + return 0;
>> +
>> + ano_private = vi->ano_inode->i_private;
>> + mutex_lock(&ano_private->mutex);
> Can we lock at folio granularity? The erofs_pcshr_private mutex may
> limit read concurrency.
I’m sorry for the delay in responding; I just saw this message. I will
send an improved version of the patch soon. Thanks for this suggestion.
>> + folio->mapping->host = &vi->vfs_inode;
>> + return 1;
>> +}
>> +
>> +void erofs_pcshr_read_end(struct file *file, struct folio *folio,
>> int pcshr)
>> +{
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (pcshr == 0)
>> + return;
>> +
>> + ano_private = file_inode(file)->i_private;
>> + folio->mapping->host = file_inode(file);
>> + mutex_unlock(&ano_private->mutex);
>> +}
>> +
>> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
>> +{
> Maybe the begin/end helpers for read and readahead can share the
> same helpers. They implement similar logic.
Okay, indeed! I will send an improved version later.
Best wishes,
Hongzhen Luo
>> + struct erofs_inode *vi;
>> + struct file *file = rac->file;
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (!(file && file->private_data))
>> + return 0;
>> +
>> + vi = file->private_data;
>> + if (vi->ano_inode != file_inode(file))
>> + return 0;
>> +
>> + ano_private = file_inode(file)->i_private;
>> + mutex_lock(&ano_private->mutex);
>> + rac->mapping->host = &vi->vfs_inode;
>> + return 1;
>> +}
>> +
>> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int
>> pcshr)
>> +{
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (pcshr == 0)
>> + return;
>> +
>> + ano_private = file_inode(rac->file)->i_private;
>> + rac->mapping->host = file_inode(rac->file);
>> + mutex_unlock(&ano_private->mutex);
>> +}
>> diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
>> index f3889d6889e5..abda2a60278b 100644
>> --- a/fs/erofs/pagecache_share.h
>> +++ b/fs/erofs/pagecache_share.h
>> @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
>> int erofs_pcshr_fill_inode(struct inode *inode);
>> void erofs_pcshr_free_inode(struct inode *inode);
>> +/* switch between the anonymous inode and the real inode */
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
>> +void erofs_pcshr_read_end(struct file *file, struct folio *folio,
>> int pcshr);
>> +int erofs_pcshr_readahead_begin(struct readahead_control *rac);
>> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int
>> pcshr);
>> +
>> #else
>> static inline int erofs_pcshr_init_mnt(void) { return 0; }
>> @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
>> static inline int erofs_pcshr_fill_inode(struct inode *inode) {
>> return -1; }
>> static inline void erofs_pcshr_free_inode(struct inode *inode) {}
>> +static inline int erofs_pcshr_read_begin(struct file *file, struct
>> folio *folio) { return 0; }
>> +static inline void erofs_pcshr_read_end(struct file *file, struct
>> folio *folio, int pcshr) {}
>> +static inline int erofs_pcshr_readahead_begin(struct
>> readahead_control *rac) { return 0; }
>> +static inline void erofs_pcshr_readahead_end(struct
>> readahead_control *rac, int pcshr) {}
>> +
>> #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>> #endif
>> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
>> index b4ce07dc931c..1b690eb6c1f1 100644
>> --- a/fs/erofs/super.c
>> +++ b/fs/erofs/super.c
>> @@ -13,6 +13,7 @@
>> #include <linux/backing-dev.h>
>> #include <linux/pseudo_fs.h>
>> #include "xattr.h"
>> +#include "pagecache_share.h"
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/erofs.h>
>> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
>> {
>> struct erofs_inode *vi = EROFS_I(inode);
>> + erofs_pcshr_free_inode(inode);
>> if (inode->i_op == &erofs_fast_symlink_iops)
>> kfree(inode->i_link);
>> kfree(vi->xattr_shared_xattrs);
>> @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct
>> super_block *sb, struct fs_context *fc)
>> if (err)
>> return err;
>> + err = erofs_pcshr_init_mnt();
>> + if (err)
>> + return err;
>> +
>> erofs_info(sb, "mounted with root inode @ nid %llu.",
>> sbi->root_nid);
>> return 0;
>> }
>> @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
>> kill_anon_super(sb);
>> else
>> kill_block_super(sb);
>> + erofs_pcshr_free_mnt();
>> fs_put_dax(sbi->dif0.dax_dev, NULL);
>> erofs_fscache_unregister_fs(sb);
>> erofs_sb_free(sbi);
>> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
>> index 19ef4ff2a134..fc2ed01eaabe 100644
>> --- a/fs/erofs/zdata.c
>> +++ b/fs/erofs/zdata.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2022 Alibaba Cloud
>> */
>> #include "compress.h"
>> +#include "pagecache_share.h"
>> #include <linux/psi.h>
>> #include <linux/cpuhotplug.h>
>> #include <trace/events/erofs.h>
>> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file
>> *file, struct folio *folio)
>> {
>> struct inode *const inode = folio->mapping->host;
>> struct z_erofs_decompress_frontend f =
>> DECOMPRESS_FRONTEND_INIT(inode);
>> - int err;
>> + int err, pcshr;
>> trace_erofs_read_folio(folio, false);
>> + pcshr = erofs_pcshr_read_begin(file, folio);
>> f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
>> z_erofs_pcluster_readmore(&f, NULL, true);
>> @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file
>> *file, struct folio *folio)
>> erofs_put_metabuf(&f.map.buf);
>> erofs_release_pages(&f.pagepool);
>> + erofs_pcshr_read_end(file, folio, pcshr);
>> return err;
>> }
>> @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct
>> readahead_control *rac)
>> struct z_erofs_decompress_frontend f =
>> DECOMPRESS_FRONTEND_INIT(inode);
>> struct folio *head = NULL, *folio;
>> unsigned int nr_folios;
>> - int err;
>> + int err, pcshr;
>> + pcshr = erofs_pcshr_readahead_begin(rac);
>> f.headoffset = readahead_pos(rac);
>> z_erofs_pcluster_readmore(&f, rac, true);
>> @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct
>> readahead_control *rac)
>> (void)z_erofs_runqueue(&f, nr_folios);
>> erofs_put_metabuf(&f.map.buf);
>> erofs_release_pages(&f.pagepool);
>> + erofs_pcshr_readahead_end(rac, pcshr);
>> }
>> const struct address_space_operations z_erofs_aops = {
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2025-01-21 14:48 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-05 15:12 [RFC PATCH v5 0/4] erofs: page cache share feature Hongzhen Luo
2025-01-05 15:12 ` Hongzhen Luo
2025-01-05 15:12 ` [RFC PATCH v5 1/4] erofs: move `struct erofs_anon_fs_type` to super.c Hongzhen Luo
2025-01-05 15:12 ` Hongzhen Luo
2025-01-05 15:12 ` [RFC PATCH v5 2/4] erofs: introduce the page cache share feature Hongzhen Luo
2025-01-05 15:12 ` Hongzhen Luo
2025-01-06 2:27 ` Gao Xiang
2025-01-06 3:03 ` Hongzhen Luo
2025-01-05 15:12 ` [RFC PATCH v5 3/4] erofs: apply " Hongzhen Luo
2025-01-05 15:12 ` Hongzhen Luo
2025-01-06 2:15 ` Gao Xiang
2025-01-21 11:59 ` Hongbo Li via Linux-erofs
2025-01-21 11:59 ` Hongbo Li
2025-01-21 12:16 ` Gao Xiang
2025-01-21 14:48 ` Hongzhen Luo
2025-01-05 15:12 ` [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share Hongzhen Luo
2025-01-05 15:12 ` Hongzhen Luo
2025-01-05 18:52 ` kernel test robot
2025-01-06 3:40 ` kernel test robot
2025-01-06 11:52 ` kernel test robot
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.