All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1] fsck.erofs: implement thread-safe global LRU metadata cache
@ 2026-06-11  8:36 Nithurshen
  2026-06-11  9:15 ` Gao Xiang
  2026-06-11 18:46 ` [PATCH v2] " Nithurshen
  0 siblings, 2 replies; 3+ messages in thread
From: Nithurshen @ 2026-06-11  8:36 UTC (permalink / raw)
  To: linux-erofs; +Cc: xiang, hsiangkao, Nithurshen

This patch introduces a thread-safe metadata cache to reduce redundant
I/O and decompression overhead during fsck extraction.

To ensure it remains highly concurrent for worker threads extracting
pclusters, the cache utilizes a bucketed, rw-semaphore protected
architecture modeled after the existing fragment cache.

Furthermore, to prevent out-of-memory (OOM) scenarios on exceptionally
large EROFS images, the cache implements a strict Global Least Recently
Used (LRU) eviction policy. The maximum cache size is dynamically
configurable via the new '--cache-size' parameter, which defaults to a
safe, fixed threshold of 32 MB.

Signed-off-by: Nithurshen <nithurshen.dev@gmail.com>
---
 fsck/main.c              |  12 ++++
 include/erofs/internal.h |   2 +
 lib/data.c               | 149 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/fsck/main.c b/fsck/main.c
index ffe7e29..7a1e573 100644
--- a/fsck/main.c
+++ b/fsck/main.c
@@ -67,6 +67,7 @@ static struct option long_options[] = {
 	{"no-xattrs", no_argument, 0, 14},
 	{"nid", required_argument, 0, 15},
 	{"path", required_argument, 0, 16},
+	{"cache-size", required_argument, 0, 17},
 	{"no-sbcrc", no_argument, 0, 512},
 	{0, 0, 0, 0},
 };
@@ -120,6 +121,7 @@ static void usage(int argc, char **argv)
 		" --offset=#             skip # bytes at the beginning of IMAGE\n"
 		" --nid=#                check or extract from the target inode of nid #\n"
 		" --path=X               check or extract from the target inode of path X\n"
+		" --cache-size=#        set maximum metadata cache size in bytes (default 32MB)\n"
 		" --no-sbcrc             bypass the superblock checksum verification\n"
 		" --[no-]xattrs          whether to dump extended attributes (default off)\n"
 		"\n"
@@ -261,6 +263,16 @@ static int erofsfsck_parse_options_cfg(int argc, char **argv)
 		case 16:
 			fsckcfg.inode_path = optarg;
 			break;
+		case 17: {
+			char *endptr;
+			unsigned long cache_size = strtoul(optarg, &endptr, 0);
+			if (*endptr != '\0') {
+				erofs_err("invalid metadata cache size %s", optarg);
+				return -EINVAL;
+			}
+			erofs_meta_cache_set_capacity(cache_size);
+			break;
+		}
 		case 512:
 			fsckcfg.nosbcrc = true;
 			break;
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 94f14da..34b7eb3 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -459,6 +459,8 @@ struct z_erofs_read_ctx {
 
 void z_erofs_read_ctx_enqueue(struct z_erofs_read_ctx *ctx);
 
+void erofs_meta_cache_set_capacity(unsigned long bytes);
+
 int liberofs_global_init(void);
 void liberofs_global_exit(void);
 
diff --git a/lib/data.c b/lib/data.c
index e9d2218..9acf2bf 100644
--- a/lib/data.c
+++ b/lib/data.c
@@ -29,6 +29,84 @@ struct z_erofs_decompress_task {
 	unsigned int nr_reqs;
 };
 
+#define META_HASHSIZE		65536
+#define META_HASH(c)		((c) & (META_HASHSIZE - 1))
+
+struct erofs_meta_bucket {
+	struct list_head hash;
+	erofs_rwsem_t lock;
+};
+
+struct erofs_meta_item {
+	struct list_head list;
+	struct list_head lru;
+	u64 key;
+	char *data;
+	int length;
+	bool evicting;
+};
+
+static struct erofs_meta_bucket meta_bks[META_HASHSIZE];
+static bool meta_cache_inited = false;
+EROFS_DEFINE_MUTEX(meta_cache_init_lock);
+
+static EROFS_DEFINE_MUTEX(meta_lru_lock);
+static struct list_head meta_lru_list;
+static unsigned long meta_cache_bytes = 0;
+static unsigned long meta_cache_max_bytes = 32 * 1024 * 1024; 
+
+void erofs_meta_cache_set_capacity(unsigned long bytes)
+{
+	meta_cache_max_bytes = bytes;
+}
+
+static void erofs_meta_cache_init(void)
+{
+	int i;
+
+	erofs_mutex_lock(&meta_cache_init_lock);
+	if (meta_cache_inited) {
+		erofs_mutex_unlock(&meta_cache_init_lock);
+		return;
+	}
+
+	for (i = 0; i < META_HASHSIZE; ++i) {
+		init_list_head(&meta_bks[i].hash);
+		erofs_init_rwsem(&meta_bks[i].lock);
+	}
+	init_list_head(&meta_lru_list);
+	meta_cache_inited = true;
+	erofs_mutex_unlock(&meta_cache_init_lock);
+}
+
+static void erofs_meta_cache_evict(void)
+{
+	struct erofs_meta_item *item;
+	struct erofs_meta_bucket *bk;
+
+	erofs_mutex_lock(&meta_lru_lock);
+	while (meta_cache_bytes > meta_cache_max_bytes && !list_empty(&meta_lru_list)) {
+		/* Get the least recently used item (tail of the list) */
+		item = list_last_entry(&meta_lru_list, struct erofs_meta_item, lru);
+		item->evicting = true; /* Mark it dead to block cache hits from resurrecting it */
+		list_del(&item->lru);
+		init_list_head(&item->lru);
+		meta_cache_bytes -= item->length;
+		erofs_mutex_unlock(&meta_lru_lock);
+
+		bk = &meta_bks[META_HASH(item->key)];
+		erofs_down_write(&bk->lock);
+		list_del(&item->list);
+		erofs_up_write(&bk->lock);
+
+		free(item->data);
+		free(item);
+
+		erofs_mutex_lock(&meta_lru_lock);
+	}
+	erofs_mutex_unlock(&meta_lru_lock);
+}
+
 static void z_erofs_decompress_worker(struct erofs_work *work, void *tlsp)
 {
 	struct z_erofs_decompress_task *task = (struct z_erofs_decompress_task *)work;
@@ -604,7 +682,72 @@ static void *erofs_read_metadata_bdi(struct erofs_sb_info *sbi,
 void *erofs_read_metadata(struct erofs_sb_info *sbi, erofs_nid_t nid,
 			  erofs_off_t *offset, int *lengthp)
 {
+	u64 key = nid ? nid : *offset;
+	struct erofs_meta_bucket *bk;
+	struct erofs_meta_item *item;
+	void *buffer = NULL;
+
+	if (__erofs_unlikely(!meta_cache_inited))
+		erofs_meta_cache_init();
+
+	bk = &meta_bks[META_HASH(key)];
+
+	erofs_down_read(&bk->lock);
+	list_for_each_entry(item, &bk->hash, list) {
+		if (item->key == key) {
+			buffer = malloc(item->length);
+			if (buffer) {
+				memcpy(buffer, item->data, item->length);
+				*lengthp = item->length;
+				*offset = round_up(*offset, 4);
+				*offset += sizeof(__le16) + item->length;
+				
+				erofs_mutex_lock(&meta_lru_lock);
+                if (!item->evicting)
+                    list_del(&item->lru);
+					list_add(&item->lru, &meta_lru_list);
+                erofs_mutex_unlock(&meta_lru_lock);
+			}
+			break;
+		}
+	}
+	erofs_up_read(&bk->lock);
+
+	if (buffer)
+		return buffer;
+
 	if (nid)
-		return erofs_read_metadata_nid(sbi, nid, offset, lengthp);
-	return erofs_read_metadata_bdi(sbi, offset, lengthp);
-}
+		buffer = erofs_read_metadata_nid(sbi, nid, offset, lengthp);
+	else
+		buffer = erofs_read_metadata_bdi(sbi, offset, lengthp);
+
+	if (IS_ERR(buffer))
+		return buffer;
+
+	item = malloc(sizeof(*item));
+	if (item) {
+		item->key = key;
+		item->length = *lengthp;
+		item->evicting = false;
+		item->data = malloc(*lengthp);
+		if (item->data) {
+			memcpy(item->data, buffer, *lengthp);
+			
+			erofs_down_write(&bk->lock);
+			list_add_tail(&item->list, &bk->hash);
+			erofs_up_write(&bk->lock);
+
+			erofs_mutex_lock(&meta_lru_lock);
+            list_add(&item->lru, &meta_lru_list);
+            meta_cache_bytes += *lengthp;
+            erofs_mutex_unlock(&meta_lru_lock);
+
+			if (meta_cache_bytes > meta_cache_max_bytes)
+				erofs_meta_cache_evict();
+		} else {
+			free(item);
+		}
+	}
+
+	return buffer;
+}
\ No newline at end of file
-- 
2.52.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v1] fsck.erofs: implement thread-safe global LRU metadata cache
  2026-06-11  8:36 [PATCH v1] fsck.erofs: implement thread-safe global LRU metadata cache Nithurshen
@ 2026-06-11  9:15 ` Gao Xiang
  2026-06-11 18:46 ` [PATCH v2] " Nithurshen
  1 sibling, 0 replies; 3+ messages in thread
From: Gao Xiang @ 2026-06-11  9:15 UTC (permalink / raw)
  To: Nithurshen, linux-erofs; +Cc: xiang



On 2026/6/11 16:36, Nithurshen wrote:
> This patch introduces a thread-safe metadata cache to reduce redundant
> I/O and decompression overhead during fsck extraction.
> 
> To ensure it remains highly concurrent for worker threads extracting
> pclusters, the cache utilizes a bucketed, rw-semaphore protected
> architecture modeled after the existing fragment cache.
> 
> Furthermore, to prevent out-of-memory (OOM) scenarios on exceptionally
> large EROFS images, the cache implements a strict Global Least Recently
> Used (LRU) eviction policy. The maximum cache size is dynamically
> configurable via the new '--cache-size' parameter, which defaults to a
> safe, fixed threshold of 32 MB.
> 
> Signed-off-by: Nithurshen <nithurshen.dev@gmail.com>

Why would `malloc()` prevent out-of-memory (OOM)?

First, either erofs_read_metadata_nid() or erofs_read_metadata_bdi()
will read the file into the page cache, and currently there is no cache.


But you introduce another cache using malloc(). Since it increases
the memory overhead, why would it prevent out-of-memory (OOM)?

Thanks,
Gao Xiang


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v2] fsck.erofs: implement thread-safe global LRU metadata cache
  2026-06-11  8:36 [PATCH v1] fsck.erofs: implement thread-safe global LRU metadata cache Nithurshen
  2026-06-11  9:15 ` Gao Xiang
@ 2026-06-11 18:46 ` Nithurshen
  1 sibling, 0 replies; 3+ messages in thread
From: Nithurshen @ 2026-06-11 18:46 UTC (permalink / raw)
  To: linux-erofs; +Cc: nithurshen.dev, hsiangkao, xiang

This patch introduces a thread-safe userspace metadata cache to reduce
redundant decompression cycles and the overhead of repetitive pread()
syscalls across multiple background worker threads.

To ensure it remains highly concurrent for worker threads extracting
pclusters, the cache utilizes a bucketed, rw-semaphore protected
architecture modeled after the existing fragment cache.

While the introduction of a userspace cache inherently increases the
memory footprint compared to relying solely on the kernel's page cache,
this patch implements a global least-recently-used (LRU) eviction policy
to bound this additional memory overhead. This prevents the cache from
growing without bound on exceptionally large EROFS images. The maximum
cache capacity is configurable via the new '--cache-size' option (given
in bytes), which defaults to 32 MB.

Signed-off-by: Nithurshen <nithurshen.dev@gmail.com>
---
 fsck/main.c              |  12 ++++
 include/erofs/internal.h |   2 +
 lib/data.c               | 150 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 161 insertions(+), 3 deletions(-)

diff --git a/fsck/main.c b/fsck/main.c
index ffe7e29..7a1e573 100644
--- a/fsck/main.c
+++ b/fsck/main.c
@@ -67,6 +67,7 @@ static struct option long_options[] = {
 	{"no-xattrs", no_argument, 0, 14},
 	{"nid", required_argument, 0, 15},
 	{"path", required_argument, 0, 16},
+	{"cache-size", required_argument, 0, 17},
 	{"no-sbcrc", no_argument, 0, 512},
 	{0, 0, 0, 0},
 };
@@ -120,6 +121,7 @@ static void usage(int argc, char **argv)
 		" --offset=#             skip # bytes at the beginning of IMAGE\n"
 		" --nid=#                check or extract from the target inode of nid #\n"
 		" --path=X               check or extract from the target inode of path X\n"
+		" --cache-size=#        set maximum metadata cache size in bytes (default 32MB)\n"
 		" --no-sbcrc             bypass the superblock checksum verification\n"
 		" --[no-]xattrs          whether to dump extended attributes (default off)\n"
 		"\n"
@@ -261,6 +263,16 @@ static int erofsfsck_parse_options_cfg(int argc, char **argv)
 		case 16:
 			fsckcfg.inode_path = optarg;
 			break;
+		case 17: {
+			char *endptr;
+			unsigned long cache_size = strtoul(optarg, &endptr, 0);
+			if (*endptr != '\0') {
+				erofs_err("invalid metadata cache size %s", optarg);
+				return -EINVAL;
+			}
+			erofs_meta_cache_set_capacity(cache_size);
+			break;
+		}
 		case 512:
 			fsckcfg.nosbcrc = true;
 			break;
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 94f14da..34b7eb3 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -459,6 +459,8 @@ struct z_erofs_read_ctx {
 
 void z_erofs_read_ctx_enqueue(struct z_erofs_read_ctx *ctx);
 
+void erofs_meta_cache_set_capacity(unsigned long bytes);
+
 int liberofs_global_init(void);
 void liberofs_global_exit(void);
 
diff --git a/lib/data.c b/lib/data.c
index e9d2218..b8d81b3 100644
--- a/lib/data.c
+++ b/lib/data.c
@@ -29,6 +29,84 @@ struct z_erofs_decompress_task {
 	unsigned int nr_reqs;
 };
 
+#define META_HASHSIZE		65536
+#define META_HASH(c)		((c) & (META_HASHSIZE - 1))
+
+struct erofs_meta_bucket {
+	struct list_head hash;
+	erofs_rwsem_t lock;
+};
+
+struct erofs_meta_item {
+	struct list_head list;
+	struct list_head lru;
+	u64 key;
+	char *data;
+	int length;
+	bool evicting;
+};
+
+static struct erofs_meta_bucket meta_bks[META_HASHSIZE];
+static bool meta_cache_inited = false;
+EROFS_DEFINE_MUTEX(meta_cache_init_lock);
+
+static EROFS_DEFINE_MUTEX(meta_lru_lock);
+static struct list_head meta_lru_list;
+static unsigned long meta_cache_bytes = 0;
+static unsigned long meta_cache_max_bytes = 32 * 1024 * 1024; 
+
+void erofs_meta_cache_set_capacity(unsigned long bytes)
+{
+	meta_cache_max_bytes = bytes;
+}
+
+static void erofs_meta_cache_init(void)
+{
+	int i;
+
+	erofs_mutex_lock(&meta_cache_init_lock);
+	if (meta_cache_inited) {
+		erofs_mutex_unlock(&meta_cache_init_lock);
+		return;
+	}
+
+	for (i = 0; i < META_HASHSIZE; ++i) {
+		init_list_head(&meta_bks[i].hash);
+		erofs_init_rwsem(&meta_bks[i].lock);
+	}
+	init_list_head(&meta_lru_list);
+	meta_cache_inited = true;
+	erofs_mutex_unlock(&meta_cache_init_lock);
+}
+
+static void erofs_meta_cache_evict(void)
+{
+	struct erofs_meta_item *item;
+	struct erofs_meta_bucket *bk;
+
+	erofs_mutex_lock(&meta_lru_lock);
+	while (meta_cache_bytes > meta_cache_max_bytes && !list_empty(&meta_lru_list)) {
+		/* Get the least recently used item (tail of the list) */
+		item = list_last_entry(&meta_lru_list, struct erofs_meta_item, lru);
+		item->evicting = true; /* Mark it dead to block cache hits from resurrecting it */
+		list_del(&item->lru);
+		init_list_head(&item->lru);
+		meta_cache_bytes -= item->length;
+		erofs_mutex_unlock(&meta_lru_lock);
+
+		bk = &meta_bks[META_HASH(item->key)];
+		erofs_down_write(&bk->lock);
+		list_del(&item->list);
+		erofs_up_write(&bk->lock);
+
+		free(item->data);
+		free(item);
+
+		erofs_mutex_lock(&meta_lru_lock);
+	}
+	erofs_mutex_unlock(&meta_lru_lock);
+}
+
 static void z_erofs_decompress_worker(struct erofs_work *work, void *tlsp)
 {
 	struct z_erofs_decompress_task *task = (struct z_erofs_decompress_task *)work;
@@ -604,7 +682,73 @@ static void *erofs_read_metadata_bdi(struct erofs_sb_info *sbi,
 void *erofs_read_metadata(struct erofs_sb_info *sbi, erofs_nid_t nid,
 			  erofs_off_t *offset, int *lengthp)
 {
+	u64 key = nid ? nid : *offset;
+	struct erofs_meta_bucket *bk;
+	struct erofs_meta_item *item;
+	void *buffer = NULL;
+
+	if (__erofs_unlikely(!meta_cache_inited))
+		erofs_meta_cache_init();
+
+	bk = &meta_bks[META_HASH(key)];
+
+	erofs_down_read(&bk->lock);
+	list_for_each_entry(item, &bk->hash, list) {
+		if (item->key == key) {
+			buffer = malloc(item->length);
+			if (buffer) {
+				memcpy(buffer, item->data, item->length);
+				*lengthp = item->length;
+				*offset = round_up(*offset, 4);
+				*offset += sizeof(__le16) + item->length;
+				
+				erofs_mutex_lock(&meta_lru_lock);
+				if (!item->evicting) {
+					list_del(&item->lru);
+					list_add(&item->lru, &meta_lru_list);
+				}
+				erofs_mutex_unlock(&meta_lru_lock);
+			}
+			break;
+		}
+	}
+	erofs_up_read(&bk->lock);
+
+	if (buffer)
+		return buffer;
+
 	if (nid)
-		return erofs_read_metadata_nid(sbi, nid, offset, lengthp);
-	return erofs_read_metadata_bdi(sbi, offset, lengthp);
-}
+		buffer = erofs_read_metadata_nid(sbi, nid, offset, lengthp);
+	else
+		buffer = erofs_read_metadata_bdi(sbi, offset, lengthp);
+
+	if (IS_ERR(buffer))
+		return buffer;
+
+	item = malloc(sizeof(*item));
+	if (item) {
+		item->key = key;
+		item->length = *lengthp;
+		item->evicting = false;
+		item->data = malloc(*lengthp);
+		if (item->data) {
+			memcpy(item->data, buffer, *lengthp);
+			
+			erofs_down_write(&bk->lock);
+			list_add_tail(&item->list, &bk->hash);
+			erofs_up_write(&bk->lock);
+
+			erofs_mutex_lock(&meta_lru_lock);
+			list_add(&item->lru, &meta_lru_list);
+			meta_cache_bytes += *lengthp;
+			erofs_mutex_unlock(&meta_lru_lock);
+
+			if (meta_cache_bytes > meta_cache_max_bytes)
+				erofs_meta_cache_evict();
+		} else {
+			free(item);
+		}
+	}
+
+	return buffer;
+}
\ No newline at end of file
-- 
2.52.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-11 18:47 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-11  8:36 [PATCH v1] fsck.erofs: implement thread-safe global LRU metadata cache Nithurshen
2026-06-11  9:15 ` Gao Xiang
2026-06-11 18:46 ` [PATCH v2] " Nithurshen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.