* [PATCH 1/2] fsck.erofs: add multi-threaded decompression
2026-06-21 12:01 [PATCH 0/2] fsck.erofs: implement multi-threaded extraction Nithurshen
@ 2026-06-21 12:01 ` Nithurshen
2026-06-21 12:01 ` [PATCH 2/2] fsck.erofs: implement concurrent directory traversal Nithurshen
1 sibling, 0 replies; 3+ messages in thread
From: Nithurshen @ 2026-06-21 12:01 UTC (permalink / raw)
To: linux-erofs; +Cc: hsiangkao, xiang, Nithurshen
Currently, fsck.erofs extracts files synchronously. When decompressing
heavily packed images, the main thread spends the majority of its time
blocked on a combination of synchronous write() syscalls and
decompression routines, bottlenecking overall extraction speed.
This patch introduces a scalable, multi-threaded decompression framework
using the existing erofs_workqueue infrastructure to decouple compute
from the main thread's I/O.
To prevent massive scheduling overhead (futex contention) where worker
threads spend more CPU time waking up than actually decompressing small
clusters, this implementation introduces a batching context. Because
different compression algorithms have vastly different per-pcluster
decompression costs, the batch size is algorithm-aware:
- Fast algorithms like LZ4 utilize a larger batch limit (up to 32
pclusters) to effectively hide synchronization overhead.
- Compute-heavy algorithms like LZMA or ZSTD trigger at a lower
threshold (8 pclusters) to prevent memory bloat and thread starvation.
Key details of this implementation:
- The worker pool is dynamically sized based on available system CPUs.
- Decompression tasks take ownership of the raw buffer, and of the
output buffer when the context's `free_out` flag is set, to prevent
data races and memory leaks.
- Output buffers are explicitly zero-initialized via calloc() to
prevent trailing garbage bytes from leaking into extracted files.
- Tail-end packed fragments are processed synchronously by the main
thread, as their minimal overhead does not benefit from asynchronous
offloading.
Signed-off-by: Nithurshen <nithurshen.dev@gmail.com>
---
fsck/main.c | 150 ++++++++++++---------------
include/erofs/cond.h | 31 ++++++
include/erofs/internal.h | 20 +++-
include/erofs/lock.h | 3 +
lib/data.c | 216 +++++++++++++++++++++++++++++----------
5 files changed, 277 insertions(+), 143 deletions(-)
create mode 100644 include/erofs/cond.h
diff --git a/fsck/main.c b/fsck/main.c
index 16cc627..ffe7e29 100644
--- a/fsck/main.c
+++ b/fsck/main.c
@@ -8,14 +8,18 @@
#include <time.h>
#include <utime.h>
#include <unistd.h>
+#include "erofs/lock.h"
#include <sys/stat.h>
#include "erofs/print.h"
#include "erofs/decompress.h"
#include "erofs/dir.h"
#include "erofs/xattr.h"
+#include "erofs/workqueue.h"
#include "../lib/compressor.h"
#include "../lib/liberofs_compress.h"
+extern struct erofs_workqueue erofs_wq;
+
static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid);
struct erofsfsck_dirstack {
@@ -505,44 +509,36 @@ out:
static int erofs_verify_inode_data(struct erofs_inode *inode, int outfd)
{
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER,
- };
+ struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER };
bool needdecode = fsckcfg.check_decomp && !erofs_is_packed_inode(inode);
int ret = 0;
- bool compressed;
+ bool compressed = erofs_inode_is_data_compressed(inode->datalayout);
erofs_off_t pos = 0;
u64 pchunk_len = 0;
- unsigned int raw_size = 0, buffer_size = 0;
- char *raw = NULL, *buffer = NULL;
- erofs_dbg("verify data chunk of nid(%llu): type(%d)",
- inode->nid | 0ULL, inode->datalayout);
+ struct z_erofs_read_ctx ctx = {
+ .pending_tasks = 0,
+ .final_err = 0,
+ .outfd = outfd,
+ .free_out = true,
+ .current_task = NULL
+ };
+ erofs_mutex_init(&ctx.lock);
+ erofs_cond_init(&ctx.cond);
- compressed = erofs_inode_is_data_compressed(inode->datalayout);
- while (pos < inode->i_size) {
- unsigned int alloc_rawsize;
+ erofs_dbg("verify data chunk of nid(%llu): type(%d)", inode->nid | 0ULL, inode->datalayout);
+ while (pos < inode->i_size) {
map.m_la = pos;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
- if (ret)
- goto out;
-
- if (!compressed && map.m_llen != map.m_plen) {
- erofs_err("broken chunk length m_la %" PRIu64 " m_llen %" PRIu64 " m_plen %" PRIu64,
- map.m_la, map.m_llen, map.m_plen);
- ret = -EFSCORRUPTED;
- goto out;
- }
+ if (ret) goto out;
- /* the last lcluster can be divided into 3 parts */
if (map.m_la + map.m_llen > inode->i_size)
map.m_llen = inode->i_size - map.m_la;
pchunk_len += map.m_plen;
pos += map.m_llen;
- /* should skip decomp? */
if (map.m_la >= inode->i_size || !needdecode)
continue;
@@ -555,85 +551,53 @@ static int erofs_verify_inode_data(struct erofs_inode *inode, int outfd)
continue;
}
- if (map.m_plen > Z_EROFS_PCLUSTER_MAX_SIZE) {
- if (compressed && !(map.m_flags & __EROFS_MAP_FRAGMENT)) {
- erofs_err("invalid pcluster size %" PRIu64 " @ offset %" PRIu64 " of nid %" PRIu64,
- map.m_plen, map.m_la,
- inode->nid | 0ULL);
- ret = -EFSCORRUPTED;
- goto out;
- }
- alloc_rawsize = Z_EROFS_PCLUSTER_MAX_SIZE;
- } else {
- alloc_rawsize = map.m_plen;
- }
-
- if (alloc_rawsize > raw_size) {
- char *newraw = realloc(raw, alloc_rawsize);
-
- if (!newraw) {
+ if (compressed) {
+ char *raw = malloc(map.m_plen);
+ size_t buffer_size = map.m_llen > erofs_blksiz(inode->sbi) ? map.m_llen : erofs_blksiz(inode->sbi);
+ char *buffer = calloc(1, buffer_size);
+
+ if (!raw || !buffer) {
+ free(raw); free(buffer);
ret = -ENOMEM;
goto out;
}
- raw = newraw;
- raw_size = alloc_rawsize;
- }
- if (compressed) {
- if (map.m_llen > buffer_size) {
- char *newbuffer;
-
- buffer_size = map.m_llen;
- newbuffer = realloc(buffer, buffer_size);
- if (!newbuffer) {
- ret = -ENOMEM;
- goto out;
- }
- buffer = newbuffer;
- }
- ret = z_erofs_read_one_data(inode, &map, raw, buffer,
- 0, map.m_llen, false);
- if (ret)
+ ret = z_erofs_read_one_data(inode, &map, raw, buffer, 0, map.m_llen, false, map.m_la, &ctx);
+ if (ret) {
+ /* DO NOT free(raw) or free(buffer) here. z_erofs_read_one_data took ownership! */
goto out;
-
- if (outfd >= 0 && write(outfd, buffer, map.m_llen) < 0)
- goto fail_eio;
+ }
} else {
- u64 p = 0;
-
- do {
- u64 count = min_t(u64, alloc_rawsize,
- map.m_llen);
-
- ret = erofs_read_one_data(inode, &map, raw, p, count);
- if (ret)
- goto out;
-
- if (outfd >= 0 && write(outfd, raw, count) < 0)
- goto fail_eio;
- map.m_llen -= count;
- p += count;
- } while (map.m_llen);
+ char *raw = calloc(1, map.m_llen);
+ ret = erofs_read_one_data(inode, &map, raw, 0, map.m_llen);
+ if (ret >= 0 && outfd >= 0)
+ pwrite(outfd, raw, map.m_llen, map.m_la);
+ free(raw);
+ if (ret) goto out;
}
}
+ z_erofs_read_ctx_enqueue(&ctx);
+
+out:
+ erofs_mutex_lock(&ctx.lock);
+ while (ctx.pending_tasks > 0)
+ erofs_cond_wait(&ctx.cond, &ctx.lock);
+ if (ctx.final_err < 0 && ret >= 0)
+ ret = ctx.final_err;
+ erofs_mutex_unlock(&ctx.lock);
if (fsckcfg.print_comp_ratio) {
if (!erofs_is_packed_inode(inode))
fsckcfg.logical_blocks += BLK_ROUND_UP(inode->sbi, inode->i_size);
fsckcfg.physical_blocks += BLK_ROUND_UP(inode->sbi, pchunk_len);
}
-out:
- if (raw)
- free(raw);
- if (buffer)
- free(buffer);
- return ret < 0 ? ret : 0;
-fail_eio:
- erofs_err("I/O error occurred when verifying data chunk @ nid %llu",
- inode->nid | 0ULL);
- ret = -EIO;
- goto out;
+ if (outfd >= 0 && ret >= 0)
+ ftruncate(outfd, inode->i_size);
+
+ erofs_mutex_destroy(&ctx.lock);
+ erofs_cond_destroy(&ctx.cond);
+ return ret < 0 ? ret : 0;
}
static inline int erofs_extract_dir(struct erofs_inode *inode)
@@ -1043,10 +1007,21 @@ int erofsfsck_fuzz_one(int argc, char *argv[])
int main(int argc, char *argv[])
#endif
{
+
int err;
+#ifdef EROFS_MT_ENABLED
+ int workers;
+#endif
erofs_init_configure();
+#ifdef EROFS_MT_ENABLED
+ workers = erofs_get_available_processors();
+ if (workers < 1)
+ workers = 1;
+ erofs_alloc_workqueue(&erofs_wq, workers, 256, NULL, NULL);
+#endif
+
fsckcfg.physical_blocks = 0;
fsckcfg.logical_blocks = 0;
fsckcfg.extract_path = NULL;
@@ -1181,6 +1156,9 @@ exit_dev_close:
exit:
erofs_blob_closeall(&g_sbi);
erofs_exit_configure();
+#ifdef EROFS_MT_ENABLED
+ erofs_destroy_workqueue(&erofs_wq);
+#endif
return err ? 1 : 0;
}
diff --git a/include/erofs/cond.h b/include/erofs/cond.h
new file mode 100644
index 0000000..90ec838
--- /dev/null
+++ b/include/erofs/cond.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+#ifndef __EROFS_COND_H
+#define __EROFS_COND_H
+
+#include "lock.h"
+
+#if defined(HAVE_PTHREAD_H) && defined(EROFS_MT_ENABLED)
+#include <pthread.h>
+
+typedef pthread_cond_t erofs_cond_t;
+
+static inline void erofs_cond_init(erofs_cond_t *cond)
+{
+ pthread_cond_init(cond, NULL);
+}
+#define erofs_cond_wait pthread_cond_wait
+#define erofs_cond_signal pthread_cond_signal
+#define erofs_cond_broadcast pthread_cond_broadcast
+#define erofs_cond_destroy pthread_cond_destroy
+
+#else
+typedef struct {} erofs_cond_t;
+
+static inline void erofs_cond_init(erofs_cond_t *cond) {}
+static inline int erofs_cond_wait(erofs_cond_t *cond, erofs_mutex_t *mutex) { return 0; }
+static inline int erofs_cond_signal(erofs_cond_t *cond) { return 0; }
+static inline int erofs_cond_broadcast(erofs_cond_t *cond) { return 0; }
+static inline int erofs_cond_destroy(erofs_cond_t *cond) { return 0; }
+#endif
+
+#endif
\ No newline at end of file
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 671880f..94f14da 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -25,6 +25,8 @@ typedef unsigned short umode_t;
#ifdef HAVE_PTHREAD_H
#include <pthread.h>
#endif
+#include <erofs/lock.h>
+#include "erofs/cond.h"
#include <stdlib.h>
#include <string.h>
#include "atomic.h"
@@ -62,6 +64,7 @@ struct erofs_buf {
#define erofs_pos(sbi, nr) ((erofs_off_t)(nr) << (sbi)->blkszbits)
#define BLK_ROUND_UP(sbi, addr) \
(roundup(addr, erofs_blksiz(sbi)) >> (sbi)->blkszbits)
+#define Z_EROFS_PCLUSTER_MAX_BATCH_SIZE 32
struct erofs_buffer_head;
struct erofs_bufmgr;
@@ -442,6 +445,20 @@ struct z_erofs_paramset {
char *extraopts;
};
+struct z_erofs_decompress_task;
+
+struct z_erofs_read_ctx {
+ erofs_mutex_t lock;
+ erofs_cond_t cond;
+ int pending_tasks;
+ int final_err;
+ int outfd;
+ bool free_out;
+ struct z_erofs_decompress_task *current_task;
+};
+
+void z_erofs_read_ctx_enqueue(struct z_erofs_read_ctx *ctx);
+
int liberofs_global_init(void);
void liberofs_global_exit(void);
@@ -478,7 +495,8 @@ int erofs_read_one_data(struct erofs_inode *inode, struct erofs_map_blocks *map,
char *buffer, u64 offset, size_t len);
int z_erofs_read_one_data(struct erofs_inode *inode,
struct erofs_map_blocks *map, char *raw, char *buffer,
- erofs_off_t skip, erofs_off_t length, bool trimmed);
+ erofs_off_t skip, erofs_off_t length, bool trimmed,
+ erofs_off_t out_offset, struct z_erofs_read_ctx *ctx);
void *erofs_read_metadata(struct erofs_sb_info *sbi, erofs_nid_t nid,
erofs_off_t *offset, int *lengthp);
int z_erofs_parse_cfgs(struct erofs_sb_info *sbi, struct erofs_super_block *dsb);
diff --git a/include/erofs/lock.h b/include/erofs/lock.h
index c6e3093..2e79d52 100644
--- a/include/erofs/lock.h
+++ b/include/erofs/lock.h
@@ -15,6 +15,7 @@ static inline void erofs_mutex_init(erofs_mutex_t *lock)
}
#define erofs_mutex_lock pthread_mutex_lock
#define erofs_mutex_unlock pthread_mutex_unlock
+#define erofs_mutex_destroy pthread_mutex_destroy
#define EROFS_DEFINE_MUTEX(lock) \
erofs_mutex_t lock = PTHREAD_MUTEX_INITIALIZER
@@ -29,12 +30,14 @@ static inline void erofs_init_rwsem(erofs_rwsem_t *lock)
#define erofs_down_write pthread_rwlock_wrlock
#define erofs_up_read pthread_rwlock_unlock
#define erofs_up_write pthread_rwlock_unlock
+
#else
typedef struct {} erofs_mutex_t;
static inline void erofs_mutex_init(erofs_mutex_t *lock) {}
static inline void erofs_mutex_lock(erofs_mutex_t *lock) {}
static inline void erofs_mutex_unlock(erofs_mutex_t *lock) {}
+static inline void erofs_mutex_destroy(erofs_mutex_t *lock) {}
#define EROFS_DEFINE_MUTEX(lock) \
erofs_mutex_t lock = {}
diff --git a/lib/data.c b/lib/data.c
index 6fd1389..e9d2218 100644
--- a/lib/data.c
+++ b/lib/data.c
@@ -9,6 +9,73 @@
#include "erofs/trace.h"
#include "erofs/decompress.h"
#include "liberofs_fragments.h"
+#include "erofs/workqueue.h"
+#include "erofs/lock.h"
+
+struct erofs_workqueue erofs_wq;
+
+struct z_erofs_decompress_item {
+ struct z_erofs_decompress_req req;
+ char *raw_buf;
+ char *out_buf;
+ erofs_off_t out_offset;
+ unsigned int out_length;
+};
+
+struct z_erofs_decompress_task {
+ struct erofs_work work;
+ struct z_erofs_read_ctx *ctx;
+ struct z_erofs_decompress_item items[Z_EROFS_PCLUSTER_MAX_BATCH_SIZE];
+ unsigned int nr_reqs;
+};
+
+static void z_erofs_decompress_worker(struct erofs_work *work, void *tlsp)
+{
+ struct z_erofs_decompress_task *task = (struct z_erofs_decompress_task *)work;
+ struct z_erofs_read_ctx *ctx = task->ctx;
+ int i, ret = 0, first_err = 0;
+
+ for (i = 0; i < task->nr_reqs; ++i) {
+ struct z_erofs_decompress_item *item = &task->items[i];
+ ret = z_erofs_decompress(&item->req);
+
+ if (ret >= 0 && ctx && ctx->outfd >= 0) {
+ if (pwrite(ctx->outfd, item->out_buf,
+ item->out_length, item->out_offset) < 0)
+ ret = -errno;
+ }
+
+ if (ret < 0 && first_err == 0)
+ first_err = ret;
+
+ free(item->raw_buf);
+ if (ctx && ctx->free_out)
+ free(item->out_buf);
+ }
+
+ if (ctx) {
+ erofs_mutex_lock(&ctx->lock);
+ if (first_err < 0 && ctx->final_err == 0)
+ ctx->final_err = first_err;
+ ctx->pending_tasks--;
+ if (ctx->pending_tasks == 0)
+ erofs_cond_signal(&ctx->cond);
+ erofs_mutex_unlock(&ctx->lock);
+ }
+ free(task);
+}
+
+void z_erofs_read_ctx_enqueue(struct z_erofs_read_ctx *ctx)
+{
+ if (ctx && ctx->current_task) {
+#ifdef EROFS_MT_ENABLED
+ erofs_queue_work(&erofs_wq, &ctx->current_task->work);
+#else
+ z_erofs_decompress_worker(&ctx->current_task->work, NULL);
+#endif
+ ctx->current_task = NULL;
+ }
+}
void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
@@ -277,7 +344,8 @@ static int erofs_read_raw_data(struct erofs_inode *inode, char *buffer,
int z_erofs_read_one_data(struct erofs_inode *inode,
struct erofs_map_blocks *map, char *raw, char *buffer,
- erofs_off_t skip, erofs_off_t length, bool trimmed)
+ erofs_off_t skip, erofs_off_t length, bool trimmed,
+ erofs_off_t out_offset, struct z_erofs_read_ctx *ctx)
{
struct erofs_sb_info *sbi = inode->sbi;
struct erofs_map_dev mdev;
@@ -285,77 +353,107 @@ int z_erofs_read_one_data(struct erofs_inode *inode,
if (map->m_flags & __EROFS_MAP_FRAGMENT) {
if (__erofs_unlikely(inode->nid == sbi->packed_nid)) {
- erofs_err("fragment should not exist in the packed inode %llu",
- sbi->packed_nid | 0ULL);
- return -EFSCORRUPTED;
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ ret = erofs_packedfile_read(sbi, buffer, length - skip,
+ inode->fragmentoff + skip);
+
+ if (ret >= 0 && ctx && ctx->outfd >= 0) {
+ if (pwrite(ctx->outfd, buffer, length - skip, out_offset) < 0)
+ ret = -errno;
}
- return erofs_packedfile_read(sbi, buffer, length - skip,
- inode->fragmentoff + skip);
+ goto err_out;
}
- /* no device id here, thus it will always succeed */
- mdev = (struct erofs_map_dev) {
- .m_pa = map->m_pa,
- };
+ mdev = (struct erofs_map_dev) { .m_pa = map->m_pa };
ret = erofs_map_dev(sbi, &mdev);
- if (ret) {
- DBG_BUGON(1);
- return ret;
- }
+ if (ret)
+ goto err_out;
ret = erofs_dev_read(sbi, mdev.m_deviceid, raw, mdev.m_pa, map->m_plen);
if (ret < 0)
- return ret;
+ goto err_out;
+ struct z_erofs_decompress_task *task = ctx->current_task;
+ if (!task) {
+ task = calloc(1, sizeof(*task));
+ task->ctx = ctx;
+ task->work.fn = z_erofs_decompress_worker;
+ ctx->current_task = task;
+
+ erofs_mutex_lock(&ctx->lock);
+ ctx->pending_tasks++;
+ erofs_mutex_unlock(&ctx->lock);
+ }
- ret = z_erofs_decompress(&(struct z_erofs_decompress_req) {
- .sbi = sbi,
- .in = raw,
- .out = buffer,
- .decodedskip = skip,
- .interlaced_offset =
- map->m_algorithmformat == Z_EROFS_COMPRESSION_INTERLACED ?
- erofs_blkoff(sbi, map->m_la) : 0,
- .inputsize = map->m_plen,
- .decodedlength = length,
- .alg = map->m_algorithmformat,
- .partial_decoding = trimmed ? true :
- !(map->m_flags & EROFS_MAP_FULL_MAPPED) ||
- (map->m_flags & EROFS_MAP_PARTIAL_REF),
- });
- if (ret < 0)
- return ret;
+ int idx = task->nr_reqs++;
+ struct z_erofs_decompress_item *item = &task->items[idx];
+
+ item->req = (struct z_erofs_decompress_req) {
+ .sbi = sbi,
+ .in = raw,
+ .out = buffer,
+ .decodedskip = skip,
+ .interlaced_offset =
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_INTERLACED ?
+ erofs_blkoff(sbi, map->m_la) : 0,
+ .inputsize = map->m_plen,
+ .decodedlength = length,
+ .alg = map->m_algorithmformat,
+ .partial_decoding = trimmed ? true :
+ !(map->m_flags & EROFS_MAP_FULL_MAPPED) ||
+ (map->m_flags & EROFS_MAP_PARTIAL_REF),
+ };
+ item->raw_buf = raw;
+ item->out_buf = buffer;
+ item->out_offset = out_offset;
+ item->out_length = length;
+
+ int batch_limit = (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZ4) ?
+ Z_EROFS_PCLUSTER_MAX_BATCH_SIZE : 8;
+
+ if (task->nr_reqs >= batch_limit) {
+ z_erofs_read_ctx_enqueue(ctx);
+ }
return 0;
+
+err_out:
+ if (ctx && ctx->free_out) free(buffer);
+ free(raw);
+ return ret;
}
static int z_erofs_read_data(struct erofs_inode *inode, char *buffer,
- erofs_off_t size, erofs_off_t offset)
+ erofs_off_t size, erofs_off_t offset)
{
erofs_off_t end, length, skip;
struct erofs_map_blocks map = {
.buf = __EROFS_BUF_INITIALIZER,
};
bool trimmed;
- unsigned int bufsize = 0;
- char *raw = NULL;
int ret = 0;
+ struct z_erofs_read_ctx ctx = {
+ .pending_tasks = 0,
+ .final_err = 0,
+ .outfd = -1,
+ .free_out = false,
+ .current_task = NULL
+ };
+ erofs_mutex_init(&ctx.lock);
+ erofs_cond_init(&ctx.cond);
+
end = offset + size;
while (end > offset) {
map.m_la = end - 1;
ret = z_erofs_map_blocks_iter(inode, &map, 0);
- if (ret)
- break;
+ if (ret) break;
- /*
- * trim to the needed size if the returned extent is quite
- * larger than requested, and set up partial flag as well.
- */
if (end < map.m_la + map.m_llen) {
length = end - map.m_la;
trimmed = true;
} else {
- DBG_BUGON(end != map.m_la + map.m_llen);
length = map.m_llen;
trimmed = false;
}
@@ -374,25 +472,31 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer,
continue;
}
- if (map.m_plen > bufsize) {
- char *newraw;
-
- bufsize = map.m_plen;
- newraw = realloc(raw, bufsize);
- if (!newraw) {
- ret = -ENOMEM;
- break;
- }
- raw = newraw;
+ char *raw = malloc(map.m_plen);
+ if (!raw) {
+ ret = -ENOMEM;
+ break;
}
ret = z_erofs_read_one_data(inode, &map, raw,
- buffer + end - offset, skip, length, trimmed);
- if (ret < 0)
+ buffer + end - offset, skip, length, trimmed, 0, &ctx);
+ if (ret < 0) {
break;
+ }
}
- if (raw)
- free(raw);
+ z_erofs_read_ctx_enqueue(&ctx);
+
+ erofs_mutex_lock(&ctx.lock);
+ while (ctx.pending_tasks > 0)
+ erofs_cond_wait(&ctx.cond, &ctx.lock);
+
+ if (ctx.final_err < 0 && ret == 0)
+ ret = ctx.final_err;
+
+ erofs_mutex_unlock(&ctx.lock);
+ erofs_mutex_destroy(&ctx.lock);
+ erofs_cond_destroy(&ctx.cond);
+
return ret < 0 ? ret : 0;
}
--
2.52.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 2/2] fsck.erofs: implement concurrent directory traversal
2026-06-21 12:01 [PATCH 0/2] fsck.erofs: implement multi-threaded extraction Nithurshen
2026-06-21 12:01 ` [PATCH 1/2] fsck.erofs: add multi-threaded decompression Nithurshen
@ 2026-06-21 12:01 ` Nithurshen
1 sibling, 0 replies; 3+ messages in thread
From: Nithurshen @ 2026-06-21 12:01 UTC (permalink / raw)
To: linux-erofs; +Cc: hsiangkao, xiang, Nithurshen
Currently, fsck.erofs traverses the filesystem tree and verifies
inodes synchronously on the main thread. While decompression
compute is offloaded, the main thread remains a bottleneck
during the I/O-heavy directory walk.
This patch parallelizes the directory traversal and inode
extraction processes. To achieve this safely, the globally shared
fsckcfg.extract_path and fsckcfg.dirstack states are decoupled
and localized into individual struct erofsfsck_inode_task
payloads, which are deep-copied and handed off to the worker
pool. Global statistics and hardlink tables are secured using
native erofs_mutex_t primitives.
To prevent thread pool exhaustion deadlocks—where workers
processing a deep directory tree occupy all available execution
slots and block on erofs_cond_wait, starving their own spawned
decompression tasks—this patch introduces a dedicated
erofs_traverse_wq. By isolating the producers (traversal and
verification) from the consumers (pcluster decompression), the
pipeline avoids gridlock.
Signed-off-by: Nithurshen <nithurshen.dev@gmail.com>
---
fsck/main.c | 300 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 211 insertions(+), 89 deletions(-)
diff --git a/fsck/main.c b/fsck/main.c
index ffe7e29..0ed0ddd 100644
--- a/fsck/main.c
+++ b/fsck/main.c
@@ -19,14 +19,45 @@
#include "../lib/liberofs_compress.h"
extern struct erofs_workqueue erofs_wq;
+struct erofs_workqueue erofs_traverse_wq;
-static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid);
+static EROFS_DEFINE_MUTEX(fsck_global_lock);
+static EROFS_DEFINE_MUTEX(fsck_hardlink_lock);
+
+static int fsck_pending_tasks = 0;
+static EROFS_DEFINE_MUTEX(fsck_task_lock);
+static erofs_cond_t fsck_task_cond;
+
+static void fsck_inc_task(void) {
+ erofs_mutex_lock(&fsck_task_lock);
+ fsck_pending_tasks++;
+ erofs_mutex_unlock(&fsck_task_lock);
+}
+
+static void fsck_dec_task(void) {
+ erofs_mutex_lock(&fsck_task_lock);
+ fsck_pending_tasks--;
+ if (fsck_pending_tasks == 0)
+ erofs_cond_signal(&fsck_task_cond);
+ erofs_mutex_unlock(&fsck_task_lock);
+}
+
+static void fsck_wait_tasks(void) {
+ erofs_mutex_lock(&fsck_task_lock);
+ while (fsck_pending_tasks > 0)
+ erofs_cond_wait(&fsck_task_cond, &fsck_task_lock);
+ erofs_mutex_unlock(&fsck_task_lock);
+}
struct erofsfsck_dirstack {
erofs_nid_t dirs[PATH_MAX];
int top;
};
+static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid,
+ char *current_path, size_t pos,
+ struct erofsfsck_dirstack *dirstack);
+
struct erofsfsck_cfg {
struct erofsfsck_dirstack dirstack;
u64 physical_blocks;
@@ -425,7 +456,7 @@ out:
return ret;
}
-static int erofsfsck_dump_xattrs(struct erofs_inode *inode)
+static int erofsfsck_dump_xattrs(struct erofs_inode *inode, const char *current_path)
{
static bool ignore_xattrs = false;
char *keylst, *key;
@@ -472,7 +503,7 @@ static int erofsfsck_dump_xattrs(struct erofs_inode *inode)
break;
}
if (fsckcfg.extract_path)
- ret = erofs_sys_lsetxattr(fsckcfg.extract_path, key,
+ ret = erofs_sys_lsetxattr(current_path, key,
value, size);
else
ret = 0;
@@ -587,9 +618,11 @@ out:
erofs_mutex_unlock(&ctx.lock);
if (fsckcfg.print_comp_ratio) {
+ erofs_mutex_lock(&fsck_global_lock);
if (!erofs_is_packed_inode(inode))
fsckcfg.logical_blocks += BLK_ROUND_UP(inode->sbi, inode->i_size);
fsckcfg.physical_blocks += BLK_ROUND_UP(inode->sbi, pchunk_len);
+ erofs_mutex_unlock(&fsck_global_lock);
}
if (outfd >= 0 && ret >= 0)
@@ -600,11 +633,11 @@ out:
return ret < 0 ? ret : 0;
}
-static inline int erofs_extract_dir(struct erofs_inode *inode)
+static inline int erofs_extract_dir(struct erofs_inode *inode, const char *current_path)
{
int ret;
- erofs_dbg("create directory %s", fsckcfg.extract_path);
+ erofs_dbg("create directory %s", current_path);
/* verify data chunk layout */
ret = erofs_verify_inode_data(inode, -1);
@@ -617,19 +650,19 @@ static inline int erofs_extract_dir(struct erofs_inode *inode)
* write/execute permission. These are fixed up later in
* erofsfsck_set_attributes().
*/
- if (mkdir(fsckcfg.extract_path, 0700) < 0) {
+ if (mkdir(current_path, 0700) < 0) {
struct stat st;
if (errno != EEXIST) {
erofs_err("failed to create directory: %s (%s)",
- fsckcfg.extract_path, strerror(errno));
+ current_path, strerror(errno));
return -errno;
}
- if (lstat(fsckcfg.extract_path, &st) ||
+ if (lstat(current_path, &st) ||
!S_ISDIR(st.st_mode)) {
erofs_err("path is not a directory: %s",
- fsckcfg.extract_path);
+ current_path);
return -ENOTDIR;
}
@@ -637,9 +670,9 @@ static inline int erofs_extract_dir(struct erofs_inode *inode)
* Try to change permissions of existing directory so
* that we can write to it
*/
- if (chmod(fsckcfg.extract_path, 0700) < 0) {
+ if (chmod(current_path, 0700) < 0) {
erofs_err("failed to set permissions: %s (%s)",
- fsckcfg.extract_path, strerror(errno));
+ current_path, strerror(errno));
return -errno;
}
}
@@ -651,11 +684,17 @@ static char *erofsfsck_hardlink_find(erofs_nid_t nid)
struct list_head *head =
&erofsfsck_link_hashtable[nid % NR_HARDLINK_HASHTABLE];
struct erofsfsck_hardlink_entry *entry;
+ char *path = NULL;
- list_for_each_entry(entry, head, list)
- if (entry->nid == nid)
+ erofs_mutex_lock(&fsck_hardlink_lock);
+ list_for_each_entry(entry, head, list){
+ if (entry->nid == nid){
return entry->path;
- return NULL;
+ break;
+ }
+ }
+ erofs_mutex_unlock(&fsck_hardlink_lock);
+ return path;
}
static int erofsfsck_hardlink_insert(erofs_nid_t nid, const char *path)
@@ -673,8 +712,10 @@ static int erofsfsck_hardlink_insert(erofs_nid_t nid, const char *path)
return -ENOMEM;
}
+ erofs_mutex_lock(&fsck_hardlink_lock);
list_add_tail(&entry->list,
&erofsfsck_link_hashtable[nid % NR_HARDLINK_HASHTABLE]);
+ erofs_mutex_unlock(&fsck_hardlink_lock);
return 0;
}
@@ -703,37 +744,37 @@ static void erofsfsck_hardlink_exit(void)
}
}
-static inline int erofs_extract_file(struct erofs_inode *inode)
+static inline int erofs_extract_file(struct erofs_inode *inode, const char *current_path)
{
bool tryagain = true;
int ret, fd;
- erofs_dbg("extract file to path: %s", fsckcfg.extract_path);
+ erofs_dbg("extract file to path: %s", current_path);
again:
- fd = open(fsckcfg.extract_path,
+ fd = open(current_path,
O_WRONLY | O_CREAT | O_NOFOLLOW |
(fsckcfg.overwrite ? O_TRUNC : O_EXCL), 0700);
if (fd < 0) {
if (fsckcfg.overwrite && tryagain) {
if (errno == EISDIR) {
erofs_warn("try to forcely remove directory %s",
- fsckcfg.extract_path);
- if (rmdir(fsckcfg.extract_path) < 0) {
+ current_path);
+ if (rmdir(current_path) < 0) {
erofs_err("failed to remove: %s (%s)",
- fsckcfg.extract_path, strerror(errno));
+ current_path, strerror(errno));
return -EISDIR;
}
} else if (errno == EACCES &&
- chmod(fsckcfg.extract_path, 0700) < 0) {
+ chmod(current_path, 0700) < 0) {
erofs_err("failed to set permissions: %s (%s)",
- fsckcfg.extract_path, strerror(errno));
+ current_path, strerror(errno));
return -errno;
}
tryagain = false;
goto again;
}
- erofs_err("failed to open: %s (%s)", fsckcfg.extract_path,
+ erofs_err("failed to open: %s (%s)", current_path,
strerror(errno));
return -errno;
}
@@ -744,14 +785,14 @@ again:
return ret;
}
-static inline int erofs_extract_symlink(struct erofs_inode *inode)
+static inline int erofs_extract_symlink(struct erofs_inode *inode, const char *current_path)
{
struct erofs_vfile vf;
bool tryagain = true;
int ret;
char *buf = NULL;
- erofs_dbg("extract symlink to path: %s", fsckcfg.extract_path);
+ erofs_dbg("extract symlink to path: %s", current_path);
/* verify data chunk layout */
ret = erofs_verify_inode_data(inode, -1);
@@ -777,13 +818,13 @@ static inline int erofs_extract_symlink(struct erofs_inode *inode)
buf[inode->i_size] = '\0';
again:
- if (symlink(buf, fsckcfg.extract_path) < 0) {
+ if (symlink(buf, current_path) < 0) {
if (errno == EEXIST && fsckcfg.overwrite && tryagain) {
erofs_warn("try to forcely remove file %s",
- fsckcfg.extract_path);
- if (unlink(fsckcfg.extract_path) < 0) {
+ current_path);
+ if (unlink(current_path) < 0) {
erofs_err("failed to remove: %s",
- fsckcfg.extract_path);
+ current_path);
ret = -errno;
goto out;
}
@@ -791,7 +832,7 @@ again:
goto again;
}
erofs_err("failed to create symlink: %s",
- fsckcfg.extract_path);
+ current_path);
ret = -errno;
}
out:
@@ -800,12 +841,12 @@ out:
return ret;
}
-static int erofs_extract_special(struct erofs_inode *inode)
+static int erofs_extract_special(struct erofs_inode *inode, const char *current_path)
{
bool tryagain = true;
int ret;
- erofs_dbg("extract special to path: %s", fsckcfg.extract_path);
+ erofs_dbg("extract special to path: %s", current_path);
/* verify data chunk layout */
ret = erofs_verify_inode_data(inode, -1);
@@ -813,13 +854,13 @@ static int erofs_extract_special(struct erofs_inode *inode)
return ret;
again:
- if (mknod(fsckcfg.extract_path, inode->i_mode, inode->u.i_rdev) < 0) {
+ if (mknod(current_path, inode->i_mode, inode->u.i_rdev) < 0) {
if (errno == EEXIST && fsckcfg.overwrite && tryagain) {
erofs_warn("try to forcely remove file %s",
- fsckcfg.extract_path);
- if (unlink(fsckcfg.extract_path) < 0) {
+ current_path);
+ if (unlink(current_path) < 0) {
erofs_err("failed to remove: %s",
- fsckcfg.extract_path);
+ current_path);
return -errno;
}
tryagain = false;
@@ -827,11 +868,11 @@ again:
}
if (errno == EEXIST || fsckcfg.superuser) {
erofs_err("failed to create special file: %s",
- fsckcfg.extract_path);
+ current_path);
ret = -errno;
} else {
erofs_warn("failed to create special file: %s, skipped",
- fsckcfg.extract_path);
+ current_path);
ret = -ECANCELED;
}
}
@@ -854,47 +895,82 @@ static int erofsfsck_get_parent_cb(struct erofs_dir_context *ctx)
return 0;
}
+struct erofsfsck_inode_task {
+ struct erofs_work work;
+ erofs_nid_t pnid;
+ erofs_nid_t nid;
+ char *path;
+ size_t path_pos;
+ struct erofsfsck_dirstack dirstack;
+};
+
+static void erofsfsck_check_inode_worker(struct erofs_work *work, void *tlsp);
+
+struct erofsfsck_dir_iter_ctx {
+ struct erofs_dir_context ctx;
+ char *base_path;
+ size_t base_pos;
+ struct erofsfsck_dirstack *dirstack;
+};
+
static int erofsfsck_dirent_iter(struct erofs_dir_context *ctx)
{
- int ret;
- size_t prev_pos, curr_pos;
+ struct erofsfsck_dir_iter_ctx *ictx = (void *)ctx;
+ size_t curr_pos = ictx->base_pos;
+ char *new_path = NULL;
if (ctx->dot_dotdot)
return 0;
- prev_pos = fsckcfg.extract_pos;
- curr_pos = prev_pos;
-
- if (prev_pos + ctx->de_namelen >= PATH_MAX) {
- erofs_err("unable to fsck since the path is too long (%llu)",
- (curr_pos + ctx->de_namelen) | 0ULL);
- return -EOPNOTSUPP;
- }
+ if (curr_pos + ctx->de_namelen >= PATH_MAX) {
+ erofs_err("unable to fsck since the path is too long");
+ return -EOPNOTSUPP;
+ }
if (fsckcfg.extract_path) {
- fsckcfg.extract_path[curr_pos++] = '/';
- strncpy(fsckcfg.extract_path + curr_pos, ctx->dname,
- ctx->de_namelen);
+ new_path = malloc(PATH_MAX);
+ if (!new_path) return -ENOMEM;
+
+ if (ictx->base_path)
+ memcpy(new_path, ictx->base_path, curr_pos);
+
+ new_path[curr_pos++] = '/';
+ strncpy(new_path + curr_pos, ctx->dname, ctx->de_namelen);
curr_pos += ctx->de_namelen;
- fsckcfg.extract_path[curr_pos] = '\0';
+ new_path[curr_pos] = '\0';
} else {
curr_pos += ctx->de_namelen;
}
- fsckcfg.extract_pos = curr_pos;
- ret = erofsfsck_check_inode(ctx->dir->nid, ctx->de_nid);
- if (fsckcfg.extract_path)
- fsckcfg.extract_path[prev_pos] = '\0';
- fsckcfg.extract_pos = prev_pos;
- return ret;
+ struct erofsfsck_inode_task *task = malloc(sizeof(*task));
+ if (!task) {
+ free(new_path);
+ return -ENOMEM;
+ }
+
+ task->work.fn = erofsfsck_check_inode_worker;
+ task->work.next = NULL;
+ task->pnid = ctx->dir->nid;
+ task->nid = ctx->de_nid;
+ task->path = new_path;
+ task->path_pos = curr_pos;
+ task->dirstack = *(ictx->dirstack);
+
+ fsck_inc_task();
+#ifdef EROFS_MT_ENABLED
+ erofs_queue_work(&erofs_traverse_wq, &task->work);
+#else
+ task->work.fn(&task->work, NULL);
+#endif
+ return 0;
}
-static int erofsfsck_extract_inode(struct erofs_inode *inode)
+static int erofsfsck_extract_inode(struct erofs_inode *inode, const char *current_path)
{
int ret;
char *oldpath;
- if (!fsckcfg.extract_path || erofs_is_packed_inode(inode)) {
+ if (!current_path || erofs_is_packed_inode(inode)) {
verify:
/* verify data chunk layout */
return erofs_verify_inode_data(inode, -1);
@@ -902,9 +978,9 @@ verify:
oldpath = erofsfsck_hardlink_find(inode->nid);
if (oldpath) {
- if (link(oldpath, fsckcfg.extract_path) == -1) {
+ if (link(oldpath, current_path) == -1) {
erofs_err("failed to extract hard link: %s (%s)",
- fsckcfg.extract_path, strerror(errno));
+ current_path, strerror(errno));
return -errno;
}
return 0;
@@ -912,19 +988,19 @@ verify:
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
- ret = erofs_extract_dir(inode);
+ ret = erofs_extract_dir(inode, current_path);
break;
case S_IFREG:
- ret = erofs_extract_file(inode);
+ ret = erofs_extract_file(inode, current_path);
break;
case S_IFLNK:
- ret = erofs_extract_symlink(inode);
+ ret = erofs_extract_symlink(inode, current_path);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- ret = erofs_extract_special(inode);
+ ret = erofs_extract_special(inode, current_path);
break;
default:
/* TODO */
@@ -936,11 +1012,13 @@ verify:
/* record nid and old path for hardlink */
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
ret = erofsfsck_hardlink_insert(inode->nid,
- fsckcfg.extract_path);
+ current_path);
return ret;
}
-static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid)
+static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid,
+ char *current_path, size_t pos,
+ struct erofsfsck_dirstack *dirstack)
{
int ret, i;
struct erofs_inode inode = {.sbi = &g_sbi, .nid = nid};
@@ -961,44 +1039,63 @@ static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid)
goto out;
}
- ret = erofsfsck_extract_inode(&inode);
+ ret = erofsfsck_extract_inode(&inode, current_path);
if (ret && ret != -ECANCELED)
goto out;
if (fsckcfg.check_decomp && fsckcfg.dump_xattrs) {
- ret = erofsfsck_dump_xattrs(&inode);
+ ret = erofsfsck_dump_xattrs(&inode, current_path);
if (ret)
return ret;
}
if (S_ISDIR(inode.i_mode)) {
- struct erofs_dir_context ctx = {
- .flags = EROFS_READDIR_VALID_PNID,
- .pnid = pnid,
- .dir = &inode,
- .cb = erofsfsck_dirent_iter,
+ struct erofsfsck_dir_iter_ctx ctx = {
+ .ctx.flags = EROFS_READDIR_VALID_PNID,
+ .ctx.pnid = pnid,
+ .ctx.dir = &inode,
+ .ctx.cb = erofsfsck_dirent_iter,
+ .base_path = current_path,
+ .base_pos = pos,
+ .dirstack = dirstack
};
- /* XXX: support the deeper cases later */
- if (fsckcfg.dirstack.top >= ARRAY_SIZE(fsckcfg.dirstack.dirs))
+ if (dirstack->top >= ARRAY_SIZE(dirstack->dirs))
return -ENAMETOOLONG;
- for (i = 0; i < fsckcfg.dirstack.top; ++i)
- if (inode.nid == fsckcfg.dirstack.dirs[i])
+ for (i = 0; i < dirstack->top; ++i)
+ if (inode.nid == dirstack->dirs[i])
return -ELOOP;
- fsckcfg.dirstack.dirs[fsckcfg.dirstack.top++] = pnid;
- ret = erofs_iterate_dir(&ctx, true);
- --fsckcfg.dirstack.top;
+
+ dirstack->dirs[dirstack->top++] = pnid;
+ ret = erofs_iterate_dir(&ctx.ctx, true);
+ --dirstack->top;
}
- if (!ret && !erofs_is_packed_inode(&inode))
- erofsfsck_set_attributes(&inode, fsckcfg.extract_path);
+ if (!ret && !erofs_is_packed_inode(&inode) && current_path)
+ erofsfsck_set_attributes(&inode, current_path);
if (ret == -ECANCELED)
ret = 0;
out:
- if (ret && ret != -EIO)
- fsckcfg.corrupted = true;
- return ret;
+ if (ret && ret != -EIO) {
+ erofs_mutex_lock(&fsck_global_lock);
+ fsckcfg.corrupted = true;
+ erofs_mutex_unlock(&fsck_global_lock);
+ }
+ return ret;
+}
+
+/* Workqueue callback: run the check for one queued inode, then free its task. */
+static void erofsfsck_check_inode_worker(struct erofs_work *work, void *tlsp)
+{
+ struct erofsfsck_inode_task *task = (struct erofsfsck_inode_task *)work;
+
+ /* NOTE(review): the return value is dropped here; confirm -EIO is still reported */
+ erofsfsck_check_inode(task->pnid, task->nid, task->path,
+ task->path_pos, &task->dirstack);
+ free(task->path); /* free(NULL) is a no-op, so no guard is needed */
+ free(task);
+ fsck_dec_task();
 }
#ifdef FUZZING
@@ -1020,6 +1117,7 @@ int main(int argc, char *argv[])
if (workers < 1)
workers = 1;
erofs_alloc_workqueue(&erofs_wq, workers, 256, NULL, NULL);
+ erofs_alloc_workqueue(&erofs_traverse_wq, workers, 256, NULL, NULL);
#endif
fsckcfg.physical_blocks = 0;
@@ -1086,6 +1184,8 @@ int main(int argc, char *argv[])
fsckcfg.nid = g_sbi.root_nid;
}
+ erofs_cond_init(&fsck_task_cond);
+
if (!fsckcfg.inode_path && fsckcfg.nid == g_sbi.root_nid) {
if (erofs_sb_has_fragments(&g_sbi) && g_sbi.packed_nid > 0) {
err = erofs_packedfile_init(&g_sbi, false);
@@ -1095,7 +1195,7 @@ int main(int argc, char *argv[])
goto exit_hardlink;
}
- err = erofsfsck_check_inode(g_sbi.packed_nid, g_sbi.packed_nid);
+ err = erofsfsck_check_inode(g_sbi.packed_nid, g_sbi.packed_nid, NULL, 0, &fsckcfg.dirstack);
if (err) {
erofs_err("failed to verify packed file");
goto exit_packedinode;
@@ -1120,7 +1220,37 @@ int main(int argc, char *argv[])
 pnid = ctx.pnid;
 }
 }
- err = erofsfsck_check_inode(pnid, fsckcfg.nid);
+
+ /* Allocate everything before fsck_inc_task() so a failure leaves no counted task. */
+ struct erofsfsck_inode_task *root_task = calloc(1, sizeof(*root_task));
+ if (!root_task) {
+ err = -ENOMEM;
+ goto exit_packedinode;
+ }
+ root_task->work.fn = erofsfsck_check_inode_worker;
+ root_task->pnid = pnid;
+ root_task->nid = fsckcfg.nid;
+
+ if (fsckcfg.extract_path) {
+ root_task->path = strdup(fsckcfg.extract_path);
+ if (!root_task->path) {
+ free(root_task);
+ err = -ENOMEM;
+ goto exit_packedinode;
+ }
+ }
+
+ root_task->path_pos = fsckcfg.extract_pos;
+ root_task->dirstack = fsckcfg.dirstack;
+
+ fsck_inc_task();
+#ifdef EROFS_MT_ENABLED
+ erofs_queue_work(&erofs_traverse_wq, &root_task->work);
+#else
+ root_task->work.fn(&root_task->work, NULL);
+#endif
+
+ fsck_wait_tasks();
}
if (fsckcfg.corrupted) {
@@ -1144,6 +1263,8 @@ int main(int argc, char *argv[])
}
}
+ erofs_cond_destroy(&fsck_task_cond);
+
exit_packedinode:
erofs_packedfile_exit(&g_sbi);
exit_hardlink:
@@ -1158,6 +1279,7 @@ exit:
erofs_exit_configure();
#ifdef EROFS_MT_ENABLED
erofs_destroy_workqueue(&erofs_wq);
+ erofs_destroy_workqueue(&erofs_traverse_wq);
#endif
return err ? 1 : 0;
}
--
2.52.0
^ permalink raw reply related [flat|nested] 3+ messages in thread