* [RFC PATCH 1/4] ext4: introduce DAX fast commit ByteLog backend
2026-02-26 10:17 [RFC PATCH 0/4] ext4: Byte-granular ByteLog optimizes DAX fast commits Li Chen
@ 2026-02-26 10:17 ` Li Chen
2026-02-26 10:17 ` [RFC PATCH 2/4] ext4: add dax_fc_bytelog mount option Li Chen
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Li Chen @ 2026-02-26 10:17 UTC (permalink / raw)
To: linux-ext4, Theodore Ts'o, Andreas Dilger, linux-kernel
Cc: Harshad Shirwadkar, Li Chen
Add a ByteLog backend that can append fast commit records directly into a
DAX-mapped fast commit area, avoiding buffer-head-based writes.
The backend provides a simple record format with CRC32C and helpers for
batching and persisting records.
Signed-off-by: Li Chen <me@linux.beauty>
---
MAINTAINERS | 1 +
fs/ext4/Makefile | 2 +-
fs/ext4/ext4.h | 9 +-
fs/ext4/fast_commit_bytelog.c | 780 ++++++++++++++++++++++++++++++++++
fs/ext4/fast_commit_bytelog.h | 147 +++++++
5 files changed, 937 insertions(+), 2 deletions(-)
create mode 100644 fs/ext4/fast_commit_bytelog.c
create mode 100644 fs/ext4/fast_commit_bytelog.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 71f76fddebbf..5a26b99aac63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9627,6 +9627,7 @@ Q: http://patchwork.ozlabs.org/project/linux-ext4/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git
F: Documentation/filesystems/ext4/
F: fs/ext4/
+F: fs/ext4/fast_commit_bytelog*
F: include/trace/events/ext4.h
F: include/uapi/linux/ext4.h
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 72206a292676..3df51f100536 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
- xattr_user.o fast_commit.o orphan.o
+ xattr_user.o fast_commit.o fast_commit_bytelog.o orphan.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 293f698b7042..1b0746bf4869 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -999,6 +999,7 @@ do { \
#include "extents_status.h"
#include "fast_commit.h"
+#include "fast_commit_bytelog.h"
/*
* Lock subclasses for i_data_sem in the ext4_inode_info structure.
@@ -1282,6 +1283,8 @@ struct ext4_inode_info {
* scanning in mballoc
*/
#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */
+#define EXT4_MOUNT2_DAX_FC_BYTELOG 0x00000200 /* Use DAX ByteLog FC backend */
+#define EXT4_MOUNT2_DAX_FC_BYTELOG_FORCE 0x00000400 /* Ignore feature bit */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1797,6 +1800,7 @@ struct ext4_sb_info {
int s_fc_debug_max_replay;
#endif
struct ext4_fc_replay_state s_fc_replay_state;
+ struct ext4_fc_bytelog s_fc_bytelog;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2125,6 +2129,7 @@ static inline bool ext4_inode_orphan_tracked(struct inode *inode)
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000
+#define EXT4_FEATURE_INCOMPAT_DAX_FC_BYTELOG 0x40000
extern void ext4_update_dynamic_rev(struct super_block *sb);
@@ -2224,6 +2229,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
+EXT4_FEATURE_INCOMPAT_FUNCS(dax_fc_bytelog, DAX_FC_BYTELOG)
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -2254,7 +2260,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
EXT4_FEATURE_INCOMPAT_CASEFOLD | \
EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
- EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+ EXT4_FEATURE_INCOMPAT_DAX_FC_BYTELOG)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
diff --git a/fs/ext4/fast_commit_bytelog.c b/fs/ext4/fast_commit_bytelog.c
new file mode 100644
index 000000000000..64ba3edddbcb
--- /dev/null
+++ b/fs/ext4/fast_commit_bytelog.c
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "ext4.h"
+#include "fast_commit_bytelog.h"
+
+#include <linux/crc32c.h>
+#include <linux/dax.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/libnvdimm.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <asm/barrier.h>
+
+#define EXT4_FC_BYTELOG_META_BLOCKS 1
+
+static void ext4_fc_bytelog_reset_batch(struct ext4_fc_bytelog *log);
+static int ext4_fc_bytelog_flush_batch(struct super_block *sb, u32 tid);
+
+#define EXT4_FC_CRC32C_POLY 0x82f63b78
+#define EXT4_FC_CRC32C_SHIFT_BITS (sizeof(size_t) * 8)
+
+static u32 ext4_fc_crc32c_shift_mats[EXT4_FC_CRC32C_SHIFT_BITS][32];
+static bool ext4_fc_crc32c_shift_mats_ready;
+
+/*
+ * Multiply a 32x32 matrix over GF(2) (one u32 per matrix row) by a
+ * 32-bit vector: XOR together the rows selected by the set bits of
+ * @vec, starting from the least significant bit.
+ */
+static u32 ext4_fc_gf2_matrix_times(const u32 *mat, u32 vec)
+{
+ u32 sum = 0;
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ if (vec & 1)
+ sum ^= mat[i];
+ vec >>= 1;
+ }
+
+ return sum;
+}
+
+/*
+ * Square a GF(2) matrix operator: @square = @mat * @mat. Squaring the
+ * "advance CRC by N zero bits" operator yields the operator for 2N bits.
+ */
+static void ext4_fc_gf2_matrix_square(u32 *square, const u32 *mat)
+{
+ int i;
+
+ for (i = 0; i < 32; i++)
+ square[i] = ext4_fc_gf2_matrix_times(mat, mat[i]);
+}
+
+/*
+ * Lazily build the global table of CRC32C zero-shift operators.
+ * ext4_fc_crc32c_shift_mats[i] advances a CRC past 2^i zero *bytes*.
+ * Safe against concurrent callers: the ready flag is double-checked
+ * under a local mutex and published with WRITE_ONCE().
+ */
+static void ext4_fc_crc32c_shift_mats_init_once(void)
+{
+ static DEFINE_MUTEX(lock);
+ u32 even[32], odd[32], one_byte[32];
+ u32 row = 1;
+ int i;
+
+ if (READ_ONCE(ext4_fc_crc32c_shift_mats_ready))
+ return;
+
+ mutex_lock(&lock);
+ if (ext4_fc_crc32c_shift_mats_ready)
+ goto out;
+
+ /*
+ * Build the GF(2) operator for shifting the CRC by a single zero
+ * *bit* (reflected polynomial in row 0, bit shifts in the other
+ * rows), then square it three times to obtain the one-zero-byte
+ * operator. Repeated squaring of that yields the power-of-two-byte
+ * operators cached in the global table.
+ */
+ odd[0] = EXT4_FC_CRC32C_POLY;
+ for (i = 1; i < 32; i++) {
+ odd[i] = row;
+ row <<= 1;
+ }
+ ext4_fc_gf2_matrix_square(even, odd); /* 2 zero bits */
+ ext4_fc_gf2_matrix_square(odd, even); /* 4 zero bits */
+ ext4_fc_gf2_matrix_square(one_byte, odd); /* 8 zero bits */
+
+ memcpy(ext4_fc_crc32c_shift_mats[0], one_byte, sizeof(one_byte));
+ for (i = 1; i < EXT4_FC_CRC32C_SHIFT_BITS; i++)
+ ext4_fc_gf2_matrix_square(ext4_fc_crc32c_shift_mats[i],
+ ext4_fc_crc32c_shift_mats[i - 1]);
+
+ WRITE_ONCE(ext4_fc_crc32c_shift_mats_ready, true);
+out:
+ mutex_unlock(&lock);
+}
+
+/*
+ * Advance @crc as if @len zero bytes had been fed through CRC32C, by
+ * applying the precomputed 2^bit-byte operators for each set bit of
+ * @len. Callers must ensure ext4_fc_crc32c_shift_mats_init_once() has
+ * completed before calling this.
+ */
+static u32 ext4_fc_crc32c_shift_zeros(u32 crc, size_t len)
+{
+ size_t shift = len;
+ int bit = 0;
+
+ while (shift) {
+ if (shift & 1)
+ crc = ext4_fc_gf2_matrix_times(ext4_fc_crc32c_shift_mats[bit], crc);
+ shift >>= 1;
+ bit++;
+ }
+
+ return crc;
+}
+
+/* Raw CRC32C with an all-ones seed and no final inversion. */
+u32 ext4_fc_bytelog_crc32(const void *buf, size_t len)
+{
+ return crc32c(~0, buf, len);
+}
+
+/* Has the ByteLog ring been successfully DAX-mapped for this sb? */
+bool ext4_fc_bytelog_mapped(struct ext4_sb_info *sbi)
+{
+ return READ_ONCE(sbi->s_fc_bytelog.mapped);
+}
+
+/*
+ * True when the ring is both mapped and enabled via mount options.
+ * NOTE(review): unlike ext4_fc_bytelog_mapped() this reads the flags
+ * without READ_ONCE(); confirm all callers are serialized against init.
+ */
+bool ext4_fc_bytelog_active(struct ext4_sb_info *sbi)
+{
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+
+ return log->mapped && log->enabled;
+}
+
+/* Total on-media record size: header + payload, rounded up to 64 bytes. */
+size_t ext4_fc_bytelog_record_size(size_t payload_len)
+{
+ size_t len = sizeof(struct ext4_fc_bytelog_hdr) + payload_len;
+
+ return ALIGN(len, EXT4_FC_BYTELOG_ALIGN);
+}
+
+/*
+ * Fill @hdr for a new record. The COMMITTED flag is always stripped
+ * here; it is set separately once the record is complete. Both CRC
+ * fields are left zero (via the memset) until
+ * ext4_fc_bytelog_finalize_hdr_crc() runs.
+ */
+void ext4_fc_bytelog_prep_hdr(struct ext4_fc_bytelog_hdr *hdr, u16 tag,
+ u16 flags, u32 tid, u64 seq, u32 payload_len)
+{
+ memset(hdr, 0, sizeof(*hdr));
+
+ hdr->magic = cpu_to_le32(EXT4_FC_BYTELOG_MAGIC);
+ hdr->version = cpu_to_le16(EXT4_FC_BYTELOG_VERSION);
+ hdr->hdr_len = cpu_to_le16(sizeof(*hdr));
+ hdr->tid = cpu_to_le32(tid);
+ hdr->tag = cpu_to_le16(tag);
+ hdr->flags = cpu_to_le16(flags & ~EXT4_FC_BYTELOG_COMMITTED);
+ hdr->payload_len = cpu_to_le32(payload_len);
+ hdr->record_len = cpu_to_le32(ext4_fc_bytelog_record_size(payload_len));
+ hdr->seq = cpu_to_le64(seq);
+}
+
+/*
+ * Record @payload_crc in the header, then compute the header CRC over a
+ * copy of the header with the header_crc field itself zeroed.
+ */
+void ext4_fc_bytelog_finalize_hdr_crc(struct ext4_fc_bytelog_hdr *hdr,
+ u32 payload_crc)
+{
+ struct ext4_fc_bytelog_hdr tmp;
+ u32 crc;
+
+ hdr->payload_crc = cpu_to_le32(payload_crc);
+ hdr->header_crc = 0;
+
+ tmp = *hdr;
+ tmp.header_crc = 0;
+ crc = ext4_fc_bytelog_crc32(&tmp, sizeof(tmp));
+ hdr->header_crc = cpu_to_le32(crc);
+}
+
+/*
+ * Structural sanity checks on a record header: magic, version, header
+ * length, and that the 64-byte-aligned record length both fits within
+ * the @remaining bytes of the ring and can hold the advertised payload.
+ * CRCs are NOT checked here — see ext4_fc_bytelog_validate_hdr().
+ */
+static bool ext4_fc_bytelog_record_sane(const struct ext4_fc_bytelog_hdr *hdr,
+ size_t remaining)
+{
+ u32 record_len = le32_to_cpu(hdr->record_len);
+ u32 payload_len = le32_to_cpu(hdr->payload_len);
+ u16 hdr_len = le16_to_cpu(hdr->hdr_len);
+
+ if (le32_to_cpu(hdr->magic) != EXT4_FC_BYTELOG_MAGIC)
+ return false;
+ if (le16_to_cpu(hdr->version) != EXT4_FC_BYTELOG_VERSION)
+ return false;
+ if (hdr_len != sizeof(*hdr))
+ return false;
+ if (!record_len || record_len > remaining)
+ return false;
+ if (!IS_ALIGNED(record_len, EXT4_FC_BYTELOG_ALIGN))
+ return false;
+ if (record_len < hdr_len)
+ return false;
+ if (payload_len > record_len - hdr_len)
+ return false;
+
+ return true;
+}
+
+/*
+ * Fully validate a record: structural sanity, header CRC, and — when the
+ * header advertises a payload — the payload CRC over @payload.
+ *
+ * Returns 0 on success, -EINVAL for a malformed header or a missing
+ * payload pointer, -EFSBADCRC on either CRC mismatch.
+ */
+int ext4_fc_bytelog_validate_hdr(const struct ext4_fc_bytelog_hdr *hdr,
+ size_t remaining, const void *payload)
+{
+ struct ext4_fc_bytelog_hdr tmp;
+ u32 payload_len = le32_to_cpu(hdr->payload_len);
+ u32 crc;
+
+ if (!ext4_fc_bytelog_record_sane(hdr, remaining))
+ return -EINVAL;
+
+ tmp = *hdr;
+ tmp.header_crc = 0;
+ crc = ext4_fc_bytelog_crc32(&tmp, sizeof(tmp));
+ if (crc != le32_to_cpu(hdr->header_crc))
+ return -EFSBADCRC;
+
+ if (!payload_len)
+ return 0;
+ if (!payload)
+ return -EINVAL;
+
+ crc = ext4_fc_bytelog_crc32(payload, payload_len);
+ if (crc != le32_to_cpu(hdr->payload_crc))
+ return -EFSBADCRC;
+
+ return 0;
+}
+
+/*
+ * Set the COMMITTED flag on a header in place and recompute its CRC so
+ * the record still validates afterwards.
+ */
+void ext4_fc_bytelog_mark_committed(struct ext4_fc_bytelog_hdr *hdr)
+{
+ u16 flags = le16_to_cpu(hdr->flags);
+ struct ext4_fc_bytelog_hdr tmp;
+ u32 crc;
+
+ flags |= EXT4_FC_BYTELOG_COMMITTED;
+ hdr->flags = cpu_to_le16(flags);
+
+ tmp = *hdr;
+ tmp.header_crc = 0;
+ crc = ext4_fc_bytelog_crc32(&tmp, sizeof(tmp));
+ hdr->header_crc = cpu_to_le32(crc);
+}
+
+void ext4_fc_bytelog_flush_persist(void *addr, size_t len)
+{
+ u8 *p = addr;
+ size_t off = 0;
+
+ if (!len)
+ return;
+
+ /*
+ * Large flushes can be very bursty. Chunk the flush so other tasks
+ * can make progress between chunks.
+ */
+ if (len <= 65536) {
+ arch_wb_cache_pmem(p, len);
+ return;
+ }
+
+ while (off < len) {
+ size_t n = min(len - off, (size_t)65536);
+
+ arch_wb_cache_pmem(p + off, n);
+ off += n;
+ cond_resched();
+ }
+}
+
+/* Fence ordering prior pmem stores into the persistence domain. */
+void ext4_fc_bytelog_persist_barrier(void)
+{
+ pmem_wmb();
+}
+
+/*
+ * Locate the journal's fast-commit area and obtain a direct (DAX) kernel
+ * mapping for it. The first EXT4_FC_BYTELOG_META_BLOCKS block(s) are
+ * reserved for metadata and the final fast-commit block is set aside as
+ * the anchor block. On success the mapping and geometry are recorded in
+ * @log; on failure @log is left untouched.
+ */
+static int ext4_fc_bytelog_map_ring(struct super_block *sb,
+ journal_t *journal,
+ struct ext4_fc_bytelog *log)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned long long first, anchor;
+ unsigned long fc_blocks;
+ unsigned long ring_blocks;
+ unsigned long i;
+ u64 start_bytes, ring_bytes, start_offset;
+ pgoff_t start_pgoff;
+ unsigned long ring_pages;
+ void *addr = NULL;
+ int ret;
+ int blkbits = sb->s_blocksize_bits;
+
+ if (!journal->j_inode)
+ return -EOPNOTSUPP;
+
+ if (journal->j_fc_last <= journal->j_fc_first + 1)
+ return -ENOSPC;
+
+ fc_blocks = journal->j_fc_last - journal->j_fc_first;
+ ring_blocks = fc_blocks - 1;
+ if (ring_blocks <= EXT4_FC_BYTELOG_META_BLOCKS)
+ return -ENOSPC;
+
+ ret = jbd2_journal_bmap(journal, journal->j_fc_first, &first);
+ if (ret)
+ return ret;
+
+ /*
+ * dax_direct_access() below treats [first, first + ring_blocks) as
+ * one physically contiguous range, but the journal inode layout
+ * gives no such guarantee. Verify every fast-commit block really
+ * follows the previous one before trusting the flat mapping;
+ * otherwise we would read and write the wrong device bytes.
+ */
+ for (i = 1; i < ring_blocks; i++) {
+ unsigned long long pblk;
+
+ ret = jbd2_journal_bmap(journal, journal->j_fc_first + i,
+ &pblk);
+ if (ret)
+ return ret;
+ if (pblk != first + i)
+ return -EOPNOTSUPP;
+ }
+
+ ret = jbd2_journal_bmap(journal, journal->j_fc_last - 1, &anchor);
+ if (ret)
+ return ret;
+
+ start_bytes = (u64)first << blkbits;
+ ring_bytes = (u64)ring_blocks << blkbits;
+ if (!ring_bytes)
+ return -ENOSPC;
+ if (ring_bytes & (PAGE_SIZE - 1))
+ return -EOPNOTSUPP;
+ if (start_bytes > U64_MAX - sbi->s_dax_part_off)
+ return -ERANGE;
+
+ start_offset = start_bytes + sbi->s_dax_part_off;
+ if (!IS_ALIGNED(start_offset, PAGE_SIZE))
+ return -EINVAL;
+
+ start_pgoff = start_offset >> PAGE_SHIFT;
+ ring_pages = ring_bytes >> PAGE_SHIFT;
+ if (!ring_pages || ring_pages > LONG_MAX)
+ return -E2BIG;
+
+#if IS_ENABLED(CONFIG_DAX)
+ {
+ long mapped;
+ int dax_id = dax_read_lock();
+
+ mapped = dax_direct_access(sbi->s_daxdev, start_pgoff,
+ ring_pages, DAX_ACCESS, &addr,
+ NULL);
+ dax_read_unlock(dax_id);
+ if (mapped < 0)
+ return mapped;
+ if (mapped < ring_pages)
+ return -ENXIO;
+ }
+#else
+ return -EOPNOTSUPP;
+#endif
+
+ log->kaddr = addr;
+ log->size_bytes = ring_bytes;
+ log->base_off = (u64)EXT4_FC_BYTELOG_META_BLOCKS << blkbits;
+ log->persist_off = log->base_off;
+ log->blocks = ring_blocks;
+ log->blocksize = sb->s_blocksize;
+ log->start_pblk = first;
+ log->anchor_pblk = anchor;
+
+ return 0;
+}
+
+/*
+ * Per-mount ByteLog setup. Decides, from the incompat feature bit and
+ * the dax_fc_bytelog mount options, whether to map the fast-commit area
+ * via DAX and enable the ByteLog backend.
+ *
+ * Returns 0 both on success and when ByteLog is simply left disabled;
+ * returns a negative error for inconsistent configurations (feature bit
+ * without journal fast commits) or when an explicitly requested setup
+ * fails. May be called more than once (pre- and post-journal-load);
+ * a previously mapped ring is kept and only re-enabled.
+ */
+int ext4_fc_bytelog_init(struct super_block *sb, journal_t *journal)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+ bool have_feature = ext4_has_feature_dax_fc_bytelog(sb);
+ bool requested = test_opt2(sb, DAX_FC_BYTELOG);
+ bool force = test_opt2(sb, DAX_FC_BYTELOG_FORCE);
+ bool need_map = have_feature || requested || force;
+ u32 batch_max;
+ int ret;
+
+ if (!need_map) {
+ log->enabled = false;
+ log->last_error = -EOPNOTSUPP;
+ return 0;
+ }
+
+ ext4_fc_crc32c_shift_mats_init_once();
+
+ if (log->mapped)
+ goto enable;
+
+ /* Preserve a user-tuned batch_max across the reset of the state. */
+ batch_max = log->batch_max;
+ memset(log, 0, sizeof(*log));
+ log->batch_max = batch_max ? batch_max :
+ EXT4_FC_BYTELOG_BATCH_MAX_DEFAULT;
+ log->last_error = -EOPNOTSUPP;
+
+ if (!journal || !test_opt2(sb, JOURNAL_FAST_COMMIT)) {
+ if (requested)
+ ext4_msg(sb, KERN_INFO,
+ "dax_fc_bytelog requires fast commits enabled");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * ext4_fc_bytelog_init() is called once before jbd2_journal_load() so
+ * that existing ByteLog records can be replayed. On a fresh
+ * filesystem, the JBD2 fast-commit feature may not be enabled on the
+ * journal yet, so there is no fast-commit area to map at this stage.
+ *
+ * If the on-disk feature bit is set, lack of journal fast-commit
+ * support indicates an inconsistent filesystem and must be fatal.
+ * Otherwise, defer mapping until the post-journal-load init path.
+ */
+ if (!jbd2_has_feature_fast_commit(journal)) {
+ if (have_feature) {
+ ext4_msg(sb, KERN_ERR,
+ "dax_fc_bytelog requires JBD2 fast commits enabled");
+ return -EINVAL;
+ }
+
+ log->enabled = false;
+ log->last_error = -EOPNOTSUPP;
+ return 0;
+ }
+
+ /*
+ * When dax_fc_bytelog=on is specified without the incompat feature
+ * bit, refuse to enable ByteLog. dax_fc_bytelog=force overrides this
+ * check and is intended only for testing.
+ */
+ if (!have_feature && requested && !force) {
+ ext4_msg(sb, KERN_INFO,
+ "dax_fc_bytelog=on requires INCOMPAT_DAX_FC_BYTELOG");
+ return -EOPNOTSUPP;
+ }
+ if (!have_feature && force)
+ ext4_warning(sb,
+ "forcing dax_fc_bytelog without INCOMPAT_DAX_FC_BYTELOG; older kernels cannot safely mount this filesystem");
+
+ if (test_opt2(sb, DAX_NEVER)) {
+ ext4_msg(sb, KERN_INFO,
+ "dax_fc_bytelog requires DAX, but dax=never is set");
+ return -EOPNOTSUPP;
+ }
+ if (!sbi->s_daxdev) {
+ ext4_msg(sb, KERN_INFO,
+ "dax_fc_bytelog requires a dax-capable filesystem device");
+ return -EOPNOTSUPP;
+ }
+ /* The ring mapping relies on block-to-page equivalence. */
+ if (sb->s_blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_INFO,
+ "dax_fc_bytelog requires blocksize == PAGE_SIZE");
+ return -EOPNOTSUPP;
+ }
+
+ ret = ext4_fc_bytelog_map_ring(sb, journal, log);
+ if (ret) {
+ log->last_error = ret;
+ ext4_msg(sb, KERN_INFO,
+ "dax_fc_bytelog disabled: unable to map fast-commit ring (err=%d)",
+ ret);
+ ext4_debug("ByteLog mapping unavailable (err=%d)\n", ret);
+ return ret;
+ }
+
+ /* Fresh, empty ring: cursors at base, seed CRC, nothing dirty. */
+ log->head = log->base_off;
+ log->tail = log->base_off;
+ log->seq = 0;
+ log->ring_crc = ~0;
+ log->dirty = false;
+ log->persist_off = log->base_off;
+ ext4_fc_bytelog_reset_batch(log);
+ log->mapped = true;
+ log->last_error = 0;
+enable:
+ log->enabled = requested || force;
+ return 0;
+}
+
+/*
+ * Drop all ByteLog state at unmount time.
+ * NOTE(review): assumes the address from dax_direct_access() requires no
+ * explicit teardown, so zeroing the structure is sufficient — confirm.
+ */
+void ext4_fc_bytelog_release(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ memset(&sbi->s_fc_bytelog, 0, sizeof(sbi->s_fc_bytelog));
+}
+
+/*
+ * Rewind the ring to its empty state. Acts only on a mapped log and
+ * only when @full is set; a non-full reset is currently a no-op.
+ */
+void ext4_fc_bytelog_reset(struct super_block *sb, bool full)
+{
+ struct ext4_fc_bytelog *log = &EXT4_SB(sb)->s_fc_bytelog;
+
+ if (!log->mapped)
+ return;
+ if (!full)
+ return;
+
+ log->head = log->base_off;
+ log->tail = log->base_off;
+ log->seq = 0;
+ log->ring_crc = ~0;
+ log->dirty = false;
+ log->persist_off = log->base_off;
+ ext4_fc_bytelog_reset_batch(log);
+}
+
+/*
+ * Start a new fast commit. The ring is rewound to its base offset, so
+ * each fast commit rewrites the area from scratch rather than appending
+ * across commits.
+ */
+void ext4_fc_bytelog_begin_commit(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+
+ if (!log->mapped || !log->enabled)
+ return;
+
+ log->head = log->base_off;
+ log->tail = log->base_off;
+ log->seq = 0;
+ log->ring_crc = ~0;
+ log->dirty = false;
+ log->persist_off = log->base_off;
+ ext4_fc_bytelog_reset_batch(log);
+}
+
+/*
+ * Finish a fast commit: seal any pending batch into the ring, then write
+ * back and fence the newly appended byte range so it reaches the
+ * persistence domain.
+ *
+ * NOTE(review): the tid is read from j_running_transaction without
+ * j_state_lock; confirm the fast-commit path guarantees it is non-NULL
+ * and stable here — at commit time the transaction of interest may
+ * already have moved to j_committing_transaction.
+ */
+int ext4_fc_bytelog_end_commit(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+ journal_t *journal = sbi->s_journal;
+ u8 *base;
+ u64 cursor, end;
+ u32 tid;
+ int ret;
+
+ if (!log->mapped || !log->enabled)
+ return 0;
+
+ if (!journal || !journal->j_running_transaction)
+ return -EINVAL;
+ tid = journal->j_running_transaction->t_tid;
+
+ ret = ext4_fc_bytelog_flush_batch(sb, tid);
+ if (ret) {
+ log->last_error = ret;
+ return ret;
+ }
+
+ if (!log->dirty)
+ return 0;
+
+ base = log->kaddr;
+ if (!base)
+ return -EOPNOTSUPP;
+
+ /* Only [persist_off, head) was appended since the last flush. */
+ cursor = log->persist_off;
+ end = log->head;
+ if (end <= cursor)
+ return 0;
+
+ ext4_fc_bytelog_flush_persist(base + cursor, end - cursor);
+ ext4_fc_bytelog_persist_barrier();
+
+ log->persist_off = end;
+ log->dirty = false;
+ return 0;
+}
+
+/* Would a record of @len bytes still fit between head and ring end? */
+static inline bool ext4_fc_bytelog_has_space(struct ext4_fc_bytelog *log,
+ size_t len)
+{
+ if (log->head < log->base_off)
+ return false;
+ /* Reject lengths that could never fit, guarding the sum below. */
+ if (len > log->size_bytes - log->base_off)
+ return false;
+ return log->head + len <= log->size_bytes;
+}
+
+/* Forget any accumulated batch and re-seed the batch payload CRC. */
+static void ext4_fc_bytelog_reset_batch(struct ext4_fc_bytelog *log)
+{
+ log->batch_first_tag = 0;
+ log->batch_len = 0;
+ log->batch_tlvs = 0;
+ log->batch_payload_crc = ~0U;
+}
+
+/*
+ * Seal one record whose payload bytes (if any) have already been copied
+ * into the ring at head + header size: build the header, fold the
+ * payload CRC into the running ring CRC, zero the alignment padding,
+ * and finally store the header with the COMMITTED flag set.
+ *
+ * The ring CRC is advanced either with the precomputed zero-shift
+ * operators (CRC-combine, avoiding a re-read of the payload from pmem)
+ * or, if the operator table is not yet built, by re-reading the payload
+ * already resident in the ring.
+ */
+static int ext4_fc_bytelog_commit_record(struct super_block *sb, u32 tid, u16 tag,
+ size_t payload_len, u32 payload_crc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+ struct ext4_fc_bytelog_hdr hdr;
+ size_t total_len, off;
+ u32 ring_crc;
+ u8 *dst;
+ u8 *payload;
+ u64 seq;
+ bool mats_ready;
+
+ total_len = ext4_fc_bytelog_record_size(payload_len);
+ if (!ext4_fc_bytelog_has_space(log, total_len))
+ return -ENOSPC;
+
+ seq = log->seq;
+ ring_crc = log->ring_crc;
+
+ mats_ready = READ_ONCE(ext4_fc_crc32c_shift_mats_ready);
+ ext4_fc_bytelog_prep_hdr(&hdr, tag, 0, tid, seq, payload_len);
+ dst = (u8 *)log->kaddr + log->head;
+ off = sizeof(hdr);
+ payload = dst + off;
+
+ if (payload_len) {
+ if (likely(mats_ready)) {
+ /*
+ * CRC combine: crc(A||B) = shift(crc(A) ^ seed, |B|)
+ * ^ crc(B), with ~0 seeds and no final inversion.
+ */
+ ring_crc = ext4_fc_crc32c_shift_zeros(ring_crc ^ ~0U, payload_len);
+ ring_crc ^= payload_crc;
+ } else {
+ ring_crc = crc32c(ring_crc, payload, payload_len);
+ }
+ off += payload_len;
+ } else {
+ payload_crc = ext4_fc_bytelog_crc32(NULL, 0);
+ }
+
+ /* Zero the tail padding up to the 64-byte record boundary. */
+ if (off < total_len) {
+ size_t pad = total_len - off;
+
+ memset(dst + off, 0, pad);
+ }
+
+ hdr.flags = cpu_to_le16(le16_to_cpu(hdr.flags) | EXT4_FC_BYTELOG_COMMITTED);
+ ext4_fc_bytelog_finalize_hdr_crc(&hdr, payload_crc);
+ memcpy(dst, &hdr, sizeof(hdr));
+
+ log->head += total_len;
+ log->seq++;
+ log->dirty = true;
+ log->ring_crc = ring_crc;
+
+ return 0;
+}
+
+/*
+ * Copy @nvec scatter vectors into @dst, coalescing vectors that are
+ * contiguous in memory into a single memcpy/crc pass. When @crc is
+ * non-NULL the copied bytes are folded into it. Returns the number of
+ * bytes copied.
+ */
+static size_t ext4_fc_bytelog_copy_vecs(u8 *dst,
+ struct ext4_fc_bytelog_vec *vecs,
+ int nvec, u32 *crc)
+{
+ size_t off = 0;
+ u32 crc_val = crc ? *crc : 0;
+ int i;
+
+ for (i = 0; i < nvec; i++) {
+ const u8 *src = vecs[i].base;
+ size_t len = vecs[i].len;
+
+ if (!len)
+ continue;
+
+ /* Merge directly adjacent source vectors. */
+ while (i + 1 < nvec && vecs[i + 1].len &&
+ vecs[i + 1].base == src + len) {
+ len += vecs[i + 1].len;
+ i++;
+ }
+
+ if (crc)
+ crc_val = crc32c(crc_val, src, len);
+ memcpy(dst + off, src, len);
+ off += len;
+ }
+
+ if (crc)
+ *crc = crc_val;
+ return off;
+}
+
+/*
+ * Write one record directly, bypassing batching: copy the payload into
+ * the ring right after where the header will be placed, then seal the
+ * record via ext4_fc_bytelog_commit_record().
+ */
+static int ext4_fc_bytelog_append_vec_direct(struct super_block *sb, u32 tid, u16 tag,
+ struct ext4_fc_bytelog_vec *vecs,
+ int nvec, size_t payload_len)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+ size_t total_len;
+ u32 payload_crc = ~0U;
+ u8 *dst;
+
+ total_len = ext4_fc_bytelog_record_size(payload_len);
+ if (!ext4_fc_bytelog_has_space(log, total_len))
+ return -ENOSPC;
+
+ dst = (u8 *)log->kaddr + log->head + sizeof(struct ext4_fc_bytelog_hdr);
+ ext4_fc_bytelog_copy_vecs(dst, vecs, nvec, &payload_crc);
+ return ext4_fc_bytelog_commit_record(sb, tid, tag, payload_len,
+ payload_crc);
+}
+
+/*
+ * Seal the currently accumulated batch (if any) as one record. A batch
+ * holding more than one TLV is tagged EXT4_FC_BYTELOG_TAG_BATCH;
+ * otherwise it keeps the tag of its single TLV. The batch state is
+ * reset afterwards even on error.
+ */
+static int ext4_fc_bytelog_flush_batch(struct super_block *sb, u32 tid)
+{
+ struct ext4_fc_bytelog *log = &EXT4_SB(sb)->s_fc_bytelog;
+ u32 payload_crc = ~0U;
+ u16 tag;
+ int ret;
+
+ if (!log->batch_len)
+ return 0;
+
+ tag = log->batch_first_tag;
+ if (log->batch_tlvs > 1)
+ tag = EXT4_FC_BYTELOG_TAG_BATCH;
+
+ if (!log->kaddr)
+ return -EOPNOTSUPP;
+
+ payload_crc = log->batch_payload_crc;
+ ret = ext4_fc_bytelog_commit_record(sb, tid, tag, log->batch_len,
+ payload_crc);
+ ext4_fc_bytelog_reset_batch(log);
+ return ret;
+}
+
+/*
+ * Append one fast-commit TLV, given as a scatter list, to the ByteLog.
+ *
+ * Payloads up to batch_max bytes are accumulated into an open batch
+ * record directly in the ring; an oversized payload, or batching
+ * disabled (batch_max == 0), flushes any pending batch and writes the
+ * TLV as an individual record. Returns -ENOSPC when the ring is full,
+ * -EOPNOTSUPP when ByteLog is inactive or unmapped.
+ */
+int ext4_fc_bytelog_append_vec(struct super_block *sb, u16 tag,
+ struct ext4_fc_bytelog_vec *vecs, int nvec)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+ struct journal_s *journal = sbi->s_journal;
+ size_t payload_len = 0;
+ u32 batch_max = log->batch_max;
+ u32 tid;
+ int i;
+ u8 *base;
+ u8 *dst;
+
+ if (!ext4_fc_bytelog_active(sbi))
+ return -EOPNOTSUPP;
+
+ if (!journal || !journal->j_running_transaction)
+ return -EINVAL;
+ tid = journal->j_running_transaction->t_tid;
+
+ for (i = 0; i < nvec; i++)
+ payload_len += vecs[i].len;
+
+ base = log->kaddr;
+ if (!base)
+ return -EOPNOTSUPP;
+
+ /* Batching disabled: emit directly after draining any open batch. */
+ if (!batch_max) {
+ int ret;
+
+ ret = ext4_fc_bytelog_flush_batch(sb, tid);
+ if (ret)
+ return ret;
+ return ext4_fc_bytelog_append_vec_direct(sb, tid, tag, vecs,
+ nvec, payload_len);
+ }
+
+ /* Payload too large for any batch: write it as its own record. */
+ if (payload_len > batch_max) {
+ int ret;
+
+ ret = ext4_fc_bytelog_flush_batch(sb, tid);
+ if (ret)
+ return ret;
+ return ext4_fc_bytelog_append_vec_direct(sb, tid, tag, vecs,
+ nvec, payload_len);
+ }
+
+ /* Batch would overflow batch_max: seal it and start a new one. */
+ if (log->batch_len && log->batch_len + payload_len > batch_max) {
+ int ret;
+
+ ret = ext4_fc_bytelog_flush_batch(sb, tid);
+ if (ret)
+ return ret;
+ }
+
+ if (!log->batch_len)
+ log->batch_first_tag = tag;
+
+ /* No ring space for the grown batch: flush and retry once. */
+ if (!ext4_fc_bytelog_has_space(log,
+ ext4_fc_bytelog_record_size(log->batch_len +
+ payload_len))) {
+ int ret;
+
+ ret = ext4_fc_bytelog_flush_batch(sb, tid);
+ if (ret)
+ return ret;
+ log->batch_first_tag = tag;
+ }
+
+ if (!ext4_fc_bytelog_has_space(log,
+ ext4_fc_bytelog_record_size(log->batch_len +
+ payload_len)))
+ return -ENOSPC;
+
+ /* Append the TLV bytes after the (future) header + current batch. */
+ dst = base + log->head + sizeof(struct ext4_fc_bytelog_hdr) +
+ log->batch_len;
+ log->batch_len += ext4_fc_bytelog_copy_vecs(dst, vecs, nvec, &log->batch_payload_crc);
+ log->batch_tlvs++;
+ log->dirty = true;
+ return 0;
+}
+
+/* Snapshot the current ring cursors, sequence and CRC into an anchor
+ * for transaction @tid. */
+void ext4_fc_bytelog_build_anchor(struct super_block *sb,
+ struct ext4_fc_bytelog_anchor *anchor,
+ u32 tid)
+{
+ struct ext4_fc_bytelog *log = &EXT4_SB(sb)->s_fc_bytelog;
+
+ memset(anchor, 0, sizeof(*anchor));
+ anchor->tid = tid;
+ anchor->head = log->head;
+ anchor->tail = log->tail;
+ anchor->seq = log->seq;
+ anchor->crc = log->ring_crc;
+}
diff --git a/fs/ext4/fast_commit_bytelog.h b/fs/ext4/fast_commit_bytelog.h
new file mode 100644
index 000000000000..d52754890222
--- /dev/null
+++ b/fs/ext4/fast_commit_bytelog.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _EXT4_FAST_COMMIT_BYTELOG_H
+#define _EXT4_FAST_COMMIT_BYTELOG_H
+
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/types.h>
+
+struct super_block;
+struct journal_s;
+struct ext4_sb_info;
+
+#define EXT4_FC_BYTELOG_MAGIC 0x4c424346 /* "FCBL" */
+#define EXT4_FC_BYTELOG_VERSION 1
+#define EXT4_FC_BYTELOG_ALIGN 64
+#define EXT4_FC_BYTELOG_BATCH_MAX_DEFAULT 4096
+
+/*
+ * Record header @tag for a batched TLV payload stream.
+ *
+ * In this case the payload is a stream of standard fast-commit TLVs
+ * (struct ext4_fc_tl + value).
+ */
+#define EXT4_FC_BYTELOG_TAG_BATCH 0xffff
+
+/* Record flag bits */
+#define EXT4_FC_BYTELOG_COMMITTED BIT(0)
+
+/**
+ * struct ext4_fc_bytelog_hdr - On-media header for a ByteLog record
+ * @magic: Magic identifying the record
+ * @version: On-disk header format version
+ * @hdr_len: Length of this header in bytes
+ * @tid: JBD2 transaction identifier
+ * @tag: Ext4 fast-commit tag (or EXT4_FC_BYTELOG_TAG_BATCH)
+ * @flags: Record flags (EXT4_FC_BYTELOG_*)
+ * @payload_len:Length of payload bytes following the header
+ * @payload_crc:CRC32C of the payload
+ * @record_len: Entire record length including header, payload and padding
+ * @header_crc: CRC32C of the header with @header_crc zeroed
+ * @seq: Monotonic sequence number assigned by the ByteLog writer
+ * @reserved: Future fields, currently zeroed
+ *
+ * The structure is padded to 64 bytes to keep each record 64B aligned.
+ */
+struct ext4_fc_bytelog_hdr {
+ __le32 magic;
+ __le16 version;
+ __le16 hdr_len;
+ __le32 tid;
+ __le16 tag;
+ __le16 flags;
+ __le32 payload_len;
+ __le32 payload_crc;
+ __le32 record_len;
+ __le32 header_crc;
+ __le64 seq;
+ __le64 reserved[3];
+} __packed;
+
+struct ext4_fc_bytelog_anchor {
+ u32 tid;
+ u64 head;
+ u64 tail;
+ u64 seq;
+ u32 crc;
+};
+
+/* In-memory state of the DAX ByteLog fast-commit backend (per sb). */
+struct ext4_fc_bytelog {
+ void *kaddr; /* DAX kernel mapping of the ring */
+ u64 size_bytes; /* total mapped ring size in bytes */
+ u64 base_off; /* first usable byte, past metadata block(s) */
+ u64 persist_off; /* bytes below this are already persisted */
+ u32 blocksize; /* filesystem block size */
+ u32 blocks; /* ring size in blocks */
+ u64 start_pblk; /* physical block of the ring start */
+ u64 anchor_pblk; /* physical block reserved for the anchor */
+ u64 head; /* append offset of the next record */
+ u64 tail; /* read cursor; currently always base_off */
+ u64 seq; /* sequence number for the next record */
+ u32 ring_crc; /* running CRC32C over committed payloads */
+
+ u32 batch_max; /* max batched payload bytes (0 = off) */
+ u16 batch_first_tag; /* tag of the first TLV in the open batch */
+ u32 batch_len; /* payload bytes in the open batch */
+ u32 batch_tlvs; /* number of TLVs in the open batch */
+ u32 batch_payload_crc; /* running CRC of the open batch payload */
+
+ bool mapped; /* ring successfully DAX-mapped */
+ bool enabled; /* backend enabled via mount options */
+ bool dirty; /* unpersisted bytes exist past persist_off */
+ int last_error; /* most recent setup/commit error */
+};
+
+struct ext4_fc_bytelog_vec {
+ const void *base;
+ size_t len;
+};
+
+int ext4_fc_bytelog_init(struct super_block *sb, struct journal_s *journal);
+void ext4_fc_bytelog_release(struct super_block *sb);
+void ext4_fc_bytelog_reset(struct super_block *sb, bool full);
+void ext4_fc_bytelog_begin_commit(struct super_block *sb);
+int ext4_fc_bytelog_end_commit(struct super_block *sb);
+bool ext4_fc_bytelog_active(struct ext4_sb_info *sbi);
+bool ext4_fc_bytelog_mapped(struct ext4_sb_info *sbi);
+int ext4_fc_bytelog_append_vec(struct super_block *sb, u16 tag,
+ struct ext4_fc_bytelog_vec *vecs, int nvec);
+void ext4_fc_bytelog_build_anchor(struct super_block *sb,
+ struct ext4_fc_bytelog_anchor *anchor,
+ u32 tid);
+
+/* Does the record carry the COMMITTED flag? */
+static inline bool ext4_fc_bytelog_record_committed(const struct ext4_fc_bytelog_hdr *hdr)
+{
+ return !!(le16_to_cpu(hdr->flags) & EXT4_FC_BYTELOG_COMMITTED);
+}
+
+/* Total record length (header + payload + padding), CPU byte order. */
+static inline u32 ext4_fc_bytelog_record_len(const struct ext4_fc_bytelog_hdr *hdr)
+{
+ return le32_to_cpu(hdr->record_len);
+}
+
+/* Payload length in bytes, CPU byte order. */
+static inline u32 ext4_fc_bytelog_payload_len(const struct ext4_fc_bytelog_hdr *hdr)
+{
+ return le32_to_cpu(hdr->payload_len);
+}
+
+/* Writer-assigned sequence number, CPU byte order. */
+static inline u64 ext4_fc_bytelog_seq(const struct ext4_fc_bytelog_hdr *hdr)
+{
+ return le64_to_cpu(hdr->seq);
+}
+
+size_t ext4_fc_bytelog_record_size(size_t payload_len);
+void ext4_fc_bytelog_prep_hdr(struct ext4_fc_bytelog_hdr *hdr, u16 tag,
+ u16 flags, u32 tid, u64 seq, u32 payload_len);
+void ext4_fc_bytelog_finalize_hdr_crc(struct ext4_fc_bytelog_hdr *hdr,
+ u32 payload_crc);
+int ext4_fc_bytelog_validate_hdr(const struct ext4_fc_bytelog_hdr *hdr,
+ size_t remaining, const void *payload);
+void ext4_fc_bytelog_mark_committed(struct ext4_fc_bytelog_hdr *hdr);
+
+void ext4_fc_bytelog_flush_persist(void *addr, size_t len);
+void ext4_fc_bytelog_persist_barrier(void);
+
+u32 ext4_fc_bytelog_crc32(const void *buf, size_t len);
+
+#endif /* _EXT4_FAST_COMMIT_BYTELOG_H */
--
2.52.0
^ permalink raw reply related [flat|nested] 5+ messages in thread

* [RFC PATCH 2/4] ext4: add dax_fc_bytelog mount option
2026-02-26 10:17 [RFC PATCH 0/4] ext4: Byte-granular ByteLog optimizes DAX fast commits Li Chen
2026-02-26 10:17 ` [RFC PATCH 1/4] ext4: introduce DAX fast commit ByteLog backend Li Chen
@ 2026-02-26 10:17 ` Li Chen
2026-02-26 10:17 ` [RFC PATCH 3/4] ext4: fast_commit: write TLVs into DAX ByteLog Li Chen
2026-02-26 10:17 ` [RFC PATCH 4/4] ext4: fast_commit: replay DAX ByteLog records Li Chen
3 siblings, 0 replies; 5+ messages in thread
From: Li Chen @ 2026-02-26 10:17 UTC (permalink / raw)
To: linux-ext4, Theodore Ts'o, Andreas Dilger, linux-kernel
Cc: Harshad Shirwadkar, Li Chen
Add dax_fc_bytelog={off,on,force} to control the DAX ByteLog fast commit
backend.
Initialize the ByteLog ring before fast commit replay and release it on
unmount.
Signed-off-by: Li Chen <me@linux.beauty>
---
fs/ext4/super.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 76 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 504148b2142b..3645456a61dd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1368,6 +1368,7 @@ static void ext4_put_super(struct super_block *sb)
sbi->s_ea_block_cache = NULL;
ext4_stop_mmpd(sbi);
+ ext4_fc_bytelog_release(sb);
brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
@@ -1685,6 +1686,8 @@ enum {
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
+ Opt_dax_fc_bytelog, Opt_dax_fc_bytelog_off, Opt_dax_fc_bytelog_on,
+ Opt_dax_fc_bytelog_force,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
@@ -1724,6 +1727,13 @@ static const struct constant_table ext4_param_dax[] = {
{}
};
+static const struct constant_table ext4_param_dax_fc_bytelog[] = {
+ {"off", Opt_dax_fc_bytelog_off},
+ {"on", Opt_dax_fc_bytelog_on},
+ {"force", Opt_dax_fc_bytelog_force},
+ {}
+};
+
/*
* Mount option specification
* We don't use fsparam_flag_no because of the way we set the
@@ -1780,6 +1790,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("i_version", Opt_removed),
fsparam_flag ("dax", Opt_dax),
fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
+ fsparam_enum("dax_fc_bytelog", Opt_dax_fc_bytelog,
+ ext4_param_dax_fc_bytelog),
fsparam_u32 ("stripe", Opt_stripe),
fsparam_flag ("delalloc", Opt_delalloc),
fsparam_flag ("nodelalloc", Opt_nodelalloc),
@@ -1965,6 +1977,7 @@ ext4_sb_read_encoding(const struct ext4_super_block *es)
#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17)
#define EXT4_SPEC_s_sb_block (1 << 18)
#define EXT4_SPEC_mb_optimize_scan (1 << 19)
+#define EXT4_SPEC_s_dax_fc_bytelog BIT(20)
struct ext4_fs_context {
char *s_qf_names[EXT4_MAXQUOTAS];
@@ -2370,6 +2383,26 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ext4_msg(NULL, KERN_INFO, "dax option not supported");
return -EINVAL;
#endif
+ case Opt_dax_fc_bytelog:
+ switch (result.uint_32) {
+ case Opt_dax_fc_bytelog_off:
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_FC_BYTELOG);
+ ctx_clear_mount_opt2(ctx,
+ EXT4_MOUNT2_DAX_FC_BYTELOG_FORCE);
+ break;
+ case Opt_dax_fc_bytelog_on:
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_FC_BYTELOG);
+ ctx_clear_mount_opt2(ctx,
+ EXT4_MOUNT2_DAX_FC_BYTELOG_FORCE);
+ break;
+ case Opt_dax_fc_bytelog_force:
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_FC_BYTELOG);
+ ctx_set_mount_opt2(ctx,
+ EXT4_MOUNT2_DAX_FC_BYTELOG_FORCE);
+ break;
+ }
+ ctx->spec |= EXT4_SPEC_s_dax_fc_bytelog;
+ return 0;
case Opt_data_err:
if (result.uint_32 == Opt_data_err_abort)
ctx_set_mount_opt(ctx, m->mount_opt);
@@ -2819,7 +2852,22 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
goto fail_dax_change_remount;
}
- }
+
+ if (ctx->spec & EXT4_SPEC_s_dax_fc_bytelog) {
+ bool new_on = ctx_test_mount_opt2(ctx,
+ EXT4_MOUNT2_DAX_FC_BYTELOG);
+ bool new_force = ctx_test_mount_opt2(ctx,
+ EXT4_MOUNT2_DAX_FC_BYTELOG_FORCE);
+ bool cur_on = test_opt2(sb, DAX_FC_BYTELOG);
+ bool cur_force = test_opt2(sb, DAX_FC_BYTELOG_FORCE);
+
+ if (new_on != cur_on || new_force != cur_force) {
+ ext4_msg(NULL, KERN_ERR,
+ "can't change dax_fc_bytelog mount option while remounting");
+ return -EINVAL;
+ }
+ }
+ }
return ext4_check_quota_consistency(fc, sb);
}
@@ -3038,6 +3086,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
+ if (test_opt2(sb, DAX_FC_BYTELOG)) {
+ if (test_opt2(sb, DAX_FC_BYTELOG_FORCE))
+ SEQ_OPTS_PUTS("dax_fc_bytelog=force");
+ else
+ SEQ_OPTS_PUTS("dax_fc_bytelog=on");
+ }
if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
!test_opt2(sb, MB_OPTIMIZE_SCAN)) {
@@ -4950,6 +5004,8 @@ static int ext4_load_and_init_journal(struct super_block *sb,
"Failed to set fast commit journal feature");
goto out;
}
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT))
+ ext4_fc_bytelog_init(sb, sbi->s_journal);
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
@@ -6124,10 +6180,29 @@ static int ext4_load_journal(struct super_block *sb,
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
__le16 orig_state;
bool changed = false;
+ int fc_err;
if (save)
memcpy(save, ((char *) es) +
EXT4_S_ERR_START, EXT4_S_ERR_LEN);
+
+ /*
+ * Map the ByteLog ring before fast-commit replay so that
+ * EXT4_FC_TAG_DAX_BYTELOG_ANCHOR records can be processed
+ * during jbd2_journal_load().
+ *
+ * For filesystems with the INCOMPAT_DAX_FC_BYTELOG feature
+ * bit set, failing to initialize the ByteLog ring must be
+ * treated as fatal.
+ */
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT)) {
+ fc_err = ext4_fc_bytelog_init(sb, journal);
+ if (fc_err && ext4_has_feature_dax_fc_bytelog(sb)) {
+ kfree(save);
+ err = fc_err;
+ goto err_out;
+ }
+ }
err = jbd2_journal_load(journal);
if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
save, EXT4_S_ERR_LEN)) {
--
2.52.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [RFC PATCH 3/4] ext4: fast_commit: write TLVs into DAX ByteLog
2026-02-26 10:17 [RFC PATCH 0/4] ext4: Byte-granular ByteLog optimizes DAX fast commits Li Chen
2026-02-26 10:17 ` [RFC PATCH 1/4] ext4: introduce DAX fast commit ByteLog backend Li Chen
2026-02-26 10:17 ` [RFC PATCH 2/4] ext4: add dax_fc_bytelog mount option Li Chen
@ 2026-02-26 10:17 ` Li Chen
2026-02-26 10:17 ` [RFC PATCH 4/4] ext4: fast_commit: replay DAX ByteLog records Li Chen
3 siblings, 0 replies; 5+ messages in thread
From: Li Chen @ 2026-02-26 10:17 UTC (permalink / raw)
To: linux-ext4, Theodore Ts'o, Andreas Dilger, linux-kernel
Cc: Harshad Shirwadkar, Li Chen
When dax_fc_bytelog is enabled, write fast commit TLVs directly into the
DAX-mapped ByteLog ring.
Keep traditional TLV writes confined to the reserved FC block and emit an
anchor TLV to describe the ByteLog window.
Signed-off-by: Li Chen <me@linux.beauty>
---
fs/ext4/fast_commit.c | 124 +++++++++++++++++++++++++++++++++-
fs/ext4/fast_commit.h | 13 ++++
fs/ext4/fast_commit_bytelog.c | 20 ++++++
fs/ext4/fast_commit_bytelog.h | 5 ++
4 files changed, 159 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 64c0c4ba58b0..2f7b7ea29df2 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -723,6 +723,12 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
* leaving enough space for a PAD tlv.
*/
remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
+ if (ext4_fc_bytelog_active(sbi) && len > remaining) {
+ ext4_fc_mark_ineligible(sb,
+ EXT4_FC_REASON_BYTELOG_TLV_OVERFLOW,
+ NULL);
+ return NULL;
+ }
if (len <= remaining) {
sbi->s_fc_bytes += len;
return dst;
@@ -806,6 +812,31 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
struct ext4_fc_tl tl;
u8 *dst;
+ if (ext4_fc_bytelog_active(EXT4_SB(sb)) &&
+ (tag == EXT4_FC_TAG_ADD_RANGE || tag == EXT4_FC_TAG_DEL_RANGE ||
+ tag == EXT4_FC_TAG_LINK || tag == EXT4_FC_TAG_UNLINK ||
+ tag == EXT4_FC_TAG_CREAT || tag == EXT4_FC_TAG_INODE)) {
+ struct ext4_fc_bytelog_vec vecs[2];
+ int ret;
+
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(len);
+ vecs[0].base = &tl;
+ vecs[0].len = sizeof(tl);
+ vecs[1].base = val;
+ vecs[1].len = len;
+
+ ret = ext4_fc_bytelog_append_vec(sb, tag, vecs,
+ ARRAY_SIZE(vecs));
+ if (!ret)
+ return true;
+ if (ret == -ENOSPC)
+ ext4_fc_mark_ineligible(sb,
+ EXT4_FC_REASON_BYTELOG_TLV_OVERFLOW,
+ NULL);
+ return false;
+ }
+
dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
if (!dst)
return false;
@@ -819,6 +850,17 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
return true;
}
+/*
+ * Serialize @anchor into its on-disk layout and append it to the regular
+ * (block-based) fast commit area as an EXT4_FC_TAG_DAX_BYTELOG_ANCHOR TLV,
+ * folding it into the running commit checksum @crc.
+ *
+ * Returns true on success, false if the TLV could not be reserved/written.
+ */
+static bool ext4_fc_add_bytelog_anchor_tlv(struct super_block *sb,
+ struct ext4_fc_bytelog_anchor *anchor,
+ u32 *crc)
+{
+ struct ext4_fc_bytelog_entry entry;
+
+ ext4_fc_bytelog_anchor_to_disk(&entry, anchor);
+ return ext4_fc_add_tlv(sb, EXT4_FC_TAG_DAX_BYTELOG_ANCHOR,
+ sizeof(entry), (u8 *)&entry, crc);
+}
+
/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
struct ext4_fc_dentry_update *fc_dentry)
@@ -826,9 +868,40 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
struct ext4_fc_dentry_info fcd;
struct ext4_fc_tl tl;
int dlen = fc_dentry->fcd_name.name.len;
- u8 *dst = ext4_fc_reserve_space(sb,
- EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
+ u8 *dst;
+
+ if (ext4_fc_bytelog_active(EXT4_SB(sb)) &&
+ (fc_dentry->fcd_op == EXT4_FC_TAG_LINK ||
+ fc_dentry->fcd_op == EXT4_FC_TAG_UNLINK ||
+ fc_dentry->fcd_op == EXT4_FC_TAG_CREAT)) {
+ struct ext4_fc_bytelog_vec vecs[3];
+ int ret;
+
+ fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
+ fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
+ tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
+ tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
+
+ vecs[0].base = &tl;
+ vecs[0].len = sizeof(tl);
+ vecs[1].base = &fcd;
+ vecs[1].len = sizeof(fcd);
+ vecs[2].base = fc_dentry->fcd_name.name.name;
+ vecs[2].len = dlen;
+
+ ret = ext4_fc_bytelog_append_vec(sb, fc_dentry->fcd_op, vecs,
+ ARRAY_SIZE(vecs));
+ if (!ret)
+ return true;
+ if (ret == -ENOSPC)
+ ext4_fc_mark_ineligible(sb,
+ EXT4_FC_REASON_BYTELOG_TLV_OVERFLOW,
+ NULL);
+ return false;
+ }
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) +
+ dlen, crc);
if (!dst)
return false;
@@ -872,6 +945,25 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+ if (ext4_fc_bytelog_active(EXT4_SB(inode->i_sb))) {
+ struct ext4_fc_bytelog_vec vecs[3];
+
+ vecs[0].base = &tl;
+ vecs[0].len = sizeof(tl);
+ vecs[1].base = &fc_inode.fc_ino;
+ vecs[1].len = sizeof(fc_inode.fc_ino);
+ vecs[2].base = ext4_raw_inode(&iloc);
+ vecs[2].len = inode_len;
+
+ ret = ext4_fc_bytelog_append_vec(inode->i_sb, EXT4_FC_TAG_INODE,
+ vecs, ARRAY_SIZE(vecs));
+ if (ret == -ENOSPC)
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_BYTELOG_TLV_OVERFLOW,
+ NULL);
+ goto err;
+ }
+
ret = -ECANCELED;
dst = ext4_fc_reserve_space(inode->i_sb,
EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
@@ -1147,6 +1239,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
/* Step 6.2: Now write all the dentry updates. */
+ if (ext4_fc_bytelog_active(sbi))
+ ext4_fc_bytelog_begin_commit(sb);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
if (ret)
goto out;
@@ -1164,6 +1258,22 @@ static int ext4_fc_perform_commit(journal_t *journal)
if (ret)
goto out;
}
+
+ if (ext4_fc_bytelog_active(sbi)) {
+ struct ext4_fc_bytelog_anchor anchor;
+
+ ret = ext4_fc_bytelog_end_commit(sb);
+ if (ret)
+ goto out;
+ if (sbi->s_fc_bytelog.seq) {
+ ext4_fc_bytelog_build_anchor(sb, &anchor,
+ sbi->s_journal->j_running_transaction->t_tid);
+ if (!ext4_fc_add_bytelog_anchor_tlv(sb, &anchor, &crc)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+ }
/* Step 6.4: Finally write tail tag to conclude this fast commit. */
ret = ext4_fc_write_tail(sb, crc);
@@ -1262,6 +1372,12 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
else
journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
set_task_ioprio(current, journal_ioprio);
+
+ if (ext4_fc_bytelog_active(sbi)) {
+ journal->j_fc_off = 0;
+ sbi->s_fc_bytes = 0;
+ }
+
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
ret = ext4_fc_perform_commit(journal);
if (ret < 0) {
@@ -1367,8 +1483,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
}
- if (full)
+ if (full || ext4_fc_bytelog_active(sbi))
sbi->s_fc_bytes = 0;
+ ext4_fc_bytelog_reset(sb, full);
ext4_fc_unlock(sb, alloc_ctx);
trace_ext4_fc_stats(sb);
}
@@ -2315,6 +2432,7 @@ static const char * const fc_ineligible_reasons[] = {
[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
+ [EXT4_FC_REASON_BYTELOG_TLV_OVERFLOW] = "ByteLog TLV overflow",
[EXT4_FC_REASON_MIGRATE] = "Inode format migration",
[EXT4_FC_REASON_VERITY] = "fs-verity enable",
[EXT4_FC_REASON_MOVE_EXT] = "Move extents",
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 2f77a37fb101..fb51e19b9778 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -18,6 +18,7 @@
#define EXT4_FC_TAG_PAD 0x0007
#define EXT4_FC_TAG_TAIL 0x0008
#define EXT4_FC_TAG_HEAD 0x0009
+#define EXT4_FC_TAG_DAX_BYTELOG_ANCHOR 0x000a
#define EXT4_FC_SUPPORTED_FEATURES 0x0
@@ -70,6 +71,15 @@ struct ext4_fc_tail {
__le32 fc_crc;
};
+/*
+ * Value structure for tag EXT4_FC_TAG_DAX_BYTELOG_ANCHOR.  Describes the
+ * ByteLog ring window that belongs to one fast commit; all fields are
+ * little-endian on disk (see ext4_fc_bytelog_anchor_to_disk()).
+ */
+struct ext4_fc_bytelog_entry {
+ __le32 fc_tid; /* journal transaction id the window commits */
+ __le64 fc_head; /* end offset (exclusive) of the window in the ring */
+ __le64 fc_tail; /* start offset of the window in the ring */
+ __le64 fc_seq; /* number of records in the window */
+ __le32 fc_crc; /* crc32c over all record payloads in the window */
+};
+
/* Tag base length */
#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl))
@@ -97,6 +107,7 @@ enum {
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ EXT4_FC_REASON_BYTELOG_TLV_OVERFLOW,
EXT4_FC_REASON_MIGRATE,
EXT4_FC_REASON_VERITY,
EXT4_FC_REASON_MOVE_EXT,
@@ -181,6 +192,8 @@ static inline const char *tag2str(__u16 tag)
return "TAIL";
case EXT4_FC_TAG_HEAD:
return "HEAD";
+ case EXT4_FC_TAG_DAX_BYTELOG_ANCHOR:
+ return "BYTELOG_ANCHOR";
default:
return "ERROR";
}
diff --git a/fs/ext4/fast_commit_bytelog.c b/fs/ext4/fast_commit_bytelog.c
index 64ba3edddbcb..77ac1d9ef031 100644
--- a/fs/ext4/fast_commit_bytelog.c
+++ b/fs/ext4/fast_commit_bytelog.c
@@ -455,6 +455,26 @@ void ext4_fc_bytelog_release(struct super_block *sb)
memset(&sbi->s_fc_bytelog, 0, sizeof(sbi->s_fc_bytelog));
}
+/* Convert an in-memory ByteLog anchor to its little-endian on-disk form. */
+void ext4_fc_bytelog_anchor_to_disk(struct ext4_fc_bytelog_entry *dst,
+ const struct ext4_fc_bytelog_anchor *src)
+{
+ dst->fc_tid = cpu_to_le32(src->tid);
+ dst->fc_head = cpu_to_le64(src->head);
+ dst->fc_tail = cpu_to_le64(src->tail);
+ dst->fc_seq = cpu_to_le64(src->seq);
+ dst->fc_crc = cpu_to_le32(src->crc);
+}
+
+/* Convert an on-disk ByteLog anchor entry to its CPU-endian in-memory form. */
+void ext4_fc_bytelog_anchor_from_disk(struct ext4_fc_bytelog_anchor *dst,
+ const struct ext4_fc_bytelog_entry *src)
+{
+ dst->tid = le32_to_cpu(src->fc_tid);
+ dst->head = le64_to_cpu(src->fc_head);
+ dst->tail = le64_to_cpu(src->fc_tail);
+ dst->seq = le64_to_cpu(src->fc_seq);
+ dst->crc = le32_to_cpu(src->fc_crc);
+}
+
void ext4_fc_bytelog_reset(struct super_block *sb, bool full)
{
struct ext4_fc_bytelog *log = &EXT4_SB(sb)->s_fc_bytelog;
diff --git a/fs/ext4/fast_commit_bytelog.h b/fs/ext4/fast_commit_bytelog.h
index d52754890222..d3e5b734a02e 100644
--- a/fs/ext4/fast_commit_bytelog.h
+++ b/fs/ext4/fast_commit_bytelog.h
@@ -9,6 +9,7 @@
struct super_block;
struct journal_s;
struct ext4_sb_info;
+struct ext4_fc_bytelog_entry;
#define EXT4_FC_BYTELOG_MAGIC 0x4c424346 /* "FCBL" */
#define EXT4_FC_BYTELOG_VERSION 1
@@ -109,6 +110,10 @@ int ext4_fc_bytelog_append_vec(struct super_block *sb, u16 tag,
void ext4_fc_bytelog_build_anchor(struct super_block *sb,
struct ext4_fc_bytelog_anchor *anchor,
u32 tid);
+void ext4_fc_bytelog_anchor_to_disk(struct ext4_fc_bytelog_entry *dst,
+ const struct ext4_fc_bytelog_anchor *src);
+void ext4_fc_bytelog_anchor_from_disk(struct ext4_fc_bytelog_anchor *dst,
+ const struct ext4_fc_bytelog_entry *src);
static inline bool ext4_fc_bytelog_record_committed(const struct ext4_fc_bytelog_hdr *hdr)
{
--
2.52.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [RFC PATCH 4/4] ext4: fast_commit: replay DAX ByteLog records
2026-02-26 10:17 [RFC PATCH 0/4] ext4: Byte-granular ByteLog optimizes DAX fast commits Li Chen
` (2 preceding siblings ...)
2026-02-26 10:17 ` [RFC PATCH 3/4] ext4: fast_commit: write TLVs into DAX ByteLog Li Chen
@ 2026-02-26 10:17 ` Li Chen
3 siblings, 0 replies; 5+ messages in thread
From: Li Chen @ 2026-02-26 10:17 UTC (permalink / raw)
To: linux-ext4, Theodore Ts'o, Andreas Dilger, linux-kernel
Cc: Harshad Shirwadkar, Li Chen
Add replay support for EXT4_FC_TAG_DAX_BYTELOG_ANCHOR.
The anchor TLV describes a ByteLog window in the DAX-mapped fast commit
area, which is validated and then replayed using existing TLV handlers.
Signed-off-by: Li Chen <me@linux.beauty>
---
fs/ext4/fast_commit.c | 246 ++++++++++++++++++++++++++++++++++++++++++
fs/ext4/fast_commit.h | 9 ++
2 files changed, 255 insertions(+)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 2f7b7ea29df2..6370505ecc86 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/crc32c.h>
#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
@@ -2172,10 +2173,228 @@ static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
return len >= sizeof(struct ext4_fc_tail);
case EXT4_FC_TAG_HEAD:
return len == sizeof(struct ext4_fc_head);
+ case EXT4_FC_TAG_DAX_BYTELOG_ANCHOR:
+ return len == sizeof(struct ext4_fc_bytelog_entry);
}
return false;
}
+/*
+ * Reset a ByteLog iteration state so that the next anchor window starts
+ * from scratch.  ring_crc is seeded with ~0U to match the crc32c seed
+ * used by ext4_fc_bytelog_iterate().
+ */
+static void ext4_fc_reset_bytelog_state(struct ext4_fc_bytelog_state *state)
+{
+ state->cursor = 0;
+ state->next_seq = 0;
+ state->ring_crc = ~0U;
+ state->initialized = false;
+}
+
+/* Per-TLV callback invoked by ext4_fc_bytelog_iterate(). */
+typedef int (*ext4_fc_bytelog_cb_t)(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl,
+ u8 *val, void *data);
+
+/*
+ * Walk the ByteLog window [anchor->tail, anchor->head) in the DAX mapping
+ * and invoke @fn on every TLV found: one TLV per record, or, for
+ * EXT4_FC_BYTELOG_TAG_BATCH records, every TLV packed into the payload.
+ *
+ * Each record header, its commit marker and its sequence number are
+ * validated before the payload is trusted; payload bytes are folded into
+ * a running crc32c which must match anchor->crc once the whole window has
+ * been consumed, and the record count must match anchor->seq.
+ *
+ * Returns 0 on success, or a negative errno: -EOPNOTSUPP if the ring is
+ * not mapped, -EFSCORRUPTED on structural damage, -EUCLEAN on an
+ * uncommitted or out-of-sequence record, -EFSBADCRC on checksum mismatch.
+ */
+static int ext4_fc_bytelog_iterate(struct super_block *sb,
+ struct ext4_fc_bytelog_state *iter,
+ const struct ext4_fc_bytelog_anchor *anchor,
+ ext4_fc_bytelog_cb_t fn, void *data)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_bytelog *log = &sbi->s_fc_bytelog;
+ u8 *base = log->kaddr;
+ u64 cursor, end;
+ int ret;
+
+ /* ByteLog ring not mapped on this mount. */
+ if (!log->mapped || !base)
+ return -EOPNOTSUPP;
+ if (anchor->head > log->size_bytes)
+ return -EFSCORRUPTED;
+
+ /*
+ * NOTE(review): next_seq restarts at 0 here, so record sequence
+ * numbers are assumed to begin at 0 for every anchored window —
+ * confirm against the append side's numbering.
+ */
+ iter->cursor = anchor->tail;
+ iter->next_seq = 0;
+ iter->ring_crc = ~0U; /* crc32c seed */
+ iter->initialized = true;
+ cursor = iter->cursor;
+ end = anchor->head;
+
+ if (cursor < log->base_off)
+ return -EFSCORRUPTED;
+ if (cursor > end || cursor > log->size_bytes)
+ return -EFSCORRUPTED;
+
+ while (cursor < end) {
+ struct ext4_fc_bytelog_hdr *hdr;
+ size_t remaining;
+ u32 payload_len, record_len;
+ u16 record_tag;
+ u8 *payload;
+ struct ext4_fc_tl_mem tl;
+
+ /* Guard the u64 -> size_t narrowing on 32-bit builds. */
+ if (end - cursor > SIZE_MAX)
+ return -E2BIG;
+ remaining = end - cursor;
+ /*
+ * NOTE(review): relies on log->size_bytes >= sizeof(*hdr);
+ * the subtraction would wrap otherwise — verify this is
+ * guaranteed at ring-init time.
+ */
+ if (cursor > log->size_bytes - sizeof(*hdr))
+ return -EFSCORRUPTED;
+
+ hdr = (struct ext4_fc_bytelog_hdr *)(base + cursor);
+ payload = (u8 *)hdr + sizeof(*hdr);
+ ret = ext4_fc_bytelog_validate_hdr(hdr, remaining, payload);
+ if (ret)
+ return ret;
+ if (!ext4_fc_bytelog_record_committed(hdr))
+ return -EUCLEAN;
+ if (ext4_fc_bytelog_seq(hdr) != iter->next_seq)
+ return -EUCLEAN;
+
+ payload_len = ext4_fc_bytelog_payload_len(hdr);
+ /* Every payload holds at least one TLV header. */
+ if (payload_len < EXT4_FC_TAG_BASE_LEN)
+ return -EFSCORRUPTED;
+
+ record_tag = le16_to_cpu(hdr->tag);
+ if (record_tag == EXT4_FC_BYTELOG_TAG_BATCH) {
+ /* Batched record: payload is a packed run of TLVs. */
+ u32 pos = 0;
+
+ while (pos < payload_len) {
+ u32 value_len;
+
+ if (payload_len - pos < EXT4_FC_TAG_BASE_LEN)
+ return -EFSCORRUPTED;
+
+ ext4_fc_get_tl(&tl, payload + pos);
+ value_len = tl.fc_len;
+ if (value_len >
+ payload_len - pos - EXT4_FC_TAG_BASE_LEN)
+ return -EFSCORRUPTED;
+ if (!ext4_fc_value_len_isvalid(sbi, tl.fc_tag,
+ tl.fc_len))
+ return -EFSCORRUPTED;
+ if (fn) {
+ ret = fn(sb, &tl,
+ payload + pos +
+ EXT4_FC_TAG_BASE_LEN,
+ data);
+ if (ret)
+ return ret;
+ }
+ pos += EXT4_FC_TAG_BASE_LEN + value_len;
+ }
+ } else {
+ /* Single-TLV record: header tag must match the TLV. */
+ u32 value_len;
+
+ ext4_fc_get_tl(&tl, payload);
+ value_len = payload_len - EXT4_FC_TAG_BASE_LEN;
+ if (tl.fc_len != value_len)
+ return -EFSCORRUPTED;
+ if (record_tag != tl.fc_tag)
+ return -EFSCORRUPTED;
+ if (!ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len))
+ return -EFSCORRUPTED;
+ if (fn) {
+ ret = fn(sb, &tl,
+ payload + EXT4_FC_TAG_BASE_LEN,
+ data);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* CRC covers payload bytes only, not record headers. */
+ iter->ring_crc = crc32c(iter->ring_crc, payload, payload_len);
+ record_len = ext4_fc_bytelog_record_len(hdr);
+ cursor += record_len;
+ iter->next_seq++;
+ }
+
+ /* The last record must end exactly at the anchor head. */
+ if (cursor != end)
+ return -EFSCORRUPTED;
+ iter->cursor = cursor;
+ if (iter->next_seq != anchor->seq)
+ return -EUCLEAN;
+ if (iter->ring_crc != anchor->crc)
+ return -EFSBADCRC;
+ return 0;
+}
+
+/*
+ * Scan-pass callback: for ADD_RANGE TLVs, record the physical region so
+ * the replay pass can account for those blocks; the other replayable tags
+ * need no work during scan and are accepted silently.  Any unknown tag
+ * fails the scan with -EOPNOTSUPP.
+ */
+static int ext4_fc_bytelog_scan_cb(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val,
+ void *data)
+{
+ struct ext4_fc_add_range ext;
+ struct ext4_extent *ex;
+
+ (void)data;
+ switch (tl->fc_tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ /* Copy out of the DAX mapping before decoding. */
+ memcpy(&ext, val, sizeof(ext));
+ ex = (struct ext4_extent *)&ext.fc_ex;
+ return ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_pblock(ex),
+ ext4_ext_get_actual_len(ex), 0);
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+/*
+ * Replay-pass callback: dispatch each ByteLog TLV to the existing
+ * fast-commit replay handler for its tag.  Tags outside the replayable
+ * set fail with -EOPNOTSUPP.
+ */
+static int ext4_fc_bytelog_replay_cb(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val,
+ void *data)
+{
+ (void)data;
+ switch (tl->fc_tag) {
+ case EXT4_FC_TAG_LINK:
+ return ext4_fc_replay_link(sb, tl, val);
+ case EXT4_FC_TAG_UNLINK:
+ return ext4_fc_replay_unlink(sb, tl, val);
+ case EXT4_FC_TAG_ADD_RANGE:
+ return ext4_fc_replay_add_range(sb, tl, val);
+ case EXT4_FC_TAG_CREAT:
+ return ext4_fc_replay_create(sb, tl, val);
+ case EXT4_FC_TAG_DEL_RANGE:
+ return ext4_fc_replay_del_range(sb, tl, val);
+ case EXT4_FC_TAG_INODE:
+ return ext4_fc_replay_inode(sb, tl, val);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+/*
+ * Scan the ByteLog window described by @anchor, recording regions via
+ * ext4_fc_bytelog_scan_cb().  Returns the positive value
+ * JBD2_FC_REPLAY_CONTINUE on success (so the caller can pass it straight
+ * back to jbd2), or a negative errno from the iterator on failure.
+ */
+static int ext4_fc_replay_scan_bytelog(struct super_block *sb,
+ struct ext4_fc_replay_state *state,
+ const struct ext4_fc_bytelog_anchor *anchor)
+{
+ int ret;
+
+ ret = ext4_fc_bytelog_iterate(sb, &state->fc_bytelog_scan, anchor,
+ ext4_fc_bytelog_scan_cb, state);
+ if (ret)
+ return ret;
+ return JBD2_FC_REPLAY_CONTINUE;
+}
+
+/*
+ * Replay all TLVs in the ByteLog window described by @anchor, using the
+ * replay-pass iteration state.  Returns 0 or a negative errno.
+ */
+static int ext4_fc_replay_apply_bytelog(struct super_block *sb,
+ struct ext4_fc_replay_state *state,
+ const struct ext4_fc_bytelog_anchor *anchor)
+{
+ return ext4_fc_bytelog_iterate(sb, &state->fc_bytelog_replay, anchor,
+ ext4_fc_bytelog_replay_cb, NULL);
+}
+
+/*
+ * Replay-pass handler for an EXT4_FC_TAG_DAX_BYTELOG_ANCHOR TLV: decode
+ * the on-disk anchor from @val and replay the ByteLog window it names.
+ * @tl is unused because the value length was already checked to equal
+ * sizeof(entry) in ext4_fc_value_len_isvalid().
+ */
+static int ext4_fc_replay_bytelog_anchor(struct super_block *sb,
+ struct ext4_fc_replay_state *state,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct ext4_fc_bytelog_entry entry;
+ struct ext4_fc_bytelog_anchor anchor;
+
+ (void)tl;
+ memcpy(&entry, val, sizeof(entry));
+ ext4_fc_bytelog_anchor_from_disk(&anchor, &entry);
+ return ext4_fc_replay_apply_bytelog(sb, state, &anchor);
+}
+
/*
* Recovery Scan phase handler
*
@@ -2206,6 +2425,8 @@ static int ext4_fc_replay_scan(journal_t *journal,
struct ext4_fc_tail tail;
__u8 *start, *end, *cur, *val;
struct ext4_fc_head head;
+ struct ext4_fc_bytelog_entry entry;
+ struct ext4_fc_bytelog_anchor anchor;
struct ext4_extent *ex;
state = &sbi->s_fc_replay_state;
@@ -2220,6 +2441,8 @@ static int ext4_fc_replay_scan(journal_t *journal,
state->fc_regions = NULL;
state->fc_regions_valid = state->fc_regions_used =
state->fc_regions_size = 0;
+ ext4_fc_reset_bytelog_state(&state->fc_bytelog_scan);
+ ext4_fc_reset_bytelog_state(&state->fc_bytelog_replay);
/* Check if we can stop early */
if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
!= EXT4_FC_TAG_HEAD)
@@ -2278,6 +2501,9 @@ static int ext4_fc_replay_scan(journal_t *journal,
state->fc_replay_num_tags = state->fc_cur_tag;
state->fc_regions_valid =
state->fc_regions_used;
+ if (ext4_fc_bytelog_active(sbi) ||
+ state->fc_bytelog_scan.initialized)
+ ret = JBD2_FC_REPLAY_STOP;
} else {
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -EFSBADCRC;
@@ -2299,6 +2525,15 @@ static int ext4_fc_replay_scan(journal_t *journal,
state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
+ case EXT4_FC_TAG_DAX_BYTELOG_ANCHOR:
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN +
+ tl.fc_len);
+ memcpy(&entry, val, sizeof(entry));
+ ext4_fc_bytelog_anchor_from_disk(&anchor, &entry);
+ ret = ext4_fc_replay_scan_bytelog(sb, state, &anchor);
+ break;
default:
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -ECANCELED;
@@ -2335,6 +2570,8 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
if (state->fc_current_pass != pass) {
state->fc_current_pass = pass;
sbi->s_mount_state |= EXT4_FC_REPLAY;
+ if (pass == PASS_REPLAY)
+ ext4_fc_reset_bytelog_state(&state->fc_bytelog_replay);
}
if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
ext4_debug("Replay stops\n");
@@ -2393,9 +2630,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
0, tl.fc_len, 0);
memcpy(&tail, val, sizeof(tail));
WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
+ if ((ext4_fc_bytelog_active(sbi) ||
+ state->fc_bytelog_scan.initialized) &&
+ state->fc_replay_num_tags == 0) {
+ ext4_fc_set_bitmaps_and_counters(sb);
+ return JBD2_FC_REPLAY_STOP;
+ }
break;
case EXT4_FC_TAG_HEAD:
break;
+ case EXT4_FC_TAG_DAX_BYTELOG_ANCHOR:
+ ret = ext4_fc_replay_bytelog_anchor(sb, state, &tl, val);
+ break;
default:
trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
ret = -ECANCELED;
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index fb51e19b9778..224d718150c4 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -153,6 +153,13 @@ struct ext4_fc_alloc_region {
int ino, len;
};
+/*
+ * Iteration state for walking an anchored ByteLog window; one instance is
+ * kept for the scan pass and one for the replay pass.
+ */
+struct ext4_fc_bytelog_state {
+ u64 cursor; /* next byte offset to read in the ring */
+ u64 next_seq; /* expected sequence number of the next record */
+ u32 ring_crc; /* running crc32c over record payloads (seeded ~0U) */
+ bool initialized; /* set once iteration of an anchor window has begun */
+};
+
/*
* Fast commit replay state.
*/
@@ -166,6 +173,8 @@ struct ext4_fc_replay_state {
int fc_regions_size, fc_regions_used, fc_regions_valid;
int *fc_modified_inodes;
int fc_modified_inodes_used, fc_modified_inodes_size;
+ struct ext4_fc_bytelog_state fc_bytelog_scan;
+ struct ext4_fc_bytelog_state fc_bytelog_replay;
};
#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
--
2.52.0
^ permalink raw reply related [flat|nested] 5+ messages in thread