From: Zhang Yi <yi.zhang@huaweicloud.com>
To: linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, tytso@mit.edu,
adilger.kernel@dilger.ca, jack@suse.cz, ritesh.list@gmail.com,
hch@infradead.org, djwong@kernel.org, willy@infradead.org,
yi.zhang@huawei.com, yi.zhang@huaweicloud.com,
chengzhihao1@huawei.com, yukuai3@huawei.com,
wangkefeng.wang@huawei.com
Subject: [RFC PATCH v2 17/25] ext4: implement buffered write iomap path
Date: Tue, 2 Jan 2024 20:39:10 +0800 [thread overview]
Message-ID: <20240102123918.799062-18-yi.zhang@huaweicloud.com> (raw)
In-Reply-To: <20240102123918.799062-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Implement the buffered write iomap path: use ext4_da_map_blocks() to map
delalloc extents, and add ext4_iomap_get_blocks() to allocate blocks if
delalloc is disabled or free space is about to run out.
Note that we don't want to support the dioread_lock mount option any
more, so we drop the ext4_should_dioread_nolock() branch and always
allocate unwritten extents for new blocks. We also make
ext4_should_dioread_nolock() independent of the DIOREAD_NOLOCK mount
option, so that it always returns true for inodes that use the buffered
iomap path. In addition, the i_disksize update is postponed until after
writeback.
After this change, we map or allocate a batch of blocks at a time, which
should bring significant performance gains.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 3 +
fs/ext4/ext4_jbd2.h | 7 ++
fs/ext4/file.c | 19 ++++-
fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 190 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3461cb3ff524..03cdcf3d86a5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..c1194ba8d6f2 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -499,6 +499,13 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
*/
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
+ /*
+ * Always enable dioread_nolock for inode which use buffered
+ * iomap path.
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ return 1;
+
if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
return 0;
if (!S_ISREG(inode->i_mode))
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6aa15dafc677..d15bd6ff1b20 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return count;
}
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ const struct iomap_ops *iomap_ops;
+
+ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+ iomap_ops = &ext4_iomap_buffered_da_write_ops;
+ else
+ iomap_ops = &ext4_iomap_buffered_write_ops;
+
+ return iomap_file_buffered_write(iocb, from, iomap_ops);
+}
+
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- ret = generic_perform_write(iocb, from);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ ret = ext4_iomap_buffered_write(iocb, from);
+ else
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2044b322dfd8..5512f38a1a9d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2827,7 +2827,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
return ret;
}
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
{
s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3223,6 +3223,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+ return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+static const struct iomap_folio_ops ext4_iomap_folio_ops = {
+ .iomap_valid = ext4_iomap_valid,
+};
+
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
loff_t length, unsigned int flags)
@@ -3253,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
iomap->flags |= IOMAP_F_MERGED;
+ iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ iomap->folio_ops = &ext4_iomap_folio_ops;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3492,11 +3504,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
-static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
+static int ext4_iomap_get_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ handle_t *handle;
+ int ret, needed_blocks;
+
+ /*
+ * Reserve one more block for adding the inode to the orphan list
+ * in case we allocate blocks but the write fails for some reason.
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ /*
+ * We have to stop the journal handle here to avoid a potential
+ * deadlock: a later balance_dirty_pages() might wait for dirty
+ * pages to be written back, and that writeback might start another
+ * handle and wait for this handle to stop.
+ */
+ ext4_journal_stop(handle);
+
+ return ret;
+}
+
+#define IOMAP_F_EXT4_DELALLOC IOMAP_F_PRIVATE
+
+static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
loff_t length, unsigned int iomap_flags,
- struct iomap *iomap, struct iomap *srcmap)
+ struct iomap *iomap, struct iomap *srcmap,
+ bool delalloc)
{
- int ret;
+ int ret, retries = 0;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3506,20 +3549,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
return -EINVAL;
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
-
+retry:
/* Calculate the first and last logical blocks respectively. */
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ if (iomap_flags & IOMAP_WRITE) {
+ if (delalloc)
+ ret = ext4_da_map_blocks(inode, &map);
+ else
+ ret = ext4_iomap_get_blocks(inode, &map);
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
+ if (delalloc)
+ iomap->flags |= IOMAP_F_EXT4_DELALLOC;
+
+ return 0;
+}
+
+static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+ iomap, srcmap, false);
+}
+
+static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+ iomap, srcmap, true);
+}
+
+/*
+ * Drop the stale delayed allocation range left by a failed write,
+ * including both start and end blocks. Otherwise we could leave a range
+ * of delayed extents covered by a clean folio, which could lead to
+ * inaccurate space reservation.
+ */
+static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+ DIV_ROUND_UP(length, EXT4_BLOCK_SIZE(inode->i_sb)));
return 0;
}
+static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
+ loff_t length, ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ handle_t *handle;
+ loff_t end;
+ int ret = 0, ret2;
+
+ /* delalloc: clean up the delalloc range via the punch callback */
+ if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
+ ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
+ offset, length, written, ext4_iomap_punch_delalloc);
+ if (ret)
+ ext4_warning(inode->i_sb,
+ "Failed to clean up delalloc for inode %lu, %d",
+ inode->i_ino, ret);
+ return ret;
+ }
+
+ /* nodelalloc: done unless i_size changed or end is past i_size */
+ end = offset + length;
+ if (!(iomap->flags & IOMAP_F_SIZE_CHANGED) && end <= inode->i_size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED) {
+ ext4_update_i_disksize(inode, inode->i_size);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ }
+
+ /*
+ * If we have allocated more blocks than we copied data into,
+ * there will be blocks allocated beyond inode->i_size, so
+ * truncate them.
+ */
+ if (end > inode->i_size)
+ ext4_orphan_add(handle, inode);
+
+ ret2 = ext4_journal_stop(handle);
+ ret = ret ? : ret2;
+
+ if (end > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If the truncate failed early, the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return ret;
+}
+
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_da_write_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
const struct iomap_ops ext4_iomap_buffered_read_ops = {
.iomap_begin = ext4_iomap_buffered_io_begin,
};
--
2.39.2
next prev parent reply other threads:[~2024-01-02 12:42 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-02 12:38 [RFC PATCH v2 00/25] ext4: use iomap for regular file's buffered IO path and enable large foilo Zhang Yi
2024-01-02 12:38 ` [RFC PATCH v2 01/25] ext4: refactor ext4_da_map_blocks() Zhang Yi
2024-01-03 9:56 ` Jan Kara
2024-01-02 12:38 ` [RFC PATCH v2 02/25] ext4: convert to exclusive lock while inserting delalloc extents Zhang Yi
2024-01-03 10:03 ` Jan Kara
2024-01-02 12:38 ` [RFC PATCH v2 03/25] ext4: correct the hole length returned by ext4_map_blocks() Zhang Yi
2024-01-03 11:02 ` Jan Kara
2024-01-02 12:38 ` [RFC PATCH v2 04/25] ext4: add a hole extent entry in cache after punch Zhang Yi
2024-01-03 11:04 ` Jan Kara
2024-01-02 12:38 ` [RFC PATCH v2 05/25] ext4: make ext4_map_blocks() distinguish delalloc only extent Zhang Yi
2024-01-03 11:31 ` Jan Kara
2024-01-03 13:20 ` Zhang Yi
2024-01-02 12:38 ` [RFC PATCH v2 06/25] ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC map type Zhang Yi
2024-01-03 11:35 ` Jan Kara
2024-01-02 12:39 ` [RFC PATCH v2 07/25] iomap: don't increase i_size if it's not a write operation Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 08/25] iomap: add pos and dirty_len into trace_iomap_writepage_map Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 09/25] ext4: allow inserting delalloc extents with multi-blocks Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 10/25] ext4: correct delalloc extent length Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 11/25] ext4: also mark extent as delalloc if it's been unwritten Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 12/25] ext4: factor out bh handles to ext4_da_get_block_prep() Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 13/25] ext4: use reserved metadata blocks when splitting extent in endio Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 14/25] ext4: introduce seq counter for extent entry Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 15/25] ext4: add a new iomap aops for regular file's buffered IO path Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 16/25] ext4: implement buffered read iomap path Zhang Yi
2024-01-02 12:39 ` Zhang Yi [this message]
2024-01-02 12:39 ` [RFC PATCH v2 18/25] ext4: implement writeback " Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 19/25] ext4: implement mmap " Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 20/25] ext4: implement zero_range " Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 21/25] ext4: writeback partial blocks before zero range Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 22/25] ext4: fall back to buffer_head path for defrag Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 23/25] ext4: partially enable iomap for regular file's buffered IO path Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 24/25] filemap: support disable large folios on active inode Zhang Yi
2024-01-02 12:39 ` [RFC PATCH v2 25/25] ext4: enable large folio for regular file with iomap buffered IO path Zhang Yi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240102123918.799062-18-yi.zhang@huaweicloud.com \
--to=yi.zhang@huaweicloud.com \
--cc=adilger.kernel@dilger.ca \
--cc=chengzhihao1@huawei.com \
--cc=djwong@kernel.org \
--cc=hch@infradead.org \
--cc=jack@suse.cz \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=ritesh.list@gmail.com \
--cc=tytso@mit.edu \
--cc=wangkefeng.wang@huawei.com \
--cc=willy@infradead.org \
--cc=yi.zhang@huawei.com \
--cc=yukuai3@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).