From: Zhang Yi <yi.zhang@huaweicloud.com>
To: linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
tytso@mit.edu, adilger.kernel@dilger.ca, jack@suse.cz,
ritesh.list@gmail.com, hch@infradead.org, djwong@kernel.org,
david@fromorbit.com, zokeefe@google.com, yi.zhang@huawei.com,
yi.zhang@huaweicloud.com, chengzhihao1@huawei.com,
yukuai3@huawei.com, yangerkun@huawei.com
Subject: [PATCH 15/27] ext4: implement buffered write iomap path
Date: Tue, 22 Oct 2024 19:10:46 +0800
Message-ID: <20241022111059.2566137-16-yi.zhang@huaweicloud.com>
In-Reply-To: <20241022111059.2566137-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Introduce two new iomap_ops, ext4_iomap_buffered_write_ops and
ext4_iomap_buffered_da_write_ops, to implement the iomap buffered write
path. These operations invoke ext4_da_map_blocks() to map delayed
allocation extents, and introduce ext4_iomap_get_blocks() to allocate
blocks directly in non-delayed allocation mode. Additionally, implement
ext4_iomap_valid() to check the validity of the extent mapping (a
sketch of how the iomap core consumes this is included in the notes
below).
There are two key differences between the buffer_head write path and the
iomap write path:
1) In the iomap write path, we always allocate unwritten extents for new
   blocks, which means dioread_nolock is effectively always enabled.
   Therefore, we do not need to truncate blocks on short writes or write
   failures.
2) The iomap write framework maps multiple blocks in a single
   ->iomap_begin() call, so on short writes and write failures we must
   remove the stale delayed allocation ranges (see the worked example in
   the notes below). Otherwise, a range of delayed extents could be left
   covered by a clean folio, leading to inaccurate space reservation.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
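
Notes (for reviewers, not part of the commit):

The revalidation added here is driven by the iomap core rather than by
this patch. Below is a simplified, illustrative sketch of the core
side, hedging on the exact call chain in iomap_write_begin(); the
function name is made up for illustration only:

	/* Illustrative sketch only (iomap core side, simplified). */
	static bool sketch_mapping_still_valid(struct inode *inode,
					       struct folio *folio,
					       const struct iomap *iomap)
	{
		const struct iomap_folio_ops *ops = iomap->folio_ops;

		/*
		 * Called with the folio locked. ext4_iomap_valid()
		 * compares iomap->validity_cookie against the current
		 * i_es_seq, so any extent status tree change after
		 * ->iomap_begin() sampled the cookie invalidates the
		 * mapping and forces the core to remap the range.
		 */
		if (ops && ops->iomap_valid &&
		    !ops->iomap_valid(inode, iomap)) {
			folio_unlock(folio);
			return false;	/* caller retries ->iomap_begin() */
		}
		return true;
	}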
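
To make difference 2) in the commit message concrete, here is a worked
example with illustrative numbers, assuming 4K blocks: a 1M buffered
write at offset 0 has the whole range mapped as a fresh delalloc extent
in ->iomap_begin(), but the copy-in faults after 100000 bytes. The
->iomap_end() handler then computes

	start_byte = iomap_last_written_block(inode, 0, 100000)
	           = round_up(0 + 100000, 4096)  = 102400
	end_byte   = round_up(0 + 1048576, 4096) = 1048576

and punches the never-dirtied reservation in [102400, 1048576), while
the 25 blocks that actually received data keep both their contents and
their delalloc reservation.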
fs/ext4/ext4.h | 3 +
fs/ext4/file.c | 19 +++++-
fs/ext4/inode.c | 155 +++++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 169 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ee170196bfff..a09f96ef17d8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2985,6 +2985,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3845,6 +3846,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed14b9cf..92471865b4e5 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return count;
}
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ const struct iomap_ops *iomap_ops;
+
+ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+ iomap_ops = &ext4_iomap_buffered_da_write_ops;
+ else
+ iomap_ops = &ext4_iomap_buffered_write_ops;
+
+ return iomap_file_buffered_write(iocb, from, iomap_ops, NULL);
+}
+
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- ret = generic_perform_write(iocb, from);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ ret = ext4_iomap_buffered_write(iocb, from);
+ else
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f0bc4b58ac4f..23cbcaab0a56 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2862,7 +2862,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
return ret;
}
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
{
s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3257,6 +3257,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+ return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+static const struct iomap_folio_ops ext4_iomap_folio_ops = {
+ .iomap_valid = ext4_iomap_valid,
+};
+
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
loff_t length, unsigned int flags)
@@ -3287,6 +3296,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
iomap->flags |= IOMAP_F_MERGED;
+ iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ iomap->folio_ops = &ext4_iomap_folio_ops;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3526,11 +3538,57 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
-static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
- loff_t length, unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
+static int ext4_iomap_get_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
{
- int ret;
+ loff_t i_size = i_size_read(inode);
+ handle_t *handle;
+ int ret, needed_blocks;
+
+ /*
+ * Check whether the blocks have already been allocated; if so,
+ * we can return the mapping information directly and avoid
+ * starting a new journal transaction.
+ */
+ if ((map->m_lblk + map->m_len) <=
+ round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
+ EXT4_MAP_DELAYED))
+ return 0;
+ }
+
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason.
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ /*
+ * We need to stop the handle here because of a potential deadlock
+ * caused by the subsequent call to balance_dirty_pages(). That
+ * function may wait for dirty pages to be written back, which could
+ * start another handle and make it wait for the first handle to
+ * complete.
+ */
+ ext4_journal_stop(handle);
+
+ return ret;
+}
+
+static int ext4_iomap_buffered_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap,
+ bool delalloc)
+{
+ int ret, retries = 0;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3541,13 +3599,23 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
/* Inline data support is not yet available. */
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
-
+retry:
/* Calculate the first and last logical blocks respectively. */
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ if (flags & IOMAP_WRITE) {
+ if (delalloc)
+ ret = ext4_da_map_blocks(inode, &map);
+ else
+ ret = ext4_iomap_get_blocks(inode, &map);
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
@@ -3555,6 +3623,79 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
return 0;
}
+static int ext4_iomap_buffered_read_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return ext4_iomap_buffered_begin(inode, offset, length, flags,
+ iomap, srcmap, false);
+}
+
+static int ext4_iomap_buffered_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return ext4_iomap_buffered_begin(inode, offset, length, flags,
+ iomap, srcmap, false);
+}
+
+static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return ext4_iomap_buffered_begin(inode, offset, length, flags,
+ iomap, srcmap, true);
+}
+
+/*
+ * Drop the stale delayed allocation range left behind by a short write
+ * or write failure, including both the start and end blocks. Otherwise,
+ * we could leave a range of delayed extents covered by a clean folio,
+ * which could lead to inaccurate space reservation.
+ */
+static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+ loff_t length, struct iomap *iomap)
+{
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+ DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
+ up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
+ loff_t length, ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ loff_t start_byte, end_byte;
+
+ /* If we didn't reserve the blocks, we're not allowed to punch them. */
+ if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
+ return 0;
+
+ /* Nothing to do if we've written the entire delalloc extent */
+ start_byte = iomap_last_written_block(inode, offset, written);
+ end_byte = round_up(offset + length, i_blocksize(inode));
+ if (start_byte >= end_byte)
+ return 0;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
+ iomap, ext4_iomap_punch_delalloc);
+ filemap_invalidate_unlock(inode->i_mapping);
+ return 0;
+}
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_write_begin,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_da_write_begin,
+ .iomap_end = ext4_iomap_buffered_da_write_end,
+};
+
const struct iomap_ops ext4_iomap_buffered_read_ops = {
.iomap_begin = ext4_iomap_buffered_read_begin,
};
--
2.46.1