* [RFC PATCH 01/18] ext4: introduce ext4_es_skip_hole_extent() to skip hole extents
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 02/18] ext4: make ext4_es_lookup_extent() return the next extent if not found Zhang Yi
` (17 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Introduce a new helper, ext4_es_skip_hole_extent(), which skips all
hole extents in a search range and returns the logical block of the
next non-hole extent entry. It is useful for estimating and limiting
the length of a potential hole returned when querying the mapping
status in ext4_map_blocks().
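As a rough usage sketch (illustrative, not part of this patch; a later
patch in this series does essentially this inside ext4_map_blocks()),
a caller can clamp a candidate hole so it never extends past the next
non-hole extent:

        /* Clamp a candidate hole at the next non-hole extent (sketch). */
        ext4_lblk_t next;

        next = ext4_es_skip_hole_extent(inode, map->m_lblk, map->m_len);
        if (next - map->m_lblk < map->m_len)
                map->m_len = next - map->m_lblk;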
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 32 ++++++++++++++++++++++++++++++++
fs/ext4/extents_status.h | 2 ++
include/trace/events/ext4.h | 28 ++++++++++++++++++++++++++++
3 files changed, 62 insertions(+)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 6f7de14c0fa8..1b1b1a8848a8 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -944,6 +944,38 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
write_unlock(&EXT4_I(inode)->i_es_lock);
}
+/*
+ * ext4_es_skip_hole_extent() skips hole extents and looks up the next
+ * delayed/unwritten/mapped extent in the extent status tree, from lblk
+ * to the end of the search range.
+ */
+ext4_lblk_t ext4_es_skip_hole_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct extent_status *es = NULL;
+ ext4_lblk_t next_lblk;
+ struct rb_node *node;
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+
+ while (es && es->es_lblk < lblk + len) {
+ if (!ext4_es_is_hole(es))
+ break;
+ node = rb_next(&es->rb_node);
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ if (!es || es->es_lblk >= lblk + len)
+ next_lblk = lblk + len;
+ else
+ next_lblk = es->es_lblk;
+
+ trace_ext4_es_skip_hole_extent(inode, lblk, len, next_lblk);
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ return next_lblk;
+}
+
/*
* ext4_es_lookup_extent() looks up an extent in extent status tree.
*
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d9847a4a25db..4f69322dd626 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -139,6 +139,8 @@ extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
+ext4_lblk_t ext4_es_skip_hole_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t *next_lblk,
struct extent_status *es);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 65029dfb92fb..84421cecec0b 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2291,6 +2291,34 @@ TRACE_EVENT(ext4_es_find_extent_range_exit,
__entry->pblk, show_extent_status(__entry->status))
);
+TRACE_EVENT(ext4_es_skip_hole_extent,
+ TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_lblk_t next_lblk),
+
+ TP_ARGS(inode, lblk, len, next_lblk),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ext4_lblk_t, lblk )
+ __field( ext4_lblk_t, len )
+ __field( ext4_lblk_t, next )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->lblk = lblk;
+ __entry->len = len;
+ __entry->next = next_lblk;
+ ),
+
+ TP_printk("dev %d,%d ino %lu [%u/%u) next_lblk %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino, __entry->lblk,
+ __entry->len, __entry->next)
+);
+
TRACE_EVENT(ext4_es_lookup_extent_enter,
TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 02/18] ext4: make ext4_es_lookup_extent() return the next extent if not found
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 01/18] ext4: introduce ext4_es_skip_hole_extent() to skip hole extents Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 03/18] ext4: correct the hole length returned by ext4_map_blocks() Zhang Yi
` (16 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Make ext4_es_lookup_extent() return the next extent entry if it cannot
find the extent that lblk belongs to. This is useful for estimating
and limiting the length of a potential hole in ext4_map_blocks().
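A caller-side sketch of the new cache-miss semantics (illustrative;
hole_len is a hypothetical variable):

        struct extent_status es;

        if (!ext4_es_lookup_extent(inode, lblk, NULL, &es)) {
                /*
                 * Cache miss: es now describes the next extent after
                 * lblk (if one exists), so it can bound the length of
                 * a candidate hole starting at lblk.
                 */
                if (es.es_len && es.es_lblk > lblk)
                        hole_len = min(hole_len, es.es_lblk - lblk);
        }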
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 21 ++++++++-------------
1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 1b1b1a8848a8..19a0cc904cd8 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1012,19 +1012,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
goto out;
}
- node = tree->root.rb_node;
- while (node) {
- es1 = rb_entry(node, struct extent_status, rb_node);
- if (lblk < es1->es_lblk)
- node = node->rb_left;
- else if (lblk > ext4_es_end(es1))
- node = node->rb_right;
- else {
- found = 1;
- break;
- }
- }
-
+ es1 = __es_tree_search(&tree->root, lblk);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len))
+ found = 1;
out:
stats = &EXT4_SB(inode->i_sb)->s_es_stats;
if (found) {
@@ -1045,6 +1035,11 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
*next_lblk = 0;
}
} else {
+ if (es1) {
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
percpu_counter_inc(&stats->es_stats_cache_misses);
}
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 03/18] ext4: correct the hole length returned by ext4_map_blocks()
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 01/18] ext4: introduce ext4_es_skip_hole_extent() to skip hole extents Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 02/18] ext4: make ext4_es_lookup_extent() return the next extent if not found Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 04/18] ext4: add a hole extent entry in cache after punch Zhang Yi
` (15 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
In ext4_map_blocks(), if we cannot find a range of mapping in the
extents cache, we call ext4_ext_map_blocks() to search the real
extent path. But if the tail of the queried range is overlapped by a
delayed extent, we cannot find it on the real extent path, so the
returned hole length could be larger than it really is.
             |    querying map      |
             v                      v
 |-----------{--------------}{----------------}-----...
 ^           ^              ^^                ^
 |  uncached |  hole extent || delayed extent |
We have to adjust the mapping length to the start of the next
non-hole extent before searching the extent path.
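For example (made-up numbers): if the query is map->m_lblk = 100 and
map->m_len = 64, and the cache holds a hole extent [100, 120) followed
by a delayed extent starting at block 120, ext4_es_skip_hole_extent()
returns 120 and m_len is trimmed to 20, so the hole is reported as 20
blocks instead of spilling over the delayed extent.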
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ce35f1c8b0a..94e7b8500878 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -479,6 +479,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct extent_status es;
+ ext4_lblk_t next;
int retval;
int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
@@ -502,8 +503,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ goto uncached;
+
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -532,6 +535,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
#endif
goto found;
}
+ /*
+ * Not found, maybe a hole; adjust the map length before
+ * searching the real extent path. This prevents an incorrect
+ * hole length from being returned if the following entries
+ * are delayed-only extents.
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CREATE) && es.es_lblk > map->m_lblk) {
+ next = es.es_lblk;
+ if (ext4_es_is_hole(&es))
+ next = ext4_es_skip_hole_extent(inode, map->m_lblk,
+ map->m_len);
+ retval = next - map->m_lblk;
+ if (map->m_len > retval)
+ map->m_len = retval;
+ }
+
+uncached:
/*
* In the query cache no-wait mode, nothing we can do more if we
* cannot find extent in the cache.
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 04/18] ext4: add a hole extent entry in cache after punch
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (2 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 03/18] ext4: correct the hole length returned by ext4_map_blocks() Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 05/18] ext4: make ext4_map_blocks() distinguish delayed only mapping Zhang Yi
` (14 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
In order to cache hole extents in the extent status tree and keep
holes as continuous as possible, add a hole extent entry to the cache
after punching a hole. This reduces the uncached gaps between
otherwise contiguous hole extent entries.
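A rough sketch of the effect (illustrative, not part of the patch):

        struct extent_status es;

        /* punch [first_block, stop_block) and cache it as one hole */
        ext4_ext_put_gap_in_cache(inode, first_block,
                                  stop_block - first_block);

        /* a later lookup in that range hits the cached hole entry */
        if (ext4_es_lookup_extent(inode, first_block, NULL, &es) &&
            ext4_es_is_hole(&es))
                ; /* no extent tree walk is needed */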
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 3 +++
fs/ext4/extents.c | 5 ++---
fs/ext4/inode.c | 2 ++
3 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9418359b1d9d..c2ca28c6ec38 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3681,6 +3681,9 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
+extern void ext4_ext_put_gap_in_cache(struct inode *inode,
+ ext4_lblk_t hole_start,
+ ext4_lblk_t hole_len);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 202c76996b62..52bad225e3c8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2275,9 +2275,8 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
* calculate boundaries of the gap that the requested block fits into
* and cache this gap
*/
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
- ext4_lblk_t hole_len)
+void ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
+ ext4_lblk_t hole_len)
{
struct extent_status es;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 94e7b8500878..3908ce7f6fb8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4034,6 +4034,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = ext4_ind_remove_space(handle, inode, first_block,
stop_block);
+ ext4_ext_put_gap_in_cache(inode, first_block,
+ stop_block - first_block);
up_write(&EXT4_I(inode)->i_data_sem);
}
ext4_fc_track_range(handle, inode, first_block, stop_block);
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 05/18] ext4: make ext4_map_blocks() distinguish delayed only mapping
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (3 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 04/18] ext4: add a hole extent entry in cache after punch Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 06/18] ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC mapping type Zhang Yi
` (13 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Add a new map flag, EXT4_MAP_DELAYED, to indicate that the mapped
range is delayed-allocated only (not unwritten), and make
ext4_map_blocks() distinguish it instead of mixing it with holes.
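A caller-side sketch (illustrative fragment) of checking the new flag:

        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret == 0 && (map.m_flags & EXT4_MAP_DELAYED)) {
                /*
                 * Delayed-allocated only range: the data lives in the
                 * page cache, no blocks have been allocated yet, and
                 * map.m_pblk is 0.
                 */
        }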
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 4 +++-
fs/ext4/inode.c | 2 ++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c2ca28c6ec38..b5026090ad6f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -252,8 +252,10 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED BIT(BH_Delay)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_DELAYED)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3908ce7f6fb8..74b41566d31a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -518,6 +518,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
map->m_pblk = 0;
+ map->m_flags |= ext4_es_is_delayed(&es) ?
+ EXT4_MAP_DELAYED : 0;
retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 06/18] ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC mapping type
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (4 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 05/18] ext4: make ext4_map_blocks() distinguish delayed only mapping Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 07/18] ext4: allow reserving multi-delayed blocks Zhang Yi
` (12 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Since ext4_map_blocks() can now recognize a delayed-allocated-only
extent, make ext4_set_iomap() recognize it as well, and remove the
now-redundant separate check in ext4_iomap_begin_report().
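With this change the report path collapses to the following sketch
(assuming the queried range is delayed-allocated only):

        ret = ext4_map_blocks(NULL, inode, &map, 0);
        ext4_set_iomap(inode, iomap, &map, offset, length, flags);
        /* iomap->type == IOMAP_DELALLOC, iomap->addr == IOMAP_NULL_ADDR */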
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 32 +++-----------------------------
1 file changed, 3 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 74b41566d31a..17fe2bd83617 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3279,6 +3279,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->addr = (u64) map->m_pblk << blkbits;
if (flags & IOMAP_DAX)
iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3441,35 +3444,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_end = ext4_iomap_end,
};
-static bool ext4_iomap_is_delalloc(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct extent_status es;
- ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map->m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end)
- return false;
-
- if (es.es_lblk > map->m_lblk) {
- map->m_len = es.es_lblk - map->m_lblk;
- return false;
- }
-
- offset = map->m_lblk - es.es_lblk;
- map->m_len = es.es_len - offset;
-
- return true;
-}
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
int ret;
- bool delalloc = false;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3510,13 +3489,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
- if (ret == 0)
- delalloc = ext4_iomap_is_delalloc(inode, &map);
-
set_iomap:
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
- if (delalloc && iomap->type == IOMAP_HOLE)
- iomap->type = IOMAP_DELALLOC;
return 0;
}
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 07/18] ext4: allow reserving multi-delayed blocks
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (5 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 06/18] ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC mapping type Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 08/18] ext4: add a new iomap aops for regular file's buffered IO path Zhang Yi
` (11 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Introduce a new helper, ext4_insert_delayed_blocks(), to support
adding multiple delayed blocks into the extent status tree; it does
not support the bigalloc feature yet. Also rename
ext4_es_insert_delayed_block() to ext4_es_insert_delayed_extent(),
which matches the naming style of the other
ext4_es_{insert|remove}_extent() helpers.
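A minimal usage sketch (illustrative; assumes a non-bigalloc
filesystem, since s_cluster_ratio != 1 currently returns -EOPNOTSUPP):

        /* reserve quota/space and record an 8-block delayed extent */
        err = ext4_insert_delayed_blocks(inode, map->m_lblk, 8);
        if (err)        /* -ENOSPC if the reservation failed */
                return err;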
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 20 +++++----
fs/ext4/extents_status.h | 4 +-
fs/ext4/inode.c | 82 ++++++++++++++++++++++++-------------
include/trace/events/ext4.h | 12 +++---
4 files changed, 74 insertions(+), 44 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 19a0cc904cd8..c8783b4009ec 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -2043,19 +2043,21 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds delayed blocks to the extents status
+ * tree, adding a pending reservation where
+ * needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - first logical block to be added
+ * @len - length of blocks to be added
* @allocated - indicates whether a physical cluster has been allocated for
* the logical cluster that contains the block
*/
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int len, bool allocated)
{
struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err1 = 0;
int err2 = 0;
struct extent_status *es1 = NULL;
@@ -2068,9 +2070,9 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
lblk, inode->i_ino);
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+ trace_ext4_es_insert_delayed_extent(inode, &newes, allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2081,7 +2083,7 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
es2 = __es_alloc_extent(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4f69322dd626..4ccc965a9876 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -251,8 +251,8 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int len, bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17fe2bd83617..e92b205b3b24 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1462,7 +1462,7 @@ static int ext4_journalled_write_end(struct file *file,
/*
* Reserve space for a single cluster
*/
-static int ext4_da_reserve_space(struct inode *inode)
+static int ext4_da_reserve_space(struct inode *inode, unsigned int len)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1473,18 +1473,18 @@ static int ext4_da_reserve_space(struct inode *inode)
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, len));
if (ret)
return ret;
spin_lock(&ei->i_block_reservation_lock);
- if (ext4_claim_free_clusters(sbi, 1, 0)) {
+ if (ext4_claim_free_clusters(sbi, len, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, len));
return -ENOSPC;
}
- ei->i_reserved_data_blocks++;
- trace_ext4_da_reserve_space(inode);
+ ei->i_reserved_data_blocks += len;
+ trace_ext4_da_reserve_space(inode, len);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1630,6 +1630,37 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
+
+/*
+ * ext4_insert_delayed_blocks - adds multi-delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making a pending
+ * reservation where needed.
+ *
+ * @inode - file containing the newly added block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ /* TODO: support bigalloc and replace ext4_insert_delayed_block(). */
+ if (sbi->s_cluster_ratio != 1)
+ return -EOPNOTSUPP;
+
+ ret = ext4_da_reserve_space(inode, len);
+ if (ret) /* ENOSPC */
+ return ret;
+
+ ext4_es_insert_delayed_extent(inode, lblk, len, false);
+ return 0;
+}
+
/*
* ext4_insert_delayed_block - adds a delayed block to the extents status
* tree, incrementing the reserved cluster/block
@@ -1647,10 +1678,13 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
int ret;
bool allocated = false;
+ if (sbi->s_cluster_ratio == 1)
+ return ext4_insert_delayed_blocks(inode, lblk, 1);
+
/*
- * If the cluster containing lblk is shared with a delayed,
- * written, or unwritten extent in a bigalloc file system, it's
- * already been accounted for and does not need to be reserved.
+ * For bigalloc, if the cluster containing lblk is shared with a
+ * delayed, written, or unwritten extent in a bigalloc file system,
+ * it's already been accounted for and does not need to be reserved.
* A pending reservation must be made for the cluster if it's
* shared with a written or unwritten extent and doesn't already
* have one. Written and unwritten extents can be purged from the
@@ -1658,32 +1692,24 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* it's necessary to examine the extent tree if a search of the
* extents status tree doesn't get a match.
*/
- if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) {
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret != 0) /* ENOSPC */
return ret;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else {
- allocated = true;
- }
} else {
allocated = true;
}
+ } else {
+ allocated = true;
}
}
- ext4_es_insert_delayed_block(inode, lblk, allocated);
+ ext4_es_insert_delayed_extent(inode, lblk, 1, allocated);
return 0;
}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 84421cecec0b..6b871d42b259 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -1249,14 +1249,15 @@ TRACE_EVENT(ext4_da_update_reserve_space,
);
TRACE_EVENT(ext4_da_reserve_space,
- TP_PROTO(struct inode *inode),
+ TP_PROTO(struct inode *inode, int reserved_blocks),
- TP_ARGS(inode),
+ TP_ARGS(inode, reserved_blocks),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u64, i_blocks )
+ __field( int, reserved_blocks )
__field( int, reserved_data_blocks )
__field( __u16, mode )
),
@@ -1265,16 +1266,17 @@ TRACE_EVENT(ext4_da_reserve_space,
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->i_blocks = inode->i_blocks;
+ __entry->reserved_blocks = reserved_blocks;
__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
__entry->mode = inode->i_mode;
),
- TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
+ TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserved_blocks %u "
"reserved_data_blocks %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->mode, __entry->i_blocks,
- __entry->reserved_data_blocks)
+ __entry->reserved_blocks, __entry->reserved_data_blocks)
);
TRACE_EVENT(ext4_da_release_space,
@@ -2509,7 +2511,7 @@ TRACE_EVENT(ext4_es_shrink,
__entry->scan_time, __entry->nr_skipped, __entry->retried)
);
-TRACE_EVENT(ext4_es_insert_delayed_block,
+TRACE_EVENT(ext4_es_insert_delayed_extent,
TP_PROTO(struct inode *inode, struct extent_status *es,
bool allocated),
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 08/18] ext4: add a new iomap aops for regular file's buffered IO path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (6 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 07/18] ext4: allow reserving multi-delayed blocks Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 09/18] ext4: implement buffered read iomap path Zhang Yi
` (10 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Introduce a new iomap address_space_operations structure,
ext4_iomap_aops, which is used to support the regular file's buffered
IO path. Most of its callbacks are already implemented; the
.read_folio, .readahead and .writepages callbacks are implemented in
later patches.
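Presumably a later patch in this series selects these aops per inode;
a hypothetical sketch (EXT4_STATE_BUFFERED_IOMAP is only introduced by
patch 13, and the ext4_set_aops() placement is an assumption):

        /* hypothetical wiring inside ext4_set_aops() */
        if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
                inode->i_mapping->a_ops = &ext4_iomap_aops;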
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e92b205b3b24..4eef3828d5fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3525,6 +3525,22 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
+static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
+{
+ return 0;
+}
+
+static void ext4_iomap_readahead(struct readahead_control *rac)
+{
+
+}
+
+static int ext4_iomap_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return 0;
+}
+
/*
* For data=journal mode, folio should be marked dirty only when it was
* writeably mapped. When that happens, it was already attached to the
@@ -3614,6 +3630,21 @@ static const struct address_space_operations ext4_da_aops = {
.swap_activate = ext4_iomap_swap_activate,
};
+static const struct address_space_operations ext4_iomap_aops = {
+ .read_folio = ext4_iomap_read_folio,
+ .readahead = ext4_iomap_readahead,
+ .writepages = ext4_iomap_writepages,
+ .dirty_folio = iomap_dirty_folio,
+ .bmap = ext4_bmap,
+ .invalidate_folio = iomap_invalidate_folio,
+ .release_folio = iomap_release_folio,
+ .direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
+};
+
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 09/18] ext4: implement buffered read iomap path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (7 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 08/18] ext4: add a new iomap aops for regular file's buffered IO path Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 10/18] ext4: implement buffered write " Zhang Yi
` (9 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Implement the block-mapping helper ext4_iomap_buffered_io_begin(). It
queries the block mapping and uses ext4_set_iomap() to convert the
ext4 map to an iomap, so that the buffered read path can be supported
by the iomap framework.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 37 +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4eef3828d5fd..4c206cf37a49 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3525,14 +3525,47 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
-static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
+static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
+ int ret;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
+
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
+
+ /*
+ * Calculate the first and last logical blocks respectively.
+ */
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
}
-static void ext4_iomap_readahead(struct readahead_control *rac)
+const struct iomap_ops ext4_iomap_read_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+};
+
+static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
{
+ return iomap_read_folio(folio, &ext4_iomap_read_ops);
+}
+static void ext4_iomap_readahead(struct readahead_control *rac)
+{
+ iomap_readahead(rac, &ext4_iomap_read_ops);
}
static int ext4_iomap_writepages(struct address_space *mapping,
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 10/18] ext4: implement buffered write iomap path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (8 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 09/18] ext4: implement buffered read iomap path Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend Zhang Yi
` (8 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Implement the buffered write path both with and without the delayed
allocation feature, and inherit from the buffer_head path the logic
that falls back to nodelalloc when free space is about to run out.
After switching to iomap, we support mapping multiple blocks at a
time, which could bring a lot of performance gains.
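The write path would presumably be driven from ->write_iter with these
ops (sketch; the file_operations wiring is not part of this patch, and
iocb/from are the usual kiocb and iov_iter arguments):

        /* hypothetical caller in ext4's buffered write_iter path */
        ret = iomap_file_buffered_write(iocb, from,
                                        &ext4_iomap_buffered_write_ops);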
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 207 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4c206cf37a49..9229297e1efc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3525,13 +3525,154 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
+static int ext4_iomap_da_map_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct extent_status es;
+ unsigned int status;
+ ext4_lblk_t next;
+ int mapped_len;
+ int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+ map->m_flags = 0;
+ ext_debug(inode, "max_blocks %u, logical block %llu\n", map->m_len,
+ (unsigned long long)map->m_lblk);
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ int es_len = es.es_len - (map->m_lblk - es.es_lblk);
+
+ map->m_len = min_t(unsigned int, map->m_len, es_len);
+ if (ext4_es_is_delonly(&es)) {
+ map->m_pblk = 0;
+ map->m_flags |= EXT4_MAP_DELAYED;
+ return 0;
+ }
+ if (ext4_es_is_hole(&es)) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ goto add_delayed;
+ }
+
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG();
+
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+ /* Already delayed */
+ if (ext4_es_is_delayed(&es))
+ return 0;
+
+ down_read(&EXT4_I(inode)->i_data_sem);
+ goto insert_extent;
+ }
+
+ /*
+ * No cached extent was found; adjust the length if the range
+ * has been partially allocated.
+ */
+ if (es.es_lblk > map->m_lblk &&
+ es.es_lblk < map->m_lblk + map->m_len) {
+ next = es.es_lblk;
+ if (ext4_es_is_hole(&es))
+ next = ext4_es_skip_hole_extent(inode, map->m_lblk,
+ map->m_len);
+ map->m_len = next - map->m_lblk;
+ }
+
+ /*
+ * Try to see if we can get blocks without requesting new file
+ * system blocks.
+ */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ mapped_len = ext4_ext_map_blocks(NULL, inode, map, 0);
+ else
+ mapped_len = ext4_ind_map_blocks(NULL, inode, map, 0);
+ if (mapped_len < 0) {
+ ret = mapped_len;
+ goto out_unlock;
+ }
+ if (mapped_len == 0)
+ goto add_delayed;
+
+ if (unlikely(mapped_len != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode %lu: "
+ "retval %d != map->m_len %d",
+ inode->i_ino, mapped_len, map->m_len);
+ WARN_ON(1);
+ }
+
+insert_extent:
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (status == EXTENT_STATUS_UNWRITTEN)
+ status |= EXTENT_STATUS_DELAYED;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ goto out_unlock;
+add_delayed:
+ ret = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
+out_unlock:
+ up_read((&EXT4_I(inode)->i_data_sem));
+ return ret;
+}
+
+static int ext4_iomap_noda_map_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ handle_t *handle;
+ int ret, needed_blocks;
+ int flags;
+
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason.
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (ext4_should_dioread_nolock(inode))
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+ else
+ flags = EXT4_GET_BLOCKS_CREATE;
+
+ ret = ext4_map_blocks(handle, inode, map, flags);
+ if (ret < 0) {
+ ext4_journal_stop(handle);
+ return ret;
+ }
+
+ return 0;
+}
+
+#define IOMAP_F_EXT4_NONDELALLOC IOMAP_F_PRIVATE
+
static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
- int ret;
+ int ret, retries = 0;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ bool no_delalloc = false;
+
+ if ((flags & IOMAP_WRITE) &&
+ unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3539,6 +3680,7 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
+retry:
/*
* Calculate the first and last logical blocks respectively.
*/
@@ -3546,14 +3688,77 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (flags & IOMAP_WRITE) {
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ ret = ext4_iomap_da_map_blocks(inode, &map);
+ } else {
+ ret = ext4_iomap_noda_map_blocks(inode, &map);
+ no_delalloc = true;
+ }
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+ if (no_delalloc)
+ iomap->flags |= IOMAP_F_EXT4_NONDELALLOC;
+
return 0;
}
+static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
+ loff_t length, ssize_t written,
+ unsigned flags, struct iomap *iomap)
+{
+ handle_t *handle;
+ int ret = 0, ret2;
+
+ if (!(flags & IOMAP_WRITE))
+ return 0;
+ if (!(iomap->flags & IOMAP_F_EXT4_NONDELALLOC))
+ return 0;
+
+ handle = ext4_journal_current_handle();
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED) {
+ ext4_update_i_disksize(inode, inode->i_size);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ }
+
+ /*
+ * If we have allocated more blocks than we copied, we will
+ * have blocks allocated outside inode->i_size, so truncate
+ * them.
+ */
+ if (offset + length > inode->i_size)
+ ext4_orphan_add(handle, inode);
+
+ ret2 = ext4_journal_stop(handle);
+ ret = ret ? ret : ret2;
+
+ if (offset + length > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+ return ret;
+}
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
const struct iomap_ops ext4_iomap_read_ops = {
.iomap_begin = ext4_iomap_buffered_io_begin,
};
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (9 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 10/18] ext4: implement buffered write " Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 15:36 ` Christoph Hellwig
2023-11-23 12:51 ` [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation Zhang Yi
` (7 subsequent siblings)
18 siblings, 1 reply; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Add a private field to the iomap_ioend structure, letting filesystems
pass something they need from .prepare_ioend() to the IO-end handler.
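A sketch of the intended use (hypothetical handlers; only the
io_private field itself is added by this patch):

        static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend,
                                            int status)
        {
                /* stash whatever the IO-end handler will need later */
                ioend->io_private = EXT4_I(ioend->io_inode);
                return status;
        }

        static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
        {
                struct ext4_inode_info *ei = ioend->io_private;
                /* ... use ei at IO completion ... */
        }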
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/iomap/buffered-io.c | 1 +
include/linux/iomap.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2bc0aa23fde3..fd4d43bafd1b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1676,6 +1676,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
ioend->io_offset = offset;
ioend->io_bio = bio;
ioend->io_sector = sector;
+ ioend->io_private = NULL;
return ioend;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..8b3296a5474d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -300,6 +300,7 @@ struct iomap_ioend {
sector_t io_sector; /* start sector of ioend */
struct bio *io_bio; /* bio being built */
struct bio io_inline_bio; /* MUST BE LAST! */
+ void *io_private; /* fs private pointer */
};
struct iomap_writeback_ops {
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend
2023-11-23 12:51 ` [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend Zhang Yi
@ 2023-11-23 15:36 ` Christoph Hellwig
2023-11-24 1:36 ` Zhang Yi
0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2023-11-23 15:36 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, tytso, adilger.kernel, jack,
ritesh.list, hch, djwong, yi.zhang, chengzhihao1, yukuai3
On Thu, Nov 23, 2023 at 08:51:13PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Add a private parameter to iomap_ioend structure, letting filesystems
> can pass something they needed from .prepare_ioend() to IO end.
On its own this looks fine. Note that I have a series that I probably
should send out ASAP:
http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/iomap-map-multiple-blocks
that makes each ioend only have the embedded bio, and bi_private in that
is unused, so you could just use that if we go down that route.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend
2023-11-23 15:36 ` Christoph Hellwig
@ 2023-11-24 1:36 ` Zhang Yi
0 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-24 1:36 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-ext4, linux-fsdevel, tytso, adilger.kernel, jack,
ritesh.list, djwong, yi.zhang, chengzhihao1, yukuai3
On 2023/11/23 23:36, Christoph Hellwig wrote:
> On Thu, Nov 23, 2023 at 08:51:13PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Add a private parameter to iomap_ioend structure, letting filesystems
>> can pass something they needed from .prepare_ioend() to IO end.
>
> On it's own this looks fine. Note that I have a series that I probably
> should send out ASAP:
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/iomap-map-multiple-blocks
>
> that makes each ioend only have the embdeed bio, and bi_private in that
> is unused, so you could just use that if we go down that route.
>
Thanks for this improvement. I will analyze the changes in this series
in depth.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (10 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 15:34 ` Christoph Hellwig
2023-11-23 12:51 ` [RFC PATCH 13/18] ext4: implement writeback iomap path Zhang Yi
` (6 subsequent siblings)
18 siblings, 1 reply; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Increasing i_size in iomap_zero_range() does not look necessary; the
caller should handle it. In particular, when truncating a partial
block, we should not increase i_size beyond the new EOF here. This
doesn't affect xfs and gfs2 now, because they reset the new file size
after zeroing out, so a brief increase in i_size doesn't matter. But
it will affect ext4, because ext4 sets the file size before
truncating. So avoid increasing i_size if it's not a write path.
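A sketch of the ordering that matters for ext4 (simplified; newsize,
blocksize and ops are placeholders, not verbatim ext4 code):

        /* ext4 truncate-down: the new, smaller size is set first */
        i_size_write(inode, newsize);

        /*
         * Then the partial tail block is zeroed.  Without this fix,
         * iomap_write_end() inside iomap_zero_range() could briefly
         * push i_size back beyond newsize.
         */
        iomap_zero_range(inode, newsize,
                         blocksize - (newsize & (blocksize - 1)),
                         NULL, ops);    /* ops: the fs's iomap_ops */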
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/iomap/buffered-io.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fd4d43bafd1b..3b9ba390dd1b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -852,13 +852,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
* cache. It's up to the file system to write the updated size to disk,
* preferably after I/O completion so that no stale data is exposed.
*/
- if (pos + ret > old_size) {
+ if ((iter->flags & IOMAP_WRITE) && pos + ret > old_size) {
i_size_write(iter->inode, pos + ret);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
__iomap_put_folio(iter, pos, ret, folio);
- if (old_size < pos)
+ if ((iter->flags & IOMAP_WRITE) && old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
if (ret < len)
iomap_write_failed(iter->inode, pos + ret, len - ret);
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation
2023-11-23 12:51 ` [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation Zhang Yi
@ 2023-11-23 15:34 ` Christoph Hellwig
2023-11-24 1:41 ` Zhang Yi
2023-11-30 12:26 ` Zhang Yi
0 siblings, 2 replies; 26+ messages in thread
From: Christoph Hellwig @ 2023-11-23 15:34 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, tytso, adilger.kernel, jack,
ritesh.list, hch, djwong, yi.zhang, chengzhihao1, yukuai3
On Thu, Nov 23, 2023 at 08:51:14PM +0800, Zhang Yi wrote:
> index fd4d43bafd1b..3b9ba390dd1b 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -852,13 +852,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> * cache. It's up to the file system to write the updated size to disk,
> * preferably after I/O completion so that no stale data is exposed.
> */
> - if (pos + ret > old_size) {
> + if ((iter->flags & IOMAP_WRITE) && pos + ret > old_size) {
> i_size_write(iter->inode, pos + ret);
> iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> }
> __iomap_put_folio(iter, pos, ret, folio);
>
> - if (old_size < pos)
> + if ((iter->flags & IOMAP_WRITE) && old_size < pos)
> pagecache_isize_extended(iter->inode, old_size, pos);
> if (ret < len)
> iomap_write_failed(iter->inode, pos + ret, len - ret);
I agree with your rationale, but I hate how this code ends up
looking. In many ways iomap_write_end seems like the wrong
place to update the inode size anyway. I've not done a deep
analysis, but I think there shouldn't really be any major blocker
to only setting IOMAP_F_SIZE_CHANGED in iomap_write_end, and then
move updating i_size and calling pagecache_isize_extended to
iomap_write_iter.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation
2023-11-23 15:34 ` Christoph Hellwig
@ 2023-11-24 1:41 ` Zhang Yi
2023-11-30 12:26 ` Zhang Yi
1 sibling, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-24 1:41 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-ext4, linux-fsdevel, tytso, adilger.kernel, jack,
ritesh.list, djwong, yi.zhang, chengzhihao1, yukuai3
On 2023/11/23 23:34, Christoph Hellwig wrote:
> On Thu, Nov 23, 2023 at 08:51:14PM +0800, Zhang Yi wrote:
>> index fd4d43bafd1b..3b9ba390dd1b 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -852,13 +852,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
>> * cache. It's up to the file system to write the updated size to disk,
>> * preferably after I/O completion so that no stale data is exposed.
>> */
>> - if (pos + ret > old_size) {
>> + if ((iter->flags & IOMAP_WRITE) && pos + ret > old_size) {
>> i_size_write(iter->inode, pos + ret);
>> iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
>> }
>> __iomap_put_folio(iter, pos, ret, folio);
>>
>> - if (old_size < pos)
>> + if ((iter->flags & IOMAP_WRITE) && old_size < pos)
>> pagecache_isize_extended(iter->inode, old_size, pos);
>> if (ret < len)
>> iomap_write_failed(iter->inode, pos + ret, len - ret);
>
> I agree with your rationale, but I hate how this code ends up
> looking. In many ways iomap_write_end seems like the wrong
> place to update the inode size anyway. I've not done a deep
> analysis, but I think there shouldn't really be any major blocker
> to only setting IOMAP_F_SIZE_CHANGED in iomap_write_end, and then
> move updating i_size and calling pagecache_isize_extended to
> iomap_write_iter.
>
Yeah, makes sense. It looks fine at first glance; I will check
whether there are any side effects.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation
2023-11-23 15:34 ` Christoph Hellwig
2023-11-24 1:41 ` Zhang Yi
@ 2023-11-30 12:26 ` Zhang Yi
1 sibling, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-30 12:26 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-ext4, linux-fsdevel, tytso, adilger.kernel, jack,
ritesh.list, djwong, yi.zhang, chengzhihao1, yukuai3
On 2023/11/23 23:34, Christoph Hellwig wrote:
> On Thu, Nov 23, 2023 at 08:51:14PM +0800, Zhang Yi wrote:
>> index fd4d43bafd1b..3b9ba390dd1b 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -852,13 +852,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
>> * cache. It's up to the file system to write the updated size to disk,
>> * preferably after I/O completion so that no stale data is exposed.
>> */
>> - if (pos + ret > old_size) {
>> + if ((iter->flags & IOMAP_WRITE) && pos + ret > old_size) {
>> i_size_write(iter->inode, pos + ret);
>> iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
>> }
>> __iomap_put_folio(iter, pos, ret, folio);
>>
>> - if (old_size < pos)
>> + if ((iter->flags & IOMAP_WRITE) && old_size < pos)
>> pagecache_isize_extended(iter->inode, old_size, pos);
>> if (ret < len)
>> iomap_write_failed(iter->inode, pos + ret, len - ret);
>
> I agree with your rationale, but I hate how this code ends up
> looking. In many ways iomap_write_end seems like the wrong
> place to update the inode size anyway. I've not done a deep
> analysis, but I think there shouldn't really be any major blocker
> to only setting IOMAP_F_SIZE_CHANGED in iomap_write_end, and then
> move updating i_size and calling pagecache_isize_extended to
> iomap_write_iter.
>
Thinking about it in depth, I think we cannot move the i_size update
to iomap_write_iter(), because we have to do it under the folio lock;
otherwise, once we unlock the folio, the writeback process could
start writing back and call folio_zero_segment() to zero out the
valid data beyond the not-yet-updated i_size. The only option would
be to move __iomap_put_folio() out together, but I suppose that's not
a good way.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [RFC PATCH 13/18] ext4: implement writeback iomap path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (11 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 14/18] ext4: implement zero_range " Zhang Yi
` (5 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Implement the writeback iomap path and the journalled data write path
in data=ordered mode. This includes the .map_blocks() and
.prepare_ioend() callbacks in iomap_writeback_ops; most of the logic
is inherited from ext4_writepages() and
ext4_normal_submit_inode_data_buffers(), and mpage_map_one_extent()
is modified and reused to save some code. At the same time, we are
not able to switch buffered IO to iomap at once, so introduce a flag,
EXT4_STATE_BUFFERED_IOMAP, to indicate whether an inode uses the
traditional buffer_head path or the iomap path.
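Presumably the callbacks built here are driven from .writepages
roughly as in the sketch below (ext4_iomap_writeback_ops is an assumed
name for the iomap_writeback_ops table defined later in this patch):

        static int ext4_iomap_writepages(struct address_space *mapping,
                                         struct writeback_control *wbc)
        {
                struct ext4_writeback_ctx ewpc = { .wbc = wbc, .can_map = 1 };

                return iomap_writepages(mapping, wbc, &ewpc.ctx,
                                        &ext4_iomap_writeback_ops);
        }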
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 5 +
fs/ext4/inode.c | 262 +++++++++++++++++++++++++++++++++++++++++-----
fs/ext4/page-io.c | 74 +++++++++++++
fs/ext4/super.c | 2 +
4 files changed, 318 insertions(+), 25 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b5026090ad6f..65373d53ba6a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1136,6 +1136,8 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
+ struct list_head i_iomap_ioend_list;
+ struct work_struct i_iomap_ioend_work;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1900,6 +1902,7 @@ enum {
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
+ EXT4_STATE_BUFFERED_IOMAP, /* Inode use iomap for buffered IO */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -3743,6 +3746,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+extern void ext4_iomap_end_io(struct work_struct *work);
+extern void ext4_iomap_end_bio(struct bio *bio);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9229297e1efc..f72864b9a6b3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -43,6 +43,7 @@
#include <linux/iversion.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"
@@ -2172,10 +2173,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
return err;
}
-static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+static int mpage_map_one_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_io_submit *io)
{
- struct inode *inode = mpd->inode;
- struct ext4_map_blocks *map = &mpd->map;
int get_blocks_flags;
int err, dioread_nolock;
@@ -2207,13 +2208,13 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
return err;
- if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
- if (!mpd->io_submit.io_end->handle &&
+ if (io && dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (!io->io_end->handle &&
ext4_handle_valid(handle)) {
- mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ io->io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(inode, io->io_end);
}
BUG_ON(map->m_len == 0);
@@ -2257,7 +2258,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
return PTR_ERR(io_end_vec);
io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
- err = mpage_map_one_extent(handle, mpd);
+ err = mpage_map_one_extent(handle, inode, map, &mpd->io_submit);
if (err < 0) {
struct super_block *sb = inode->i_sb;
@@ -2822,22 +2823,6 @@ static int ext4_writepages(struct address_space *mapping,
return ret;
}
-int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = jinode->i_dirty_start,
- .range_end = jinode->i_dirty_end,
- };
- struct mpage_da_data mpd = {
- .inode = jinode->i_vfs_inode,
- .wbc = &wbc,
- .can_map = 0,
- };
- return ext4_do_writepages(&mpd);
-}
-
static int ext4_dax_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -3773,10 +3758,237 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
iomap_readahead(rac, &ext4_iomap_read_ops);
}
+struct ext4_writeback_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct writeback_control *wbc;
+ unsigned int can_map:1; /* Can writepages call map blocks? */
+};
+
+static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
+{
+ struct ext4_writeback_ctx *ewpc =
+ container_of(wpc, struct ext4_writeback_ctx, ctx);
+ struct super_block *sb = inode->i_sb;
+ struct journal_s *journal = EXT4_SB(sb)->s_journal;
+ int needed_blocks;
+ struct ext4_map_blocks map;
+ handle_t *handle = NULL;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int index = offset >> blkbits;
+ unsigned int end = ewpc->wbc->range_end >> blkbits;
+ unsigned int len = end - index + 1 ? : UINT_MAX;
+ loff_t new_disksize;
+ bool allocated = false;
+ int ret = 0;
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ /* Check validity of the cached writeback mapping. */
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+retry:
+ map.m_lblk = index;
+ map.m_len = min_t(unsigned int, EXT_UNWRITTEN_MAX_LEN, len);
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+
+ if (!ewpc->can_map &&
+ (map.m_len == 0 || map.m_flags != EXT4_MAP_MAPPED)) {
+		/*
+		 * We should never get here with unmapped blocks when
+		 * doing a journal commit via journal_submit_data_buffers();
+		 * only mapped blocks may be written in data=ordered mode.
+		 */
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ allocated = (map.m_flags & EXT4_MAP_MAPPED) ||
+ ((map.m_flags & EXT4_MAP_UNWRITTEN) &&
+ ext4_should_dioread_nolock(inode));
+ if (allocated) {
+ new_disksize = offset + (map.m_len << blkbits);
+ if (new_disksize <= READ_ONCE(EXT4_I(inode)->i_disksize))
+ goto out;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+
+ if (!allocated) {
+ ret = mpage_map_one_extent(handle, inode, &map, NULL);
+ if (ret < 0) {
+ if (ext4_forced_shutdown(sb))
+ goto out_journal;
+
+			/*
+			 * Retry transient ENOSPC errors; if
+			 * ext4_count_free_clusters() is non-zero, a commit
+			 * should free up blocks.
+			 */
+ if (ret == -ENOSPC && ext4_count_free_clusters(sb)) {
+ ext4_journal_stop(handle);
+ jbd2_journal_force_commit_nested(journal);
+ goto retry;
+ }
+
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with "
+ "max blocks %u with error %d",
+ inode->i_ino, (unsigned long long)map.m_lblk,
+ (unsigned int)map.m_len, -ret);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (ret == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ goto out_journal;
+ }
+ }
+
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ new_disksize = offset + (map.m_len << blkbits);
+ if (new_disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (new_disksize > i_size)
+ new_disksize = i_size;
+ if (new_disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = new_disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ EXT4_ERROR_INODE_ERR(inode, -ret,
+ "Failed to mark inode dirty");
+ }
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ if (!ret)
+ ext4_set_iomap(inode, &wpc->iomap, &map, offset,
+ map.m_len << blkbits, 0);
+	return ret;
+}
+
+static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status)
+{
+ handle_t *handle = NULL;
+ struct inode *inode = ioend->io_inode;
+ int rsv_blocks;
+ int ret;
+
+ if (ioend->io_type != IOMAP_UNWRITTEN)
+ return status;
+
+ ioend->io_bio->bi_end_io = ext4_iomap_end_bio;
+
+	/*
+	 * Reserve enough transaction credits for converting unwritten
+	 * extents at IO completion time.
+	 */
+ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+ ioend->io_size >> inode->i_blkbits);
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, 0, rsv_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "%s: jbd2_start: %ld blocks, ino %lu; err %d\n",
+ __func__, ioend->io_size >> inode->i_blkbits,
+ inode->i_ino, ret);
+ return status ? status : ret;
+ }
+ if (ext4_handle_valid(handle)) {
+ ioend->io_private = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_journal_stop(handle);
+
+ return status;
+}
+
+static const struct iomap_writeback_ops ext4_iomap_writeback_ops = {
+ .map_blocks = ext4_iomap_map_blocks,
+ .prepare_ioend = ext4_iomap_prepare_ioend,
+};
+
+static int ext4_iomap_do_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ext4_writeback_ctx *ewpc)
+{
+ struct inode *inode = mapping->host;
+ long nr_to_write = wbc->nr_to_write;
+ int ret;
+
+ trace_ext4_writepages(inode, wbc);
+ ret = iomap_writepages(mapping, wbc, &ewpc->ctx,
+ &ext4_iomap_writeback_ops);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
+ return ret;
+}
+
static int ext4_iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return 0;
+ struct ext4_writeback_ctx ewpc = {
+ .wbc = wbc,
+ .can_map = 1,
+ };
+ struct super_block *sb = mapping->host->i_sb;
+ int alloc_ctx, ret;
+
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+
+ alloc_ctx = ext4_writepages_down_read(sb);
+ ret = ext4_iomap_do_writepages(mapping, wbc, &ewpc);
+ ext4_writepages_up_read(sb, alloc_ctx);
+
+ return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct inode *inode = jinode->i_vfs_inode;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct mpage_da_data mpd = {
+ .inode = inode,
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+ struct ext4_writeback_ctx ewpc = {
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+		return ext4_iomap_do_writepages(inode->i_mapping,
+						&wbc, &ewpc);
+
+ return ext4_do_writepages(&mpd);
}
/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5cf038..f817fcf8df99 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
@@ -565,3 +566,76 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
return 0;
}
+
+static int ext4_iomap_convert_unwritten_io_end(struct iomap_ioend *ioend)
+{
+ handle_t *handle = ioend->io_private;
+ struct inode *inode = ioend->io_inode;
+ int ret, err;
+
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ }
+
+ ret = ext4_convert_unwritten_extents(handle, ioend->io_inode,
+ ioend->io_offset, ioend->io_size);
+ if (handle) {
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ }
+out:
+ if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to "
+ "written extents -- potential data loss! "
+ "(inode %lu, error %d)", inode->i_ino, ret);
+ }
+ iomap_finish_ioends(ioend, ret);
+ return ret;
+}
+
+/*
+ * Process completed buffered iomap IO, converting unwritten
+ * extents to written extents.
+ */
+void ext4_iomap_end_io(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_iomap_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head ioend_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_replace_init(&ei->i_iomap_ioend_list, &ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&ioend_list)) {
+ ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list);
+ BUG_ON(ioend->io_type != IOMAP_UNWRITTEN);
+ list_del_init(&ioend->io_list);
+ ext4_iomap_convert_unwritten_io_end(ioend);
+ }
+}
+
+void ext4_iomap_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = bio->bi_private;
+ struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb);
+ unsigned long flags;
+
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(ioend->io_type != IOMAP_UNWRITTEN);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&ei->i_iomap_ioend_list))
+ queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work);
+ list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..08a39f364d78 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1422,11 +1422,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_iomap_ioend_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io);
ext4_fc_init_inode(&ei->vfs_inode);
mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 14/18] ext4: implement zero_range iomap path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (12 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 13/18] ext4: implement writeback iomap path Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 15/18] ext4: writeback partial blocks before zero range Zhang Yi
` (4 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Implement the zero_range iomap path: use iomap_zero_range() directly
and add a branch for it in ext4_block_zero_page_range().
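For reference, iomap_zero_range() walks the range with the given
iomap_ops and zeroes whatever is mapped; the optional 'did_zero'
argument (passed as NULL in this patch) reports whether any bytes were
actually zeroed. A minimal usage sketch with hypothetical foo_* names:

static int foo_zero_tail(struct inode *inode, loff_t pos, loff_t len)
{
	bool did_zero = false;
	int ret;

	/* foo_iomap_ops stands in for the fs's buffered iomap ops */
	ret = iomap_zero_range(inode, pos, len, &did_zero, &foo_iomap_ops);
	if (!ret && did_zero)
		pr_debug("zeroed some bytes in [%lld, %lld)\n",
			 pos, pos + len);
	return ret;
}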
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f72864b9a6b3..ca66afd61fb3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4218,6 +4218,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
return err;
}
+static int ext4_iomap_zero_range(struct inode *inode,
+ loff_t from, loff_t length)
+{
+ return iomap_zero_range(inode, from, length, NULL,
+ &ext4_iomap_buffered_write_ops);
+}
+
/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
@@ -4243,6 +4250,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
if (IS_DAX(inode)) {
return dax_zero_range(inode, from, length, NULL,
&ext4_iomap_ops);
+ } else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ return ext4_iomap_zero_range(inode, from, length);
}
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 15/18] ext4: writeback partial blocks before zero range
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (13 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 14/18] ext4: implement zero_range " Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 16/18] ext4: implement mmap iomap path Zhang Yi
` (3 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
When zeroing partial blocks, iomap_zero_iter() skips the zeroing if the
srcmap type is IOMAP_UNWRITTEN. That works fine on XFS, where this type
means the block is purely unwritten and contains no delayed data, but in
ext4 an IOMAP_UNWRITTEN extent may still contain delayed data. Since it
would be hard to change the meaning of this flag in ext4, simply write
back the partial blocks first to make sure the range becomes
IOMAP_MAPPED.
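As a quick standalone check of the edge arithmetic (example numbers,
not taken from the patch): with 4K blocks, offset = 5000 and
len = 10000, only the two partial blocks at the edges get flushed.

#include <stdio.h>

int main(void)
{
	long long blksize = 1 << 12;		/* 4096-byte blocks */
	long long offset = 5000, len = 10000;	/* example values */
	long long head = offset / blksize * blksize;
	long long tail = (offset + len + blksize - 1) / blksize * blksize;

	printf("head flush: [%lld, %lld)\n", head, offset);	  /* [4096, 5000) */
	printf("tail flush: [%lld, %lld)\n", offset + len, tail); /* [15000, 16384) */
	return 0;
}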
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 52bad225e3c8..b502edcf014b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4576,6 +4576,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (ret)
goto out_mutex;
+ ret = filemap_write_and_wait_range(mapping,
+ round_down(offset, 1 << blkbits), offset);
+ if (ret)
+ goto out_mutex;
+
+ ret = filemap_write_and_wait_range(mapping, offset + len,
+ round_up((offset + len), 1 << blkbits));
+ if (ret)
+ goto out_mutex;
}
/* Zero range excluding the unaligned edges */
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 16/18] ext4: implement mmap iomap path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (14 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 15/18] ext4: writeback partial blocks before zero range Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path Zhang Yi
` (2 subsequent siblings)
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Implement the mmap iomap path: use iomap_page_mkwrite() directly for
the .page_mkwrite() callback.
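Reduced to its essentials, an iomap-based .page_mkwrite() handler has
the following shape (a sketch with hypothetical foo_* names; the
handler added below additionally checks for immutable inodes and takes
the invalidate lock):

static vm_fault_t foo_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	/* Map or allocate blocks under the folio and dirty it. */
	ret = iomap_page_mkwrite(vmf, &foo_iomap_ops);
	sb_end_pagefault(inode->i_sb);
	return ret;
}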
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 1 +
fs/ext4/file.c | 6 ++++++
fs/ext4/inode.c | 24 ++++++++++++++++++++++++
3 files changed, 31 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 65373d53ba6a..6b3e34ea58ad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3006,6 +3006,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t ext4_iomap_page_mkwrite(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..15fe65744cba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -798,6 +798,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
.page_mkwrite = ext4_page_mkwrite,
};
+static const struct vm_operations_struct ext4_iomap_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = ext4_iomap_page_mkwrite,
+};
+
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ca66afd61fb3..2efa898403f7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3748,6 +3748,10 @@ const struct iomap_ops ext4_iomap_read_ops = {
.iomap_begin = ext4_iomap_buffered_io_begin,
};
+const struct iomap_ops ext4_iomap_page_mkwrite_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+};
+
static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
{
return iomap_read_folio(folio, &ext4_iomap_read_ops);
@@ -6698,3 +6702,23 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
ext4_journal_stop(handle);
goto out;
}
+
+vm_fault_t ext4_iomap_page_mkwrite(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct address_space *mapping = inode->i_mapping;
+ vm_fault_t ret;
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return VM_FAULT_SIGBUS;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+
+ filemap_invalidate_lock_shared(mapping);
+ ret = iomap_page_mkwrite(vmf, &ext4_iomap_page_mkwrite_ops);
+ filemap_invalidate_unlock_shared(mapping);
+
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (15 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 16/18] ext4: implement mmap iomap path Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path Zhang Yi
2023-11-24 13:57 ` Zhang Yi
2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 18/18] ext4: enable large folio for regular file which has been switched to use iomap Zhang Yi
18 siblings, 1 reply; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Partially enable iomap for the buffered IO path of regular files under
the default mount options and filesystem features. Inline data,
fs-verity, fscrypt, bigalloc, DAX and data=journal mode are not
supported yet; support for those will be added in the future.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/ext4_jbd2.c | 3 ++-
fs/ext4/file.c | 8 +++++++-
fs/ext4/ialloc.c | 3 +++
fs/ext4/inode.c | 31 +++++++++++++++++++++++++++++++
fs/ext4/move_extent.c | 8 ++++++++
6 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6b3e34ea58ad..7ce688cb1b07 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2959,6 +2959,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+extern bool ext4_should_use_buffered_iomap(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3822,6 +3823,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d1a2e6624401..cf25cdda7234 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
+ !ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 15fe65744cba..7e3352b3b752 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -296,7 +296,11 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- ret = generic_perform_write(iocb, from);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ ret = iomap_file_buffered_write(iocb, from,
+ &ext4_iomap_buffered_write_ops);
+ else
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
@@ -823,6 +827,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
vm_flags_set(vma, VM_HUGEPAGE);
+ } else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ vma->vm_ops = &ext4_iomap_file_vm_ops;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b65058d972f9..0aae2810dbf6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
}
}
+ if (ext4_should_use_buffered_iomap(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
ei->i_datasync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b2ab202af57b..f95d4c321cbb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -779,6 +779,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
if (ext4_has_inline_data(inode))
return -ERANGE;
+ if (WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)))
+ return -EINVAL;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
@@ -4121,6 +4123,8 @@ void ext4_set_aops(struct inode *inode)
}
if (IS_DAX(inode))
inode->i_mapping->a_ops = &ext4_dax_aops;
+ else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ inode->i_mapping->a_ops = &ext4_iomap_aops;
else if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
else
@@ -5185,6 +5189,30 @@ static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
return NULL;
}
+bool ext4_should_use_buffered_iomap(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (ext4_has_feature_inline_data(sb))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_bigalloc(sb))
+ return false;
+ if (!IS_DAX(inode))
+ return false;
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
+ return false;
+
+ return true;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -5449,6 +5477,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (ret)
goto bad_inode;
+ if (ext4_should_use_buffered_iomap(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 18a9e7c47975..23b4b77c5af8 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -597,6 +597,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
return -EOPNOTSUPP;
}
+	/* TODO: not supported yet, the block mapping path is not switched */
+ if (ext4_test_inode_state(orig_inode, EXT4_STATE_BUFFERED_IOMAP) ||
+ ext4_test_inode_state(donor_inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported with buffered iomap");
+ return -EOPNOTSUPP;
+ }
+
if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
ext4_msg(orig_inode->i_sb, KERN_ERR,
"Online defrag not supported for encrypted files");
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path
2023-11-23 12:51 ` [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path Zhang Yi
@ 2023-11-24 13:57 ` Zhang Yi
0 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-24 13:57 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, chengzhihao1, yukuai3
This one is redundant and not correct; please look at the next one
in this series. I'm sorry about it.
https://lore.kernel.org/linux-ext4/20231123125121.4064694-19-yi.zhang@huaweicloud.com/T/#u
On 2023/11/23 20:51, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Partially enable iomap for the buffered IO path of regular files under
> the default mount options and filesystem features. Inline data,
> fs-verity, fscrypt, bigalloc, DAX and data=journal mode are not
> supported yet; support for those will be added in the future.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
[...]
> +bool ext4_should_use_buffered_iomap(struct inode *inode)
> +{
> +	struct super_block *sb = inode->i_sb;
> +
> +	if (ext4_has_feature_inline_data(sb))
> +		return false;
> +	if (ext4_has_feature_verity(sb))
> +		return false;
> +	if (ext4_has_feature_bigalloc(sb))
> +		return false;
> +	if (!IS_DAX(inode))
> +		return false;
> +	if (!S_ISREG(inode->i_mode))
> +		return false;
> +	if (ext4_should_journal_data(inode))
> +		return false;
> +	if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
> +		return false;
> +	if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
> +		return false;
> +
> +	return true;
> +}
[...]
^ permalink raw reply [flat|nested] 26+ messages in thread
* [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (16 preceding siblings ...)
2023-11-23 12:51 ` [RFC PATCH 17/18] ext4: partially enable iomap for regular file's buffered IO path Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 18/18] ext4: enable large folio for regular file which has been switched to use iomap Zhang Yi
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
Partially enable iomap for the buffered IO path of regular files under
the default mount options and filesystem features. Inline data,
fs-verity, fscrypt, bigalloc, DAX and data=journal mode are not
supported yet; support for those will be added in the future.
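Once the flag is set, every buffered-IO entry point tests it and
branches; condensed from the file.c hunk below, the write-path dispatch
is essentially:

	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
		ret = iomap_file_buffered_write(iocb, from,
					&ext4_iomap_buffered_write_ops);
	else
		ret = generic_perform_write(iocb, from);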
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/ext4_jbd2.c | 3 ++-
fs/ext4/file.c | 8 +++++++-
fs/ext4/ialloc.c | 3 +++
fs/ext4/inode.c | 31 +++++++++++++++++++++++++++++++
fs/ext4/move_extent.c | 8 ++++++++
6 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6b3e34ea58ad..7ce688cb1b07 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2959,6 +2959,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+extern bool ext4_should_use_buffered_iomap(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3822,6 +3823,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d1a2e6624401..cf25cdda7234 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
+ !ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 15fe65744cba..7e3352b3b752 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -296,7 +296,11 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- ret = generic_perform_write(iocb, from);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ ret = iomap_file_buffered_write(iocb, from,
+ &ext4_iomap_buffered_write_ops);
+ else
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
@@ -823,6 +827,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
vm_flags_set(vma, VM_HUGEPAGE);
+ } else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ vma->vm_ops = &ext4_iomap_file_vm_ops;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b65058d972f9..0aae2810dbf6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
}
}
+ if (ext4_should_use_buffered_iomap(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
ei->i_datasync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2efa898403f7..33920dc461a8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -779,6 +779,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
if (ext4_has_inline_data(inode))
return -ERANGE;
+ if (WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)))
+ return -EINVAL;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
@@ -4121,6 +4123,8 @@ void ext4_set_aops(struct inode *inode)
}
if (IS_DAX(inode))
inode->i_mapping->a_ops = &ext4_dax_aops;
+ else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ inode->i_mapping->a_ops = &ext4_iomap_aops;
else if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
else
@@ -5185,6 +5189,30 @@ static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
return NULL;
}
+bool ext4_should_use_buffered_iomap(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (ext4_has_feature_inline_data(sb))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_bigalloc(sb))
+ return false;
+ if (IS_DAX(inode))
+ return false;
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
+ return false;
+
+ return true;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -5449,6 +5477,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (ret)
goto bad_inode;
+ if (ext4_should_use_buffered_iomap(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 18a9e7c47975..23b4b77c5af8 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -597,6 +597,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
return -EOPNOTSUPP;
}
+	/* TODO: not supported yet, the block mapping path is not switched */
+ if (ext4_test_inode_state(orig_inode, EXT4_STATE_BUFFERED_IOMAP) ||
+ ext4_test_inode_state(donor_inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported with buffered iomap");
+ return -EOPNOTSUPP;
+ }
+
if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
ext4_msg(orig_inode->i_sb, KERN_ERR,
"Online defrag not supported for encrypted files");
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [RFC PATCH 18/18] ext4: enable large folio for regular file which has been switched to use iomap
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (17 preceding siblings ...)
2023-11-23 12:51 ` Zhang Yi
@ 2023-11-23 12:51 ` Zhang Yi
18 siblings, 0 replies; 26+ messages in thread
From: Zhang Yi @ 2023-11-23 12:51 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, tytso, adilger.kernel, jack, ritesh.list, hch,
djwong, yi.zhang, yi.zhang, chengzhihao1, yukuai3
From: Zhang Yi <yi.zhang@huawei.com>
After switching regular files in the default mode over to iomap, we can
also enable large folios for them, which can bring significant
performance gains.
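mapping_set_large_folios() only sets an opt-in flag on the
address_space, so it is cheap; it is meant to be called while the inode
is being set up, before any folios are added to the mapping, which is
why both inode initialization paths below do it right after deciding to
use iomap. A minimal sketch with a hypothetical foo_ name:

static void foo_init_mapping(struct inode *inode)
{
	/* Allow the page cache to use folios larger than PAGE_SIZE. */
	mapping_set_large_folios(inode->i_mapping);
}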
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ialloc.c | 4 +++-
fs/ext4/inode.c | 4 +++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 0aae2810dbf6..a72c7167c33f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,8 +1336,10 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
}
}
- if (ext4_should_use_buffered_iomap(inode))
+ if (ext4_should_use_buffered_iomap(inode)) {
ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+ mapping_set_large_folios(inode->i_mapping);
+ }
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 33920dc461a8..f8801d3378e3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5477,8 +5477,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (ret)
goto bad_inode;
- if (ext4_should_use_buffered_iomap(inode))
+ if (ext4_should_use_buffered_iomap(inode)) {
ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+ mapping_set_large_folios(inode->i_mapping);
+ }
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
--
2.39.2
^ permalink raw reply related [flat|nested] 26+ messages in thread