* [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-16 15:00 ` Jan Kara
` (2 more replies)
2024-12-16 1:39 ` [PATCH v4 02/10] ext4: don't explicit update times in ext4_fallocate() Zhang Yi
` (8 subsequent siblings)
9 siblings, 3 replies; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
When zeroing a range of folios on the filesystem which block size is
less than the page size, the file's mapped blocks within one page will
be marked as unwritten, we should remove writable userspace mappings to
ensure that ext4_page_mkwrite() can be called during subsequent write
access to these partial folios. Otherwise, data written by subsequent
mmap writes may not be saved to disk.
$mkfs.ext4 -b 1024 /dev/vdb
$mount /dev/vdb /mnt
$xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
-c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
-c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
$od -Ax -t x1z /mnt/foo
000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
*
000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
*
001000
$umount /mnt && mount /dev/vdb /mnt
$od -Ax -t x1z /mnt/foo
000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
*
000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
001000
Fix this by introducing ext4_truncate_page_cache_block_range() to remove
writable userspace mappings when truncating a partial folio range.
Additionally, move the journal data mode-specific handlers and
truncate_pagecache_range() into this function, allowing it to serve as a
common helper that correctly manages the page cache in preparation for
block range manipulations.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/extents.c | 19 ++++-----------
fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 69 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 74f2071189b2..8843929b46ce 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a07a98a4b97a..8dc6b4271b15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- /*
- * For journalled data we need to write (and checkpoint) pages
- * before discarding page cache to avoid inconsitent data on
- * disk in case of crash before zeroing trans is committed.
- */
- if (ext4_should_journal_data(inode)) {
- ret = filemap_write_and_wait_range(mapping, start,
- end - 1);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, start, end);
+ if (ret) {
+ filemap_invalidate_unlock(mapping);
+ goto out_mutex;
}
- /* Now release the pages and zero block aligned part of pages */
- truncate_pagecache_range(inode, start, end - 1);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89aade6f45f6..c68a8b841148 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
+
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsitent data on disk
+ * in case of crash before freeing or unwritten converting trans
+ * is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
+
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, and then
+ * ext4_page_mkwrite() can be called during subsequent write access
+ * to these partial folios.
+ */
+ if (blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t start_boundary = round_up(start, PAGE_SIZE);
+
+ ext4_truncate_folio(inode, start, min(start_boundary, end));
+ if (end > start_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
+
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
+
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-16 1:39 ` [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache Zhang Yi
@ 2024-12-16 15:00 ` Jan Kara
2024-12-17 7:05 ` Zhang Yi
2024-12-16 15:15 ` Matthew Wilcox
2024-12-18 9:56 ` Ojaswin Mujoo
2 siblings, 1 reply; 35+ messages in thread
From: Jan Kara @ 2024-12-16 15:00 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon 16-12-24 09:39:06, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> When zeroing a range of folios on the filesystem which block size is
> less than the page size, the file's mapped blocks within one page will
> be marked as unwritten, we should remove writable userspace mappings to
> ensure that ext4_page_mkwrite() can be called during subsequent write
> access to these partial folios. Otherwise, data written by subsequent
> mmap writes may not be saved to disk.
>
> $mkfs.ext4 -b 1024 /dev/vdb
> $mount /dev/vdb /mnt
> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>
> $od -Ax -t x1z /mnt/foo
> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> *
> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
> *
> 001000
>
> $umount /mnt && mount /dev/vdb /mnt
> $od -Ax -t x1z /mnt/foo
> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> *
> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> *
> 001000
>
> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
> writable userspace mappings when truncating a partial folio range.
> Additionally, move the journal data mode-specific handlers and
> truncate_pagecache_range() into this function, allowing it to serve as a
> common helper that correctly manages the page cache in preparation for
> block range manipulations.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
I like the patch. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Just one thing occured to me when thinking about this: It seems like a
nasty catch that truncate_inode_pages_range() does not writeprotect these
partial pages because practically any filesystem supporting blocksize <
pagesize and doing anything non-trivial in ->page_mkwrite handler will need
this. So ultimately I think we might want to fix this in generic code but
ext4 solution is fine for now.
Honza
> ---
> fs/ext4/ext4.h | 2 ++
> fs/ext4/extents.c | 19 ++++-----------
> fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 69 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 74f2071189b2..8843929b46ce 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
> extern int ext4_can_truncate(struct inode *inode);
> extern int ext4_truncate(struct inode *);
> extern int ext4_break_layouts(struct inode *);
> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
> + loff_t start, loff_t end);
> extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
> extern void ext4_set_inode_flags(struct inode *, bool init);
> extern int ext4_alloc_da_blocks(struct inode *inode);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index a07a98a4b97a..8dc6b4271b15 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> goto out_mutex;
> }
>
> - /*
> - * For journalled data we need to write (and checkpoint) pages
> - * before discarding page cache to avoid inconsitent data on
> - * disk in case of crash before zeroing trans is committed.
> - */
> - if (ext4_should_journal_data(inode)) {
> - ret = filemap_write_and_wait_range(mapping, start,
> - end - 1);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + /* Now release the pages and zero block aligned part of pages */
> + ret = ext4_truncate_page_cache_block_range(inode, start, end);
> + if (ret) {
> + filemap_invalidate_unlock(mapping);
> + goto out_mutex;
> }
>
> - /* Now release the pages and zero block aligned part of pages */
> - truncate_pagecache_range(inode, start, end - 1);
> inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>
> ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 89aade6f45f6..c68a8b841148 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -31,6 +31,7 @@
> #include <linux/writeback.h>
> #include <linux/pagevec.h>
> #include <linux/mpage.h>
> +#include <linux/rmap.h>
> #include <linux/namei.h>
> #include <linux/uio.h>
> #include <linux/bio.h>
> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
> return ret;
> }
>
> +static inline void ext4_truncate_folio(struct inode *inode,
> + loff_t start, loff_t end)
> +{
> + unsigned long blocksize = i_blocksize(inode);
> + struct folio *folio;
> +
> + /* Nothing to be done if no complete block needs to be truncated. */
> + if (round_up(start, blocksize) >= round_down(end, blocksize))
> + return;
> +
> + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
> + if (IS_ERR(folio))
> + return;
> +
> + if (folio_mkclean(folio))
> + folio_mark_dirty(folio);
> + folio_unlock(folio);
> + folio_put(folio);
> +}
> +
> +int ext4_truncate_page_cache_block_range(struct inode *inode,
> + loff_t start, loff_t end)
> +{
> + unsigned long blocksize = i_blocksize(inode);
> + int ret;
> +
> + /*
> + * For journalled data we need to write (and checkpoint) pages
> + * before discarding page cache to avoid inconsitent data on disk
> + * in case of crash before freeing or unwritten converting trans
> + * is committed.
> + */
> + if (ext4_should_journal_data(inode)) {
> + ret = filemap_write_and_wait_range(inode->i_mapping, start,
> + end - 1);
> + if (ret)
> + return ret;
> + goto truncate_pagecache;
> + }
> +
> + /*
> + * If the block size is less than the page size, the file's mapped
> + * blocks within one page could be freed or converted to unwritten.
> + * So it's necessary to remove writable userspace mappings, and then
> + * ext4_page_mkwrite() can be called during subsequent write access
> + * to these partial folios.
> + */
> + if (blocksize < PAGE_SIZE && start < inode->i_size) {
> + loff_t start_boundary = round_up(start, PAGE_SIZE);
> +
> + ext4_truncate_folio(inode, start, min(start_boundary, end));
> + if (end > start_boundary)
> + ext4_truncate_folio(inode,
> + round_down(end, PAGE_SIZE), end);
> + }
> +
> +truncate_pagecache:
> + truncate_pagecache_range(inode, start, end - 1);
> + return 0;
> +}
> +
> static void ext4_wait_dax_page(struct inode *inode)
> {
> filemap_invalidate_unlock(inode->i_mapping);
> --
> 2.46.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-16 15:00 ` Jan Kara
@ 2024-12-17 7:05 ` Zhang Yi
0 siblings, 0 replies; 35+ messages in thread
From: Zhang Yi @ 2024-12-17 7:05 UTC (permalink / raw)
To: Jan Kara
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
yi.zhang, chengzhihao1, yukuai3, yangerkun
On 2024/12/16 23:00, Jan Kara wrote:
> On Mon 16-12-24 09:39:06, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> When zeroing a range of folios on the filesystem which block size is
>> less than the page size, the file's mapped blocks within one page will
>> be marked as unwritten, we should remove writable userspace mappings to
>> ensure that ext4_page_mkwrite() can be called during subsequent write
>> access to these partial folios. Otherwise, data written by subsequent
>> mmap writes may not be saved to disk.
>>
>> $mkfs.ext4 -b 1024 /dev/vdb
>> $mount /dev/vdb /mnt
>> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>>
>> $od -Ax -t x1z /mnt/foo
>> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>> *
>> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>> *
>> 001000
>>
>> $umount /mnt && mount /dev/vdb /mnt
>> $od -Ax -t x1z /mnt/foo
>> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>> *
>> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> *
>> 001000
>>
>> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
>> writable userspace mappings when truncating a partial folio range.
>> Additionally, move the journal data mode-specific handlers and
>> truncate_pagecache_range() into this function, allowing it to serve as a
>> common helper that correctly manages the page cache in preparation for
>> block range manipulations.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>
> I like the patch. Feel free to add:
>
> Reviewed-by: Jan Kara <jack@suse.cz>
>
> Just one thing occured to me when thinking about this: It seems like a
> nasty catch that truncate_inode_pages_range() does not writeprotect these
> partial pages because practically any filesystem supporting blocksize <
> pagesize and doing anything non-trivial in ->page_mkwrite handler will need
> this. So ultimately I think we might want to fix this in generic code but
> ext4 solution is fine for now.
>
Yes, I agree with you. I've checked XFS, Btrfs, and Bcachefs. Currently,
XFS and Btrfs choose to perform writeback before truncating partial
folios, so they do not require this at the moment. However, it appears
that Bcachefs also does a similar approach in __bch2_truncate_folio().
So I think do this in generic code should be better too.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-16 1:39 ` [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache Zhang Yi
2024-12-16 15:00 ` Jan Kara
@ 2024-12-16 15:15 ` Matthew Wilcox
2024-12-17 7:38 ` Zhang Yi
2024-12-18 9:56 ` Ojaswin Mujoo
2 siblings, 1 reply; 35+ messages in thread
From: Matthew Wilcox @ 2024-12-16 15:15 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
> $mkfs.ext4 -b 1024 /dev/vdb
> $mount /dev/vdb /mnt
> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>
> $od -Ax -t x1z /mnt/foo
> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> *
> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
> *
> 001000
>
> $umount /mnt && mount /dev/vdb /mnt
> $od -Ax -t x1z /mnt/foo
> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> *
> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> *
> 001000
Can you add this to fstests please so we can be sure other filesystems
don't have the same problem?
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-16 15:15 ` Matthew Wilcox
@ 2024-12-17 7:38 ` Zhang Yi
0 siblings, 0 replies; 35+ messages in thread
From: Zhang Yi @ 2024-12-17 7:38 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On 2024/12/16 23:15, Matthew Wilcox wrote:
> On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
>> $mkfs.ext4 -b 1024 /dev/vdb
>> $mount /dev/vdb /mnt
>> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>>
>> $od -Ax -t x1z /mnt/foo
>> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>> *
>> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>> *
>> 001000
>>
>> $umount /mnt && mount /dev/vdb /mnt
>> $od -Ax -t x1z /mnt/foo
>> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>> *
>> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> *
>> 001000
>
> Can you add this to fstests please so we can be sure other filesystems
> don't have the same problem?
Sure, I captured this issue by generic/567 while refactoring punch
hole operation on ext4. The generic/567 only performs a partial punch
hole test but does not include a partial zero range test, so we
did not capture this issue. I will expand this test and add this case.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-16 1:39 ` [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache Zhang Yi
2024-12-16 15:00 ` Jan Kara
2024-12-16 15:15 ` Matthew Wilcox
@ 2024-12-18 9:56 ` Ojaswin Mujoo
2024-12-18 13:02 ` Zhang Yi
2 siblings, 1 reply; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 9:56 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> When zeroing a range of folios on the filesystem which block size is
> less than the page size, the file's mapped blocks within one page will
> be marked as unwritten, we should remove writable userspace mappings to
> ensure that ext4_page_mkwrite() can be called during subsequent write
> access to these partial folios. Otherwise, data written by subsequent
> mmap writes may not be saved to disk.
>
> $mkfs.ext4 -b 1024 /dev/vdb
> $mount /dev/vdb /mnt
> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>
> $od -Ax -t x1z /mnt/foo
> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> *
> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
> *
> 001000
>
> $umount /mnt && mount /dev/vdb /mnt
> $od -Ax -t x1z /mnt/foo
> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> *
> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> *
> 001000
>
> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
> writable userspace mappings when truncating a partial folio range.
> Additionally, move the journal data mode-specific handlers and
> truncate_pagecache_range() into this function, allowing it to serve as a
> common helper that correctly manages the page cache in preparation for
> block range manipulations.
Hi Zhang,
Thanks for the fix, just to confirm my understanding, the issue arises
because of the following flow:
1. page_mkwrite() makes folio dirty when we write to the mmap'd region
2. ext4_zero_range (2kb to 4kb)
truncate_pagecache_range
truncate_inode_pages_range
truncate_inode_partial_folio
folio_zero_range (2kb to 4kb)
folio_invalidate
ext4_invalidate_folio
block_invalidate_folio -> clear the bh dirty bit
3. mwrite (2kb to 4kb): Again we write in pagecache but the bh is not
dirty hence after a remount the data is not seen on disk
Also, we won't see this issue if we are zeroing a page aligned range
since we end up unmapping the pages from the proccess address space in
that case. Correct?
I have also tested the patch in PowerPC with 64k pagesize and 4k blocks
size and can confirm that it fixes the data loss issue. That being said,
I have a few minor comments on the patch below:
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> fs/ext4/ext4.h | 2 ++
> fs/ext4/extents.c | 19 ++++-----------
> fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 69 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 74f2071189b2..8843929b46ce 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
> extern int ext4_can_truncate(struct inode *inode);
> extern int ext4_truncate(struct inode *);
> extern int ext4_break_layouts(struct inode *);
> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
> + loff_t start, loff_t end);
> extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
> extern void ext4_set_inode_flags(struct inode *, bool init);
> extern int ext4_alloc_da_blocks(struct inode *inode);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index a07a98a4b97a..8dc6b4271b15 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> goto out_mutex;
> }
>
> - /*
> - * For journalled data we need to write (and checkpoint) pages
> - * before discarding page cache to avoid inconsitent data on
> - * disk in case of crash before zeroing trans is committed.
> - */
> - if (ext4_should_journal_data(inode)) {
> - ret = filemap_write_and_wait_range(mapping, start,
> - end - 1);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + /* Now release the pages and zero block aligned part of pages */
> + ret = ext4_truncate_page_cache_block_range(inode, start, end);
> + if (ret) {
> + filemap_invalidate_unlock(mapping);
> + goto out_mutex;
> }
>
> - /* Now release the pages and zero block aligned part of pages */
> - truncate_pagecache_range(inode, start, end - 1);
> inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>
> ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 89aade6f45f6..c68a8b841148 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -31,6 +31,7 @@
> #include <linux/writeback.h>
> #include <linux/pagevec.h>
> #include <linux/mpage.h>
> +#include <linux/rmap.h>
> #include <linux/namei.h>
> #include <linux/uio.h>
> #include <linux/bio.h>
> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
> return ret;
> }
>
> +static inline void ext4_truncate_folio(struct inode *inode,
> + loff_t start, loff_t end)
> +{
> + unsigned long blocksize = i_blocksize(inode);
> + struct folio *folio;
> +
> + /* Nothing to be done if no complete block needs to be truncated. */
> + if (round_up(start, blocksize) >= round_down(end, blocksize))
> + return;
> +
> + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
> + if (IS_ERR(folio))
> + return;
> +
> + if (folio_mkclean(folio))
> + folio_mark_dirty(folio);
> + folio_unlock(folio);
> + folio_put(folio);
> +}
> +
> +int ext4_truncate_page_cache_block_range(struct inode *inode,
> + loff_t start, loff_t end)
> +{
> + unsigned long blocksize = i_blocksize(inode);
> + int ret;
> +
> + /*
> + * For journalled data we need to write (and checkpoint) pages
> + * before discarding page cache to avoid inconsitent data on disk
> + * in case of crash before freeing or unwritten converting trans
> + * is committed.
> + */
> + if (ext4_should_journal_data(inode)) {
> + ret = filemap_write_and_wait_range(inode->i_mapping, start,
> + end - 1);
> + if (ret)
> + return ret;
> + goto truncate_pagecache;
> + }
> +
> + /*
> + * If the block size is less than the page size, the file's mapped
> + * blocks within one page could be freed or converted to unwritten.
> + * So it's necessary to remove writable userspace mappings, and then
> + * ext4_page_mkwrite() can be called during subsequent write access
> + * to these partial folios.
> + */
> + if (blocksize < PAGE_SIZE && start < inode->i_size) {
Maybe we should only call ext4_truncate_folio() if the range is not page
aligned, rather than calling it everytime for bs < ps?
> + loff_t start_boundary = round_up(start, PAGE_SIZE);
I think page_boundary seems like a more suitable name for the variable.
Regards,
ojaswin
> +
> + ext4_truncate_folio(inode, start, min(start_boundary, end));
> + if (end > start_boundary)
> + ext4_truncate_folio(inode,
> + round_down(end, PAGE_SIZE), end);
> + }
> +
> +truncate_pagecache:
> + truncate_pagecache_range(inode, start, end - 1);
> + return 0;
> +}
> +
> static void ext4_wait_dax_page(struct inode *inode)
> {
> filemap_invalidate_unlock(inode->i_mapping);
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-18 9:56 ` Ojaswin Mujoo
@ 2024-12-18 13:02 ` Zhang Yi
2024-12-19 7:19 ` Ojaswin Mujoo
0 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-18 13:02 UTC (permalink / raw)
To: Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On 2024/12/18 17:56, Ojaswin Mujoo wrote:
> On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> When zeroing a range of folios on the filesystem which block size is
>> less than the page size, the file's mapped blocks within one page will
>> be marked as unwritten, we should remove writable userspace mappings to
>> ensure that ext4_page_mkwrite() can be called during subsequent write
>> access to these partial folios. Otherwise, data written by subsequent
>> mmap writes may not be saved to disk.
>>
>> $mkfs.ext4 -b 1024 /dev/vdb
>> $mount /dev/vdb /mnt
>> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>>
>> $od -Ax -t x1z /mnt/foo
>> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>> *
>> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>> *
>> 001000
>>
>> $umount /mnt && mount /dev/vdb /mnt
>> $od -Ax -t x1z /mnt/foo
>> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>> *
>> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> *
>> 001000
>>
>> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
>> writable userspace mappings when truncating a partial folio range.
>> Additionally, move the journal data mode-specific handlers and
>> truncate_pagecache_range() into this function, allowing it to serve as a
>> common helper that correctly manages the page cache in preparation for
>> block range manipulations.
>
> Hi Zhang,
>
> Thanks for the fix, just to confirm my understanding, the issue arises
> because of the following flow:
>
> 1. page_mkwrite() makes folio dirty when we write to the mmap'd region
>
> 2. ext4_zero_range (2kb to 4kb)
> truncate_pagecache_range
> truncate_inode_pages_range
> truncate_inode_partial_folio
> folio_zero_range (2kb to 4kb)
> folio_invalidate
> ext4_invalidate_folio
> block_invalidate_folio -> clear the bh dirty bit
>
> 3. mwrite (2kb to 4kb): Again we write in pagecache but the bh is not
> dirty hence after a remount the data is not seen on disk
>
> Also, we won't see this issue if we are zeroing a page aligned range
> since we end up unmapping the pages from the proccess address space in
> that case. Correct?
Thank you for review! Yes, it's correct.
>
> I have also tested the patch in PowerPC with 64k pagesize and 4k blocks
> size and can confirm that it fixes the data loss issue. That being said,
> I have a few minor comments on the patch below:
>
Thank you for the test.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>> fs/ext4/ext4.h | 2 ++
>> fs/ext4/extents.c | 19 ++++-----------
>> fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 69 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 74f2071189b2..8843929b46ce 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
>> extern int ext4_can_truncate(struct inode *inode);
>> extern int ext4_truncate(struct inode *);
>> extern int ext4_break_layouts(struct inode *);
>> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
>> + loff_t start, loff_t end);
>> extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
>> extern void ext4_set_inode_flags(struct inode *, bool init);
>> extern int ext4_alloc_da_blocks(struct inode *inode);
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index a07a98a4b97a..8dc6b4271b15 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>> goto out_mutex;
>> }
>>
>> - /*
>> - * For journalled data we need to write (and checkpoint) pages
>> - * before discarding page cache to avoid inconsitent data on
>> - * disk in case of crash before zeroing trans is committed.
>> - */
>> - if (ext4_should_journal_data(inode)) {
>> - ret = filemap_write_and_wait_range(mapping, start,
>> - end - 1);
>> - if (ret) {
>> - filemap_invalidate_unlock(mapping);
>> - goto out_mutex;
>> - }
>> + /* Now release the pages and zero block aligned part of pages */
>> + ret = ext4_truncate_page_cache_block_range(inode, start, end);
>> + if (ret) {
>> + filemap_invalidate_unlock(mapping);
>> + goto out_mutex;
>> }
>>
>> - /* Now release the pages and zero block aligned part of pages */
>> - truncate_pagecache_range(inode, start, end - 1);
>> inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>>
>> ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 89aade6f45f6..c68a8b841148 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -31,6 +31,7 @@
>> #include <linux/writeback.h>
>> #include <linux/pagevec.h>
>> #include <linux/mpage.h>
>> +#include <linux/rmap.h>
>> #include <linux/namei.h>
>> #include <linux/uio.h>
>> #include <linux/bio.h>
>> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>> return ret;
>> }
>>
>> +static inline void ext4_truncate_folio(struct inode *inode,
>> + loff_t start, loff_t end)
>> +{
>> + unsigned long blocksize = i_blocksize(inode);
>> + struct folio *folio;
>> +
>> + /* Nothing to be done if no complete block needs to be truncated. */
>> + if (round_up(start, blocksize) >= round_down(end, blocksize))
>> + return;
>> +
>> + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
>> + if (IS_ERR(folio))
>> + return;
>> +
>> + if (folio_mkclean(folio))
>> + folio_mark_dirty(folio);
>> + folio_unlock(folio);
>> + folio_put(folio);
>> +}
>> +
>> +int ext4_truncate_page_cache_block_range(struct inode *inode,
>> + loff_t start, loff_t end)
>> +{
>> + unsigned long blocksize = i_blocksize(inode);
>> + int ret;
>> +
>> + /*
>> + * For journalled data we need to write (and checkpoint) pages
>> + * before discarding page cache to avoid inconsitent data on disk
>> + * in case of crash before freeing or unwritten converting trans
>> + * is committed.
>> + */
>> + if (ext4_should_journal_data(inode)) {
>> + ret = filemap_write_and_wait_range(inode->i_mapping, start,
>> + end - 1);
>> + if (ret)
>> + return ret;
>> + goto truncate_pagecache;
>> + }
>> +
>> + /*
>> + * If the block size is less than the page size, the file's mapped
>> + * blocks within one page could be freed or converted to unwritten.
>> + * So it's necessary to remove writable userspace mappings, and then
>> + * ext4_page_mkwrite() can be called during subsequent write access
>> + * to these partial folios.
>> + */
>> + if (blocksize < PAGE_SIZE && start < inode->i_size) {
>
> Maybe we should only call ext4_truncate_folio() if the range is not page
> aligned, rather than calling it everytime for bs < ps?
I agree with you, so how about below?
if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
blocksize < PAGE_SIZE && start < inode->i_size && )
>
>> + loff_t start_boundary = round_up(start, PAGE_SIZE);
>
> I think page_boundary seems like a more suitable name for the variable.
Yeah, it looks fine to me.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache
2024-12-18 13:02 ` Zhang Yi
@ 2024-12-19 7:19 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-19 7:19 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Wed, Dec 18, 2024 at 09:02:18PM +0800, Zhang Yi wrote:
> On 2024/12/18 17:56, Ojaswin Mujoo wrote:
> > On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
> >> From: Zhang Yi <yi.zhang@huawei.com>
> >>
> >> When zeroing a range of folios on the filesystem which block size is
> >> less than the page size, the file's mapped blocks within one page will
> >> be marked as unwritten, we should remove writable userspace mappings to
> >> ensure that ext4_page_mkwrite() can be called during subsequent write
> >> access to these partial folios. Otherwise, data written by subsequent
> >> mmap writes may not be saved to disk.
> >>
> >> $mkfs.ext4 -b 1024 /dev/vdb
> >> $mount /dev/vdb /mnt
> >> $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
> >> -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
> >> -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
> >>
> >> $od -Ax -t x1z /mnt/foo
> >> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> >> *
> >> 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
> >> *
> >> 001000
> >>
> >> $umount /mnt && mount /dev/vdb /mnt
> >> $od -Ax -t x1z /mnt/foo
> >> 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> >> *
> >> 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >> *
> >> 001000
> >>
> >> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
> >> writable userspace mappings when truncating a partial folio range.
> >> Additionally, move the journal data mode-specific handlers and
> >> truncate_pagecache_range() into this function, allowing it to serve as a
> >> common helper that correctly manages the page cache in preparation for
> >> block range manipulations.
> >
> > Hi Zhang,
> >
> > Thanks for the fix, just to confirm my understanding, the issue arises
> > because of the following flow:
> >
> > 1. page_mkwrite() makes folio dirty when we write to the mmap'd region
> >
> > 2. ext4_zero_range (2kb to 4kb)
> > truncate_pagecache_range
> > truncate_inode_pages_range
> > truncate_inode_partial_folio
> > folio_zero_range (2kb to 4kb)
> > folio_invalidate
> > ext4_invalidate_folio
> > block_invalidate_folio -> clear the bh dirty bit
> >
> > 3. mwrite (2kb to 4kb): Again we write in pagecache but the bh is not
> > dirty hence after a remount the data is not seen on disk
> >
> > Also, we won't see this issue if we are zeroing a page aligned range
> > since we end up unmapping the pages from the proccess address space in
> > that case. Correct?
>
> Thank you for review! Yes, it's correct.
>
> >
> > I have also tested the patch in PowerPC with 64k pagesize and 4k blocks
> > size and can confirm that it fixes the data loss issue. That being said,
> > I have a few minor comments on the patch below:
> >
>
> Thank you for the test.
>
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >> fs/ext4/ext4.h | 2 ++
> >> fs/ext4/extents.c | 19 ++++-----------
> >> fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
> >> 3 files changed, 69 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> >> index 74f2071189b2..8843929b46ce 100644
> >> --- a/fs/ext4/ext4.h
> >> +++ b/fs/ext4/ext4.h
> >> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
> >> extern int ext4_can_truncate(struct inode *inode);
> >> extern int ext4_truncate(struct inode *);
> >> extern int ext4_break_layouts(struct inode *);
> >> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
> >> + loff_t start, loff_t end);
> >> extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
> >> extern void ext4_set_inode_flags(struct inode *, bool init);
> >> extern int ext4_alloc_da_blocks(struct inode *inode);
> >> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> >> index a07a98a4b97a..8dc6b4271b15 100644
> >> --- a/fs/ext4/extents.c
> >> +++ b/fs/ext4/extents.c
> >> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> >> goto out_mutex;
> >> }
> >>
> >> - /*
> >> - * For journalled data we need to write (and checkpoint) pages
> >> - * before discarding page cache to avoid inconsitent data on
> >> - * disk in case of crash before zeroing trans is committed.
> >> - */
> >> - if (ext4_should_journal_data(inode)) {
> >> - ret = filemap_write_and_wait_range(mapping, start,
> >> - end - 1);
> >> - if (ret) {
> >> - filemap_invalidate_unlock(mapping);
> >> - goto out_mutex;
> >> - }
> >> + /* Now release the pages and zero block aligned part of pages */
> >> + ret = ext4_truncate_page_cache_block_range(inode, start, end);
> >> + if (ret) {
> >> + filemap_invalidate_unlock(mapping);
> >> + goto out_mutex;
> >> }
> >>
> >> - /* Now release the pages and zero block aligned part of pages */
> >> - truncate_pagecache_range(inode, start, end - 1);
> >> inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> >>
> >> ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index 89aade6f45f6..c68a8b841148 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -31,6 +31,7 @@
> >> #include <linux/writeback.h>
> >> #include <linux/pagevec.h>
> >> #include <linux/mpage.h>
> >> +#include <linux/rmap.h>
> >> #include <linux/namei.h>
> >> #include <linux/uio.h>
> >> #include <linux/bio.h>
> >> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
> >> return ret;
> >> }
> >>
> >> +static inline void ext4_truncate_folio(struct inode *inode,
> >> + loff_t start, loff_t end)
> >> +{
> >> + unsigned long blocksize = i_blocksize(inode);
> >> + struct folio *folio;
> >> +
> >> + /* Nothing to be done if no complete block needs to be truncated. */
> >> + if (round_up(start, blocksize) >= round_down(end, blocksize))
> >> + return;
> >> +
> >> + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
> >> + if (IS_ERR(folio))
> >> + return;
> >> +
> >> + if (folio_mkclean(folio))
> >> + folio_mark_dirty(folio);
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> +}
> >> +
> >> +int ext4_truncate_page_cache_block_range(struct inode *inode,
> >> + loff_t start, loff_t end)
> >> +{
> >> + unsigned long blocksize = i_blocksize(inode);
> >> + int ret;
> >> +
> >> + /*
> >> + * For journalled data we need to write (and checkpoint) pages
> >> + * before discarding page cache to avoid inconsitent data on disk
> >> + * in case of crash before freeing or unwritten converting trans
> >> + * is committed.
> >> + */
> >> + if (ext4_should_journal_data(inode)) {
> >> + ret = filemap_write_and_wait_range(inode->i_mapping, start,
> >> + end - 1);
> >> + if (ret)
> >> + return ret;
> >> + goto truncate_pagecache;
> >> + }
> >> +
> >> + /*
> >> + * If the block size is less than the page size, the file's mapped
> >> + * blocks within one page could be freed or converted to unwritten.
> >> + * So it's necessary to remove writable userspace mappings, and then
> >> + * ext4_page_mkwrite() can be called during subsequent write access
> >> + * to these partial folios.
> >> + */
> >> + if (blocksize < PAGE_SIZE && start < inode->i_size) {
> >
> > Maybe we should only call ext4_truncate_folio() if the range is not page
> > aligned, rather than calling it everytime for bs < ps?
>
> I agree with you, so how about below?
>
> if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
> blocksize < PAGE_SIZE && start < inode->i_size && )
This looks good Zhang, with this change and the variable rename, feel free to add
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Regards,
ojaswin
>
> >
> >> + loff_t start_boundary = round_up(start, PAGE_SIZE);
> >
> > I think page_boundary seems like a more suitable name for the variable.
>
> Yeah, it looks fine to me.
>
> Thanks,
> Yi.
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 02/10] ext4: don't explicit update times in ext4_fallocate()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
2024-12-16 1:39 ` [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-18 9:58 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode Zhang Yi
` (7 subsequent siblings)
9 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
After commit 'ad5cd4f4ee4d ("ext4: fix fallocate to use file_modified to
update permissions consistently"), we can update mtime and ctime
appropriately through file_modified() when doing zero range, collapse
rage, insert range and punch hole, hence there is no need to explicit
update times in those paths, just drop them.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 5 -----
fs/ext4/inode.c | 1 -
2 files changed, 6 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8dc6b4271b15..7fb38aab241d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4674,8 +4674,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
filemap_invalidate_unlock(mapping);
@@ -4699,7 +4697,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5435,7 +5432,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -5545,7 +5541,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c68a8b841148..bf735d06b621 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4138,7 +4138,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 02/10] ext4: don't explicit update times in ext4_fallocate()
2024-12-16 1:39 ` [PATCH v4 02/10] ext4: don't explicit update times in ext4_fallocate() Zhang Yi
@ 2024-12-18 9:58 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 9:58 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:07AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> After commit 'ad5cd4f4ee4d ("ext4: fix fallocate to use file_modified to
> update permissions consistently"), we can update mtime and ctime
> appropriately through file_modified() when doing zero range, collapse
> rage, insert range and punch hole, hence there is no need to explicit
> update times in those paths, just drop them.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
Looks good, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> ---
> fs/ext4/extents.c | 5 -----
> fs/ext4/inode.c | 1 -
> 2 files changed, 6 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 8dc6b4271b15..7fb38aab241d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4674,8 +4674,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> goto out_mutex;
> }
>
> - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> -
> ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> flags);
> filemap_invalidate_unlock(mapping);
> @@ -4699,7 +4697,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> goto out_mutex;
> }
>
> - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> if (new_size)
> ext4_update_inode_size(inode, new_size);
> ret = ext4_mark_inode_dirty(handle, inode);
> @@ -5435,7 +5432,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> up_write(&EXT4_I(inode)->i_data_sem);
> if (IS_SYNC(inode))
> ext4_handle_sync(handle);
> - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> ret = ext4_mark_inode_dirty(handle, inode);
> ext4_update_inode_fsync_trans(handle, inode, 1);
>
> @@ -5545,7 +5541,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> /* Expand file to avoid data loss if there is error while shifting */
> inode->i_size += len;
> EXT4_I(inode)->i_disksize += len;
> - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> ret = ext4_mark_inode_dirty(handle, inode);
> if (ret)
> goto out_stop;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c68a8b841148..bf735d06b621 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4138,7 +4138,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> if (IS_SYNC(inode))
> ext4_handle_sync(handle);
>
> - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> ret2 = ext4_mark_inode_dirty(handle, inode);
> if (unlikely(ret2))
> ret = ret2;
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
2024-12-16 1:39 ` [PATCH v4 01/10] ext4: remove writable userspace mappings before truncating page cache Zhang Yi
2024-12-16 1:39 ` [PATCH v4 02/10] ext4: don't explicit update times in ext4_fallocate() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-16 15:02 ` Jan Kara
2024-12-17 14:31 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 04/10] ext4: refactor ext4_punch_hole() Zhang Yi
` (6 subsequent siblings)
9 siblings, 2 replies; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
There is no need to write back all data before punching a hole in
non-journaled mode since it will be dropped soon after removing space.
Therefore, the call to filemap_write_and_wait_range() can be eliminated.
Besides, similar to ext4_zero_range(), we must address the case of
partially punched folios when block size < page size. It is essential to
remove writable userspace mappings to ensure that the folio can be
faulted again during subsequent mmap write access.
In journaled mode, we need to write dirty pages out before discarding
page cache in case of crash before committing the freeing data
transaction, which could expose old, stale data, even if synchronization
has been performed.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 18 +++++-------------
1 file changed, 5 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf735d06b621..a5ba2b71d508 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4018,17 +4018,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
inode_lock(inode);
/* No need to punch hole beyond i_size */
@@ -4090,8 +4079,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
+
+ ret = ext4_truncate_page_cache_block_range(inode,
+ first_block_offset, last_block_offset + 1);
+ if (ret)
+ goto out_dio;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode
2024-12-16 1:39 ` [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode Zhang Yi
@ 2024-12-16 15:02 ` Jan Kara
2024-12-17 14:31 ` Ojaswin Mujoo
1 sibling, 0 replies; 35+ messages in thread
From: Jan Kara @ 2024-12-16 15:02 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon 16-12-24 09:39:08, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> There is no need to write back all data before punching a hole in
> non-journaled mode since it will be dropped soon after removing space.
> Therefore, the call to filemap_write_and_wait_range() can be eliminated.
> Besides, similar to ext4_zero_range(), we must address the case of
> partially punched folios when block size < page size. It is essential to
> remove writable userspace mappings to ensure that the folio can be
> faulted again during subsequent mmap write access.
>
> In journaled mode, we need to write dirty pages out before discarding
> page cache in case of crash before committing the freeing data
> transaction, which could expose old, stale data, even if synchronization
> has been performed.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/inode.c | 18 +++++-------------
> 1 file changed, 5 insertions(+), 13 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index bf735d06b621..a5ba2b71d508 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4018,17 +4018,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> trace_ext4_punch_hole(inode, offset, length, 0);
>
> - /*
> - * Write out all dirty pages to avoid race conditions
> - * Then release them.
> - */
> - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> - ret = filemap_write_and_wait_range(mapping, offset,
> - offset + length - 1);
> - if (ret)
> - return ret;
> - }
> -
> inode_lock(inode);
>
> /* No need to punch hole beyond i_size */
> @@ -4090,8 +4079,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> ret = ext4_update_disksize_before_punch(inode, offset, length);
> if (ret)
> goto out_dio;
> - truncate_pagecache_range(inode, first_block_offset,
> - last_block_offset);
> +
> + ret = ext4_truncate_page_cache_block_range(inode,
> + first_block_offset, last_block_offset + 1);
> + if (ret)
> + goto out_dio;
> }
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> --
> 2.46.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode
2024-12-16 1:39 ` [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode Zhang Yi
2024-12-16 15:02 ` Jan Kara
@ 2024-12-17 14:31 ` Ojaswin Mujoo
2024-12-17 14:50 ` Ojaswin Mujoo
2024-12-18 7:10 ` Zhang Yi
1 sibling, 2 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-17 14:31 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:08AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> There is no need to write back all data before punching a hole in
> non-journaled mode since it will be dropped soon after removing space.
> Therefore, the call to filemap_write_and_wait_range() can be eliminated.
Hi, sorry I'm a bit late to this however following the discussion here
[1], I believe the initial concern was that we don't in PATCH v1 01/10
was that after truncating the pagecache, the ext4_alloc_file_blocks()
call might fail with errors like EIO, ENOMEM etc leading to inconsistent
data.
Is my understanding correct that we realised that these are very rare
cases and are not worth the performance penalty of writeback? In which
case, is it really okay to just let the scope for corruption exist even
though its rare. There might be some other error cases we might be
missing which might be more easier to hit. For eg I think we can also
fail ext4_alloc_file_blocks() with ENOSPC in case there is a written to
unwritten extent conversion causing an extent split leading to extent
tree node allocation. (Maybe can be avoided by using PRE_IO with
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT in the first ext4_alloc_file_blocks() call)
So does it make sense to retain the writeback behavior or am I just
being paranoid :)
Regards,
ojaswin
> Besides, similar to ext4_zero_range(), we must address the case of
> partially punched folios when block size < page size. It is essential to
> remove writable userspace mappings to ensure that the folio can be
> faulted again during subsequent mmap write access.
>
> In journaled mode, we need to write dirty pages out before discarding
> page cache in case of crash before committing the freeing data
> transaction, which could expose old, stale data, even if synchronization
> has been performed.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> fs/ext4/inode.c | 18 +++++-------------
> 1 file changed, 5 insertions(+), 13 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index bf735d06b621..a5ba2b71d508 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4018,17 +4018,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> trace_ext4_punch_hole(inode, offset, length, 0);
>
> - /*
> - * Write out all dirty pages to avoid race conditions
> - * Then release them.
> - */
> - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> - ret = filemap_write_and_wait_range(mapping, offset,
> - offset + length - 1);
> - if (ret)
> - return ret;
> - }
> -
> inode_lock(inode);
>
> /* No need to punch hole beyond i_size */
> @@ -4090,8 +4079,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> ret = ext4_update_disksize_before_punch(inode, offset, length);
> if (ret)
> goto out_dio;
> - truncate_pagecache_range(inode, first_block_offset,
> - last_block_offset);
> +
> + ret = ext4_truncate_page_cache_block_range(inode,
> + first_block_offset, last_block_offset + 1);
> + if (ret)
> + goto out_dio;
> }
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode
2024-12-17 14:31 ` Ojaswin Mujoo
@ 2024-12-17 14:50 ` Ojaswin Mujoo
2024-12-18 7:10 ` Zhang Yi
1 sibling, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-17 14:50 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Tue, Dec 17, 2024 at 08:01:26PM +0530, Ojaswin Mujoo wrote:
> On Mon, Dec 16, 2024 at 09:39:08AM +0800, Zhang Yi wrote:
> > From: Zhang Yi <yi.zhang@huawei.com>
> >
> > There is no need to write back all data before punching a hole in
> > non-journaled mode since it will be dropped soon after removing space.
> > Therefore, the call to filemap_write_and_wait_range() can be eliminated.
>
> Hi, sorry I'm a bit late to this however following the discussion here
> [1], I believe the initial concern was that we don't in PATCH v1 01/10
> was that after truncating the pagecache, the ext4_alloc_file_blocks()
> call might fail with errors like EIO, ENOMEM etc leading to inconsistent
> data.
>
> Is my understanding correct that we realised that these are very rare
> cases and are not worth the performance penalty of writeback? In which
> case, is it really okay to just let the scope for corruption exist even
> though its rare. There might be some other error cases we might be
> missing which might be more easier to hit. For eg I think we can also
> fail ext4_alloc_file_blocks() with ENOSPC in case there is a written to
> unwritten extent conversion causing an extent split leading to extent
> tree node allocation. (Maybe can be avoided by using PRE_IO with
> EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT in the first ext4_alloc_file_blocks() call)
>
> So does it make sense to retain the writeback behavior or am I just
> being paranoid :)
>
> Regards,
> ojaswin
[1]
https://lore.kernel.org/linux-ext4/20240917165007.j5dywaekvnirfffm@quack3/
>
> > Besides, similar to ext4_zero_range(), we must address the case of
> > partially punched folios when block size < page size. It is essential to
> > remove writable userspace mappings to ensure that the folio can be
> > faulted again during subsequent mmap write access.
> >
> > In journaled mode, we need to write dirty pages out before discarding
> > page cache in case of crash before committing the freeing data
> > transaction, which could expose old, stale data, even if synchronization
> > has been performed.
> >
> > Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> > ---
> > fs/ext4/inode.c | 18 +++++-------------
> > 1 file changed, 5 insertions(+), 13 deletions(-)
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index bf735d06b621..a5ba2b71d508 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -4018,17 +4018,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> >
> > trace_ext4_punch_hole(inode, offset, length, 0);
> >
> > - /*
> > - * Write out all dirty pages to avoid race conditions
> > - * Then release them.
> > - */
> > - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> > - ret = filemap_write_and_wait_range(mapping, offset,
> > - offset + length - 1);
> > - if (ret)
> > - return ret;
> > - }
> > -
> > inode_lock(inode);
> >
> > /* No need to punch hole beyond i_size */
> > @@ -4090,8 +4079,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> > ret = ext4_update_disksize_before_punch(inode, offset, length);
> > if (ret)
> > goto out_dio;
> > - truncate_pagecache_range(inode, first_block_offset,
> > - last_block_offset);
> > +
> > + ret = ext4_truncate_page_cache_block_range(inode,
> > + first_block_offset, last_block_offset + 1);
> > + if (ret)
> > + goto out_dio;
> > }
> >
> > if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> > --
> > 2.46.1
> >
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode
2024-12-17 14:31 ` Ojaswin Mujoo
2024-12-17 14:50 ` Ojaswin Mujoo
@ 2024-12-18 7:10 ` Zhang Yi
2024-12-18 10:13 ` Ojaswin Mujoo
1 sibling, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-18 7:10 UTC (permalink / raw)
To: Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On 2024/12/17 22:31, Ojaswin Mujoo wrote:
> On Mon, Dec 16, 2024 at 09:39:08AM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> There is no need to write back all data before punching a hole in
>> non-journaled mode since it will be dropped soon after removing space.
>> Therefore, the call to filemap_write_and_wait_range() can be eliminated.
>
> Hi, sorry I'm a bit late to this however following the discussion here
> [1], I believe the initial concern was that we don't in PATCH v1 01/10
> was that after truncating the pagecache, the ext4_alloc_file_blocks()
> call might fail with errors like EIO, ENOMEM etc leading to inconsistent
> data.
>
> Is my understanding correct that we realised that these are very rare
> cases and are not worth the performance penalty of writeback? In which
> case, is it really okay to just let the scope for corruption exist even
> though its rare. There might be some other error cases we might be
> missing which might be more easier to hit. For eg I think we can also
> fail ext4_alloc_file_blocks() with ENOSPC in case there is a written to
> unwritten extent conversion causing an extent split leading to extent
> tree node allocation. (Maybe can be avoided by using PRE_IO with
> EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT in the first ext4_alloc_file_blocks() call)
>
> So does it make sense to retain the writeback behavior or am I just
> being paranoid :)
>
Hi, Ojaswin!
Yeah, from my point of view, ENOSPC could happen, and it may be more
likely to happen if we intentionally create conditions for it. However,
all the efforts we can make at this point are merely best efforts and
reduce the probability. We cannot 100% guarantee it will not happen,
even if we write back the whole range before manipulating extents and
blocks. This is because we do not accurately reserve space for extents
split. Additionally, In ext4_punch_hole(), we have used 'nofail' flag
while freeing blocks to reduce the possibility of ENOSPC. So I suppose
it's fine by now, but we may need to implement additional measures if
we truly want to resolve the issue completely.
Thanks,
Yi.
>
>> Besides, similar to ext4_zero_range(), we must address the case of
>> partially punched folios when block size < page size. It is essential to
>> remove writable userspace mappings to ensure that the folio can be
>> faulted again during subsequent mmap write access.
>>
>> In journaled mode, we need to write dirty pages out before discarding
>> page cache in case of crash before committing the freeing data
>> transaction, which could expose old, stale data, even if synchronization
>> has been performed.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>> fs/ext4/inode.c | 18 +++++-------------
>> 1 file changed, 5 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index bf735d06b621..a5ba2b71d508 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4018,17 +4018,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>>
>> trace_ext4_punch_hole(inode, offset, length, 0);
>>
>> - /*
>> - * Write out all dirty pages to avoid race conditions
>> - * Then release them.
>> - */
>> - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>> - ret = filemap_write_and_wait_range(mapping, offset,
>> - offset + length - 1);
>> - if (ret)
>> - return ret;
>> - }
>> -
>> inode_lock(inode);
>>
>> /* No need to punch hole beyond i_size */
>> @@ -4090,8 +4079,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>> ret = ext4_update_disksize_before_punch(inode, offset, length);
>> if (ret)
>> goto out_dio;
>> - truncate_pagecache_range(inode, first_block_offset,
>> - last_block_offset);
>> +
>> + ret = ext4_truncate_page_cache_block_range(inode,
>> + first_block_offset, last_block_offset + 1);
>> + if (ret)
>> + goto out_dio;
>> }
>>
>> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>> --
>> 2.46.1
>>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode
2024-12-18 7:10 ` Zhang Yi
@ 2024-12-18 10:13 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:13 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Wed, Dec 18, 2024 at 03:10:36PM +0800, Zhang Yi wrote:
> On 2024/12/17 22:31, Ojaswin Mujoo wrote:
> > On Mon, Dec 16, 2024 at 09:39:08AM +0800, Zhang Yi wrote:
> >> From: Zhang Yi <yi.zhang@huawei.com>
> >>
> >> There is no need to write back all data before punching a hole in
> >> non-journaled mode since it will be dropped soon after removing space.
> >> Therefore, the call to filemap_write_and_wait_range() can be eliminated.
> >
> > Hi, sorry I'm a bit late to this however following the discussion here
> > [1], I believe the initial concern was that we don't in PATCH v1 01/10
> > was that after truncating the pagecache, the ext4_alloc_file_blocks()
> > call might fail with errors like EIO, ENOMEM etc leading to inconsistent
> > data.
> >
> > Is my understanding correct that we realised that these are very rare
> > cases and are not worth the performance penalty of writeback? In which
> > case, is it really okay to just let the scope for corruption exist even
> > though its rare. There might be some other error cases we might be
> > missing which might be more easier to hit. For eg I think we can also
> > fail ext4_alloc_file_blocks() with ENOSPC in case there is a written to
> > unwritten extent conversion causing an extent split leading to extent
> > tree node allocation. (Maybe can be avoided by using PRE_IO with
> > EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT in the first ext4_alloc_file_blocks() call)
> >
> > So does it make sense to retain the writeback behavior or am I just
> > being paranoid :)
> >
>
> Hi, Ojaswin!
>
> Yeah, from my point of view, ENOSPC could happen, and it may be more
> likely to happen if we intentionally create conditions for it. However,
> all the efforts we can make at this point are merely best efforts and
> reduce the probability. We cannot 100% guarantee it will not happen,
> even if we write back the whole range before manipulating extents and
> blocks. This is because we do not accurately reserve space for extents
> split. Additionally, In ext4_punch_hole(), we have used 'nofail' flag
Right, rechecking the ext4_map_blocks code, seems like we can also result
in a failure after unwrit extents have successfully been allocated so
either ways we can't be sure that we'll retain old data on failure even
with writeback.
> while freeing blocks to reduce the possibility of ENOSPC. So I suppose
> it's fine by now, but we may need to implement additional measures if
> we truly want to resolve the issue completely.
Sure I agree that in that case we should ideally have something more
robust to handle these edge cases. For now, this change looks good.
Feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
>
> Thanks,
> Yi.
>
> >
> >> Besides, similar to ext4_zero_range(), we must address the case of
> >> partially punched folios when block size < page size. It is essential to
> >> remove writable userspace mappings to ensure that the folio can be
> >> faulted again during subsequent mmap write access.
> >>
> >> In journaled mode, we need to write dirty pages out before discarding
> >> page cache in case of crash before committing the freeing data
> >> transaction, which could expose old, stale data, even if synchronization
> >> has been performed.
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >> fs/ext4/inode.c | 18 +++++-------------
> >> 1 file changed, 5 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index bf735d06b621..a5ba2b71d508 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -4018,17 +4018,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> >>
> >> trace_ext4_punch_hole(inode, offset, length, 0);
> >>
> >> - /*
> >> - * Write out all dirty pages to avoid race conditions
> >> - * Then release them.
> >> - */
> >> - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> >> - ret = filemap_write_and_wait_range(mapping, offset,
> >> - offset + length - 1);
> >> - if (ret)
> >> - return ret;
> >> - }
> >> -
> >> inode_lock(inode);
> >>
> >> /* No need to punch hole beyond i_size */
> >> @@ -4090,8 +4079,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> >> ret = ext4_update_disksize_before_punch(inode, offset, length);
> >> if (ret)
> >> goto out_dio;
> >> - truncate_pagecache_range(inode, first_block_offset,
> >> - last_block_offset);
> >> +
> >> + ret = ext4_truncate_page_cache_block_range(inode,
> >> + first_block_offset, last_block_offset + 1);
> >> + if (ret)
> >> + goto out_dio;
> >> }
> >>
> >> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> >> --
> >> 2.46.1
> >>
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 04/10] ext4: refactor ext4_punch_hole()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (2 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 03/10] ext4: don't write back data before punch hole in nojournal mode Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-16 15:07 ` Jan Kara
2024-12-18 10:17 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 05/10] ext4: refactor ext4_zero_range() Zhang Yi
` (5 subsequent siblings)
9 siblings, 2 replies; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
The current implementation of ext4_punch_hole() contains complex
position calculations and stale error tags. To improve the code's
clarity and maintainability, it is essential to clean up the code and
improve its readability, this can be achieved by: a) simplifying and
renaming variables; b) eliminating unnecessary position calculations;
c) writing back all data in data=journal mode, and drop page cache from
the original offset to the end, rather than using aligned blocks,
d) renaming the stale error tags.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 2 +
fs/ext4/inode.c | 119 +++++++++++++++++++++---------------------------
2 files changed, 55 insertions(+), 66 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8843929b46ce..8be06d5f5b43 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -367,6 +367,8 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
+#define EXT4_B_TO_LBLK(inode, offset) \
+ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a5ba2b71d508..7720d3700b27 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4008,13 +4008,13 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
+ ext4_lblk_t start_lblk, end_lblk;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset, max_length;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret = 0;
trace_ext4_punch_hole(inode, offset, length, 0);
@@ -4022,36 +4022,27 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- goto out_mutex;
+ goto out;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the page that contains i_size, and also make sure that the hole
+ * within one block before last range.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end > inode->i_size)
+ end = round_up(inode->i_size, PAGE_SIZE);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
/*
- * For punch hole the length + offset needs to be within one block
- * before last range. Adjust the length if it goes beyond that limit.
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
*/
- max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
- if (offset + length > max_length)
- length = max_length - offset;
-
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ goto out;
}
/* Wait all existing dio workers, newcomers will block on i_rwsem */
@@ -4059,7 +4050,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = file_modified(file);
if (ret)
- goto out_mutex;
+ goto out;
/*
* Prevent page faults from reinstantiating pages we have released from
@@ -4069,22 +4060,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = ext4_break_layouts(inode);
if (ret)
- goto out_dio;
+ goto out_invalidate_lock;
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
+ if (ret)
+ goto out_invalidate_lock;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
-
- ret = ext4_truncate_page_cache_block_range(inode,
- first_block_offset, last_block_offset + 1);
- if (ret)
- goto out_dio;
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ goto out_invalidate_lock;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -4094,52 +4079,54 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ goto out_invalidate_lock;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
- ext4_lblk_t hole_len = stop_block - first_block;
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block, hole_len);
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
- ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(handle, inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
+out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out_mutex:
+out:
inode_unlock(inode);
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 04/10] ext4: refactor ext4_punch_hole()
2024-12-16 1:39 ` [PATCH v4 04/10] ext4: refactor ext4_punch_hole() Zhang Yi
@ 2024-12-16 15:07 ` Jan Kara
2024-12-18 10:17 ` Ojaswin Mujoo
1 sibling, 0 replies; 35+ messages in thread
From: Jan Kara @ 2024-12-16 15:07 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon 16-12-24 09:39:09, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> The current implementation of ext4_punch_hole() contains complex
> position calculations and stale error tags. To improve the code's
> clarity and maintainability, it is essential to clean up the code and
> improve its readability, this can be achieved by: a) simplifying and
> renaming variables; b) eliminating unnecessary position calculations;
> c) writing back all data in data=journal mode, and drop page cache from
> the original offset to the end, rather than using aligned blocks,
> d) renaming the stale error tags.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good to me. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/ext4.h | 2 +
> fs/ext4/inode.c | 119 +++++++++++++++++++++---------------------------
> 2 files changed, 55 insertions(+), 66 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 8843929b46ce..8be06d5f5b43 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -367,6 +367,8 @@ struct ext4_io_submit {
> #define EXT4_MAX_BLOCKS(size, offset, blkbits) \
> ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
> blkbits))
> +#define EXT4_B_TO_LBLK(inode, offset) \
> + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
>
> /* Translate a block number to a cluster number */
> #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index a5ba2b71d508..7720d3700b27 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4008,13 +4008,13 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> {
> struct inode *inode = file_inode(file);
> struct super_block *sb = inode->i_sb;
> - ext4_lblk_t first_block, stop_block;
> + ext4_lblk_t start_lblk, end_lblk;
> struct address_space *mapping = inode->i_mapping;
> - loff_t first_block_offset, last_block_offset, max_length;
> - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> + loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
> + loff_t end = offset + length;
> handle_t *handle;
> unsigned int credits;
> - int ret = 0, ret2 = 0;
> + int ret = 0;
>
> trace_ext4_punch_hole(inode, offset, length, 0);
>
> @@ -4022,36 +4022,27 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> /* No need to punch hole beyond i_size */
> if (offset >= inode->i_size)
> - goto out_mutex;
> + goto out;
>
> /*
> - * If the hole extends beyond i_size, set the hole
> - * to end after the page that contains i_size
> + * If the hole extends beyond i_size, set the hole to end after
> + * the page that contains i_size, and also make sure that the hole
> + * within one block before last range.
> */
> - if (offset + length > inode->i_size) {
> - length = inode->i_size +
> - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
> - offset;
> - }
> + if (end > inode->i_size)
> + end = round_up(inode->i_size, PAGE_SIZE);
> + if (end > max_end)
> + end = max_end;
> + length = end - offset;
>
> /*
> - * For punch hole the length + offset needs to be within one block
> - * before last range. Adjust the length if it goes beyond that limit.
> + * Attach jinode to inode for jbd2 if we do any zeroing of partial
> + * block.
> */
> - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
> - if (offset + length > max_length)
> - length = max_length - offset;
> -
> - if (offset & (sb->s_blocksize - 1) ||
> - (offset + length) & (sb->s_blocksize - 1)) {
> - /*
> - * Attach jinode to inode for jbd2 if we do any zeroing of
> - * partial block
> - */
> + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
> ret = ext4_inode_attach_jinode(inode);
> if (ret < 0)
> - goto out_mutex;
> -
> + goto out;
> }
>
> /* Wait all existing dio workers, newcomers will block on i_rwsem */
> @@ -4059,7 +4050,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> ret = file_modified(file);
> if (ret)
> - goto out_mutex;
> + goto out;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -4069,22 +4060,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> ret = ext4_break_layouts(inode);
> if (ret)
> - goto out_dio;
> + goto out_invalidate_lock;
>
> - first_block_offset = round_up(offset, sb->s_blocksize);
> - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
> + ret = ext4_update_disksize_before_punch(inode, offset, length);
> + if (ret)
> + goto out_invalidate_lock;
>
> /* Now release the pages and zero block aligned part of pages*/
> - if (last_block_offset > first_block_offset) {
> - ret = ext4_update_disksize_before_punch(inode, offset, length);
> - if (ret)
> - goto out_dio;
> -
> - ret = ext4_truncate_page_cache_block_range(inode,
> - first_block_offset, last_block_offset + 1);
> - if (ret)
> - goto out_dio;
> - }
> + ret = ext4_truncate_page_cache_block_range(inode, offset, end);
> + if (ret)
> + goto out_invalidate_lock;
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> credits = ext4_writepage_trans_blocks(inode);
> @@ -4094,52 +4079,54 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> ext4_std_error(sb, ret);
> - goto out_dio;
> + goto out_invalidate_lock;
> }
>
> - ret = ext4_zero_partial_blocks(handle, inode, offset,
> - length);
> + ret = ext4_zero_partial_blocks(handle, inode, offset, length);
> if (ret)
> - goto out_stop;
> -
> - first_block = (offset + sb->s_blocksize - 1) >>
> - EXT4_BLOCK_SIZE_BITS(sb);
> - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
> + goto out_handle;
>
> /* If there are blocks to remove, do it */
> - if (stop_block > first_block) {
> - ext4_lblk_t hole_len = stop_block - first_block;
> + start_lblk = EXT4_B_TO_LBLK(inode, offset);
> + end_lblk = end >> inode->i_blkbits;
> +
> + if (end_lblk > start_lblk) {
> + ext4_lblk_t hole_len = end_lblk - start_lblk;
>
> down_write(&EXT4_I(inode)->i_data_sem);
> ext4_discard_preallocations(inode);
>
> - ext4_es_remove_extent(inode, first_block, hole_len);
> + ext4_es_remove_extent(inode, start_lblk, hole_len);
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> - ret = ext4_ext_remove_space(inode, first_block,
> - stop_block - 1);
> + ret = ext4_ext_remove_space(inode, start_lblk,
> + end_lblk - 1);
> else
> - ret = ext4_ind_remove_space(handle, inode, first_block,
> - stop_block);
> + ret = ext4_ind_remove_space(handle, inode, start_lblk,
> + end_lblk);
> + if (ret) {
> + up_write(&EXT4_I(inode)->i_data_sem);
> + goto out_handle;
> + }
>
> - ext4_es_insert_extent(inode, first_block, hole_len, ~0,
> + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
> EXTENT_STATUS_HOLE, 0);
> up_write(&EXT4_I(inode)->i_data_sem);
> }
> - ext4_fc_track_range(handle, inode, first_block, stop_block);
> + ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
> +
> + ret = ext4_mark_inode_dirty(handle, inode);
> + if (unlikely(ret))
> + goto out_handle;
> +
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> if (IS_SYNC(inode))
> ext4_handle_sync(handle);
> -
> - ret2 = ext4_mark_inode_dirty(handle, inode);
> - if (unlikely(ret2))
> - ret = ret2;
> - if (ret >= 0)
> - ext4_update_inode_fsync_trans(handle, inode, 1);
> -out_stop:
> +out_handle:
> ext4_journal_stop(handle);
> -out_dio:
> +out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out_mutex:
> +out:
> inode_unlock(inode);
> return ret;
> }
> --
> 2.46.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 04/10] ext4: refactor ext4_punch_hole()
2024-12-16 1:39 ` [PATCH v4 04/10] ext4: refactor ext4_punch_hole() Zhang Yi
2024-12-16 15:07 ` Jan Kara
@ 2024-12-18 10:17 ` Ojaswin Mujoo
2024-12-18 13:13 ` Zhang Yi
1 sibling, 1 reply; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:17 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:09AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> The current implementation of ext4_punch_hole() contains complex
> position calculations and stale error tags. To improve the code's
> clarity and maintainability, it is essential to clean up the code and
> improve its readability, this can be achieved by: a) simplifying and
> renaming variables; b) eliminating unnecessary position calculations;
> c) writing back all data in data=journal mode, and drop page cache from
> the original offset to the end, rather than using aligned blocks,
> d) renaming the stale error tags.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> fs/ext4/ext4.h | 2 +
> fs/ext4/inode.c | 119 +++++++++++++++++++++---------------------------
> 2 files changed, 55 insertions(+), 66 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 8843929b46ce..8be06d5f5b43 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -367,6 +367,8 @@ struct ext4_io_submit {
> #define EXT4_MAX_BLOCKS(size, offset, blkbits) \
> ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
> blkbits))
> +#define EXT4_B_TO_LBLK(inode, offset) \
> + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
>
> /* Translate a block number to a cluster number */
> #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index a5ba2b71d508..7720d3700b27 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4008,13 +4008,13 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> {
> struct inode *inode = file_inode(file);
> struct super_block *sb = inode->i_sb;
> - ext4_lblk_t first_block, stop_block;
> + ext4_lblk_t start_lblk, end_lblk;
> struct address_space *mapping = inode->i_mapping;
> - loff_t first_block_offset, last_block_offset, max_length;
> - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> + loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
> + loff_t end = offset + length;
> handle_t *handle;
> unsigned int credits;
> - int ret = 0, ret2 = 0;
> + int ret = 0;
>
> trace_ext4_punch_hole(inode, offset, length, 0);
>
> @@ -4022,36 +4022,27 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> /* No need to punch hole beyond i_size */
> if (offset >= inode->i_size)
> - goto out_mutex;
> + goto out;
>
> /*
> - * If the hole extends beyond i_size, set the hole
> - * to end after the page that contains i_size
> + * If the hole extends beyond i_size, set the hole to end after
> + * the page that contains i_size, and also make sure that the hole
> + * within one block before last range.
> */
> - if (offset + length > inode->i_size) {
> - length = inode->i_size +
> - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
> - offset;
> - }
> + if (end > inode->i_size)
> + end = round_up(inode->i_size, PAGE_SIZE);
> + if (end > max_end)
> + end = max_end;
> + length = end - offset;
>
> /*
> - * For punch hole the length + offset needs to be within one block
> - * before last range. Adjust the length if it goes beyond that limit.
> + * Attach jinode to inode for jbd2 if we do any zeroing of partial
> + * block.
> */
> - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
> - if (offset + length > max_length)
> - length = max_length - offset;
> -
> - if (offset & (sb->s_blocksize - 1) ||
> - (offset + length) & (sb->s_blocksize - 1)) {
> - /*
> - * Attach jinode to inode for jbd2 if we do any zeroing of
> - * partial block
> - */
> + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
> ret = ext4_inode_attach_jinode(inode);
> if (ret < 0)
> - goto out_mutex;
> -
> + goto out;
> }
>
> /* Wait all existing dio workers, newcomers will block on i_rwsem */
> @@ -4059,7 +4050,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> ret = file_modified(file);
> if (ret)
> - goto out_mutex;
> + goto out;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -4069,22 +4060,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> ret = ext4_break_layouts(inode);
> if (ret)
> - goto out_dio;
> + goto out_invalidate_lock;
>
> - first_block_offset = round_up(offset, sb->s_blocksize);
> - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
> + ret = ext4_update_disksize_before_punch(inode, offset, length);
Hey Zhang,
The changes look good to me, just one question, why are we doing
disksize update unconditionally now and not only when the range
spans a complete block or more.
(Same question for the next patch refactoring ext4_zero_range())
Regards,
ojaswin
> + if (ret)
> + goto out_invalidate_lock;
>
> /* Now release the pages and zero block aligned part of pages*/
> - if (last_block_offset > first_block_offset) {
> - ret = ext4_update_disksize_before_punch(inode, offset, length);
> - if (ret)
> - goto out_dio;
> -
> - ret = ext4_truncate_page_cache_block_range(inode,
> - first_block_offset, last_block_offset + 1);
> - if (ret)
> - goto out_dio;
> - }
> + ret = ext4_truncate_page_cache_block_range(inode, offset, end);
> + if (ret)
> + goto out_invalidate_lock;
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> credits = ext4_writepage_trans_blocks(inode);
> @@ -4094,52 +4079,54 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> ext4_std_error(sb, ret);
> - goto out_dio;
> + goto out_invalidate_lock;
> }
>
> - ret = ext4_zero_partial_blocks(handle, inode, offset,
> - length);
> + ret = ext4_zero_partial_blocks(handle, inode, offset, length);
> if (ret)
> - goto out_stop;
> -
> - first_block = (offset + sb->s_blocksize - 1) >>
> - EXT4_BLOCK_SIZE_BITS(sb);
> - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
> + goto out_handle;
>
> /* If there are blocks to remove, do it */
> - if (stop_block > first_block) {
> - ext4_lblk_t hole_len = stop_block - first_block;
> + start_lblk = EXT4_B_TO_LBLK(inode, offset);
> + end_lblk = end >> inode->i_blkbits;
> +
> + if (end_lblk > start_lblk) {
> + ext4_lblk_t hole_len = end_lblk - start_lblk;
>
> down_write(&EXT4_I(inode)->i_data_sem);
> ext4_discard_preallocations(inode);
>
> - ext4_es_remove_extent(inode, first_block, hole_len);
> + ext4_es_remove_extent(inode, start_lblk, hole_len);
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> - ret = ext4_ext_remove_space(inode, first_block,
> - stop_block - 1);
> + ret = ext4_ext_remove_space(inode, start_lblk,
> + end_lblk - 1);
> else
> - ret = ext4_ind_remove_space(handle, inode, first_block,
> - stop_block);
> + ret = ext4_ind_remove_space(handle, inode, start_lblk,
> + end_lblk);
> + if (ret) {
> + up_write(&EXT4_I(inode)->i_data_sem);
> + goto out_handle;
> + }
>
> - ext4_es_insert_extent(inode, first_block, hole_len, ~0,
> + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
> EXTENT_STATUS_HOLE, 0);
> up_write(&EXT4_I(inode)->i_data_sem);
> }
> - ext4_fc_track_range(handle, inode, first_block, stop_block);
> + ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
> +
> + ret = ext4_mark_inode_dirty(handle, inode);
> + if (unlikely(ret))
> + goto out_handle;
> +
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> if (IS_SYNC(inode))
> ext4_handle_sync(handle);
> -
> - ret2 = ext4_mark_inode_dirty(handle, inode);
> - if (unlikely(ret2))
> - ret = ret2;
> - if (ret >= 0)
> - ext4_update_inode_fsync_trans(handle, inode, 1);
> -out_stop:
> +out_handle:
> ext4_journal_stop(handle);
> -out_dio:
> +out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out_mutex:
> +out:
> inode_unlock(inode);
> return ret;
> }
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 04/10] ext4: refactor ext4_punch_hole()
2024-12-18 10:17 ` Ojaswin Mujoo
@ 2024-12-18 13:13 ` Zhang Yi
2024-12-19 7:11 ` Ojaswin Mujoo
0 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-18 13:13 UTC (permalink / raw)
To: Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On 2024/12/18 18:17, Ojaswin Mujoo wrote:
> On Mon, Dec 16, 2024 at 09:39:09AM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> The current implementation of ext4_punch_hole() contains complex
>> position calculations and stale error tags. To improve the code's
>> clarity and maintainability, it is essential to clean up the code and
>> improve its readability, this can be achieved by: a) simplifying and
>> renaming variables; b) eliminating unnecessary position calculations;
>> c) writing back all data in data=journal mode, and drop page cache from
>> the original offset to the end, rather than using aligned blocks,
>> d) renaming the stale error tags.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>> fs/ext4/ext4.h | 2 +
>> fs/ext4/inode.c | 119 +++++++++++++++++++++---------------------------
>> 2 files changed, 55 insertions(+), 66 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 8843929b46ce..8be06d5f5b43 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -367,6 +367,8 @@ struct ext4_io_submit {
>> #define EXT4_MAX_BLOCKS(size, offset, blkbits) \
>> ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
>> blkbits))
>> +#define EXT4_B_TO_LBLK(inode, offset) \
>> + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
>>
>> /* Translate a block number to a cluster number */
>> #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index a5ba2b71d508..7720d3700b27 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
[..]
>> @@ -4069,22 +4060,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>>
>> ret = ext4_break_layouts(inode);
>> if (ret)
>> - goto out_dio;
>> + goto out_invalidate_lock;
>>
>> - first_block_offset = round_up(offset, sb->s_blocksize);
>> - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
>> + ret = ext4_update_disksize_before_punch(inode, offset, length);
>
> Hey Zhang,
>
> The changes look good to me, just one question, why are we doing
> disksize update unconditionally now and not only when the range
> spans a complete block or more.
>
I want to simplify the code. We only need to update the disksize when
the end of the punching or zeroing range is >= the EOF and i_disksize
is less than i_size. ext4_update_disksize_before_punch() has already
performed this check and has ruled out most cases. Therefore, I
believe that calling it unconditionally will not incur significant
costs.
Thanks,
Yi.
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 04/10] ext4: refactor ext4_punch_hole()
2024-12-18 13:13 ` Zhang Yi
@ 2024-12-19 7:11 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-19 7:11 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Wed, Dec 18, 2024 at 09:13:46PM +0800, Zhang Yi wrote:
> On 2024/12/18 18:17, Ojaswin Mujoo wrote:
> > On Mon, Dec 16, 2024 at 09:39:09AM +0800, Zhang Yi wrote:
> >> From: Zhang Yi <yi.zhang@huawei.com>
> >>
> >> The current implementation of ext4_punch_hole() contains complex
> >> position calculations and stale error tags. To improve the code's
> >> clarity and maintainability, it is essential to clean up the code and
> >> improve its readability, this can be achieved by: a) simplifying and
> >> renaming variables; b) eliminating unnecessary position calculations;
> >> c) writing back all data in data=journal mode, and drop page cache from
> >> the original offset to the end, rather than using aligned blocks,
> >> d) renaming the stale error tags.
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >> fs/ext4/ext4.h | 2 +
> >> fs/ext4/inode.c | 119 +++++++++++++++++++++---------------------------
> >> 2 files changed, 55 insertions(+), 66 deletions(-)
> >>
> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> >> index 8843929b46ce..8be06d5f5b43 100644
> >> --- a/fs/ext4/ext4.h
> >> +++ b/fs/ext4/ext4.h
> >> @@ -367,6 +367,8 @@ struct ext4_io_submit {
> >> #define EXT4_MAX_BLOCKS(size, offset, blkbits) \
> >> ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
> >> blkbits))
> >> +#define EXT4_B_TO_LBLK(inode, offset) \
> >> + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
> >>
> >> /* Translate a block number to a cluster number */
> >> #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index a5ba2b71d508..7720d3700b27 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> [..]
> >> @@ -4069,22 +4060,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> >>
> >> ret = ext4_break_layouts(inode);
> >> if (ret)
> >> - goto out_dio;
> >> + goto out_invalidate_lock;
> >>
> >> - first_block_offset = round_up(offset, sb->s_blocksize);
> >> - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
> >> + ret = ext4_update_disksize_before_punch(inode, offset, length);
> >
> > Hey Zhang,
> >
> > The changes look good to me, just one question, why are we doing
> > disksize update unconditionally now and not only when the range
> > spans a complete block or more.
> >
>
> I want to simplify the code. We only need to update the disksize when
> the end of the punching or zeroing range is >= the EOF and i_disksize
> is less than i_size. ext4_update_disksize_before_punch() has already
> performed this check and has ruled out most cases. Therefore, I
> believe that calling it unconditionally will not incur significant
> costs.
Okay sure, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Regards,
ojaswin
>
> Thanks,
> Yi.
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 05/10] ext4: refactor ext4_zero_range()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (3 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 04/10] ext4: refactor ext4_punch_hole() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-16 15:24 ` Jan Kara
2024-12-19 7:12 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 06/10] ext4: refactor ext4_collapse_range() Zhang Yi
` (4 subsequent siblings)
9 siblings, 2 replies; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
The current implementation of ext4_zero_range() contains complex
position calculations and stale error tags. To improve the code's
clarity and maintainability, it is essential to clean up the code and
improve its readability, this can be achieved by: a) simplifying and
renaming variables, making the style the same as ext4_punch_hole(); b)
eliminating unnecessary position calculations, writing back all data in
data=journal mode, and drop page cache from the original offset to the
end, rather than using aligned blocks; c) renaming the stale out_mutex
tags.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents.c | 142 +++++++++++++++++++---------------------------
1 file changed, 57 insertions(+), 85 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7fb38aab241d..97ad6fea58d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4570,40 +4570,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
- unsigned int max_blocks;
loff_t new_size = 0;
- int ret = 0;
- int flags;
- int credits;
- int partial_begin, partial_end;
- loff_t start, end;
- ext4_lblk_t lblk;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
+ unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
+ int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
- /*
- * Round up offset. This is not fallocate, we need to zero out
- * blocks, so convert interior block aligned part of the range to
- * unwritten and possibly manually zero out unaligned parts of the
- * range. Here, start and partial_begin are inclusive, end and
- * partial_end are exclusive.
- */
- start = round_up(offset, 1 << blkbits);
- end = round_down((offset + len), 1 << blkbits);
-
- if (start < offset || end > offset + len)
- return -EINVAL;
- partial_begin = offset & ((1 << blkbits) - 1);
- partial_end = (offset + len) & ((1 << blkbits) - 1);
-
- lblk = start >> blkbits;
- max_blocks = (end >> blkbits);
- if (max_blocks < lblk)
- max_blocks = 0;
- else
- max_blocks -= lblk;
-
inode_lock(inode);
/*
@@ -4611,77 +4586,70 @@ static long ext4_zero_range(struct file *file, loff_t offset,
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
- goto out_mutex;
+ goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out_mutex;
+ goto out;
}
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
- goto out_mutex;
-
- /* Preallocate the range including the unaligned edges */
- if (partial_begin || partial_end) {
- ret = ext4_alloc_file_blocks(file,
- round_down(offset, 1 << blkbits) >> blkbits,
- (round_up((offset + len), 1 << blkbits) -
- round_down(offset, 1 << blkbits)) >> blkbits,
- new_size, flags);
- if (ret)
- goto out_mutex;
+ goto out;
- }
+ /*
+ * Prevent page faults from reinstantiating pages we have released
+ * from page cache.
+ */
+ filemap_invalidate_lock(mapping);
- /* Zero range excluding the unaligned edges */
- if (max_blocks > 0) {
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE);
+ ret = ext4_break_layouts(inode);
+ if (ret)
+ goto out_invalidate_lock;
- /*
- * Prevent page faults from reinstantiating pages we have
- * released from page cache.
- */
- filemap_invalidate_lock(mapping);
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+ /* Preallocate the range including the unaligned edges */
+ if (!IS_ALIGNED(offset | end, blocksize)) {
+ ext4_lblk_t alloc_lblk = offset >> blkbits;
+ ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
- ret = ext4_break_layouts(inode);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
+ ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+ new_size, flags);
+ if (ret)
+ goto out_invalidate_lock;
+ }
- ret = ext4_update_disksize_before_punch(inode, offset, len);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret)
+ goto out_invalidate_lock;
- /* Now release the pages and zero block aligned part of pages */
- ret = ext4_truncate_page_cache_block_range(inode, start, end);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ goto out_invalidate_lock;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
- flags);
- filemap_invalidate_unlock(mapping);
+ /* Zero range excluding the unaligned edges */
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> blkbits;
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t zero_blks = end_lblk - start_lblk;
+
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+ new_size, flags);
if (ret)
- goto out_mutex;
+ goto out_invalidate_lock;
}
- if (!partial_begin && !partial_end)
- goto out_mutex;
+ /* Finish zeroing out if it doesn't contain partial block */
+ if (IS_ALIGNED(offset | end, blocksize))
+ goto out_invalidate_lock;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4694,25 +4662,29 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_mutex;
+ goto out_invalidate_lock;
}
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret)
+ goto out_handle;
+
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_mutex:
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
+out:
inode_unlock(inode);
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 05/10] ext4: refactor ext4_zero_range()
2024-12-16 1:39 ` [PATCH v4 05/10] ext4: refactor ext4_zero_range() Zhang Yi
@ 2024-12-16 15:24 ` Jan Kara
2024-12-19 7:12 ` Ojaswin Mujoo
1 sibling, 0 replies; 35+ messages in thread
From: Jan Kara @ 2024-12-16 15:24 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon 16-12-24 09:39:10, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> The current implementation of ext4_zero_range() contains complex
> position calculations and stale error tags. To improve the code's
> clarity and maintainability, it is essential to clean up the code and
> improve its readability, this can be achieved by: a) simplifying and
> renaming variables, making the style the same as ext4_punch_hole(); b)
> eliminating unnecessary position calculations, writing back all data in
> data=journal mode, and drop page cache from the original offset to the
> end, rather than using aligned blocks; c) renaming the stale out_mutex
> tags.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/extents.c | 142 +++++++++++++++++++---------------------------
> 1 file changed, 57 insertions(+), 85 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 7fb38aab241d..97ad6fea58d3 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4570,40 +4570,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> struct inode *inode = file_inode(file);
> struct address_space *mapping = file->f_mapping;
> handle_t *handle = NULL;
> - unsigned int max_blocks;
> loff_t new_size = 0;
> - int ret = 0;
> - int flags;
> - int credits;
> - int partial_begin, partial_end;
> - loff_t start, end;
> - ext4_lblk_t lblk;
> + loff_t end = offset + len;
> + ext4_lblk_t start_lblk, end_lblk;
> + unsigned int blocksize = i_blocksize(inode);
> unsigned int blkbits = inode->i_blkbits;
> + int ret, flags, credits;
>
> trace_ext4_zero_range(inode, offset, len, mode);
>
> - /*
> - * Round up offset. This is not fallocate, we need to zero out
> - * blocks, so convert interior block aligned part of the range to
> - * unwritten and possibly manually zero out unaligned parts of the
> - * range. Here, start and partial_begin are inclusive, end and
> - * partial_end are exclusive.
> - */
> - start = round_up(offset, 1 << blkbits);
> - end = round_down((offset + len), 1 << blkbits);
> -
> - if (start < offset || end > offset + len)
> - return -EINVAL;
> - partial_begin = offset & ((1 << blkbits) - 1);
> - partial_end = (offset + len) & ((1 << blkbits) - 1);
> -
> - lblk = start >> blkbits;
> - max_blocks = (end >> blkbits);
> - if (max_blocks < lblk)
> - max_blocks = 0;
> - else
> - max_blocks -= lblk;
> -
> inode_lock(inode);
>
> /*
> @@ -4611,77 +4586,70 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> */
> if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> ret = -EOPNOTSUPP;
> - goto out_mutex;
> + goto out;
> }
>
> if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> - (offset + len > inode->i_size ||
> - offset + len > EXT4_I(inode)->i_disksize)) {
> - new_size = offset + len;
> + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
> + new_size = end;
> ret = inode_newsize_ok(inode, new_size);
> if (ret)
> - goto out_mutex;
> + goto out;
> }
>
> - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> -
> /* Wait all existing dio workers, newcomers will block on i_rwsem */
> inode_dio_wait(inode);
>
> ret = file_modified(file);
> if (ret)
> - goto out_mutex;
> -
> - /* Preallocate the range including the unaligned edges */
> - if (partial_begin || partial_end) {
> - ret = ext4_alloc_file_blocks(file,
> - round_down(offset, 1 << blkbits) >> blkbits,
> - (round_up((offset + len), 1 << blkbits) -
> - round_down(offset, 1 << blkbits)) >> blkbits,
> - new_size, flags);
> - if (ret)
> - goto out_mutex;
> + goto out;
>
> - }
> + /*
> + * Prevent page faults from reinstantiating pages we have released
> + * from page cache.
> + */
> + filemap_invalidate_lock(mapping);
>
> - /* Zero range excluding the unaligned edges */
> - if (max_blocks > 0) {
> - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
> - EXT4_EX_NOCACHE);
> + ret = ext4_break_layouts(inode);
> + if (ret)
> + goto out_invalidate_lock;
>
> - /*
> - * Prevent page faults from reinstantiating pages we have
> - * released from page cache.
> - */
> - filemap_invalidate_lock(mapping);
> + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> + /* Preallocate the range including the unaligned edges */
> + if (!IS_ALIGNED(offset | end, blocksize)) {
> + ext4_lblk_t alloc_lblk = offset >> blkbits;
> + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
>
> - ret = ext4_break_layouts(inode);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
> + new_size, flags);
> + if (ret)
> + goto out_invalidate_lock;
> + }
>
> - ret = ext4_update_disksize_before_punch(inode, offset, len);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + ret = ext4_update_disksize_before_punch(inode, offset, len);
> + if (ret)
> + goto out_invalidate_lock;
>
> - /* Now release the pages and zero block aligned part of pages */
> - ret = ext4_truncate_page_cache_block_range(inode, start, end);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + /* Now release the pages and zero block aligned part of pages */
> + ret = ext4_truncate_page_cache_block_range(inode, offset, end);
> + if (ret)
> + goto out_invalidate_lock;
>
> - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> - flags);
> - filemap_invalidate_unlock(mapping);
> + /* Zero range excluding the unaligned edges */
> + start_lblk = EXT4_B_TO_LBLK(inode, offset);
> + end_lblk = end >> blkbits;
> + if (end_lblk > start_lblk) {
> + ext4_lblk_t zero_blks = end_lblk - start_lblk;
> +
> + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
> + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
> + new_size, flags);
> if (ret)
> - goto out_mutex;
> + goto out_invalidate_lock;
> }
> - if (!partial_begin && !partial_end)
> - goto out_mutex;
> + /* Finish zeroing out if it doesn't contain partial block */
> + if (IS_ALIGNED(offset | end, blocksize))
> + goto out_invalidate_lock;
>
> /*
> * In worst case we have to writeout two nonadjacent unwritten
> @@ -4694,25 +4662,29 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> ext4_std_error(inode->i_sb, ret);
> - goto out_mutex;
> + goto out_invalidate_lock;
> }
>
> + /* Zero out partial block at the edges of the range */
> + ret = ext4_zero_partial_blocks(handle, inode, offset, len);
> + if (ret)
> + goto out_handle;
> +
> if (new_size)
> ext4_update_inode_size(inode, new_size);
> ret = ext4_mark_inode_dirty(handle, inode);
> if (unlikely(ret))
> goto out_handle;
> - /* Zero out partial block at the edges of the range */
> - ret = ext4_zero_partial_blocks(handle, inode, offset, len);
> - if (ret >= 0)
> - ext4_update_inode_fsync_trans(handle, inode, 1);
>
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> if (file->f_flags & O_SYNC)
> ext4_handle_sync(handle);
>
> out_handle:
> ext4_journal_stop(handle);
> -out_mutex:
> +out_invalidate_lock:
> + filemap_invalidate_unlock(mapping);
> +out:
> inode_unlock(inode);
> return ret;
> }
> --
> 2.46.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH v4 05/10] ext4: refactor ext4_zero_range()
2024-12-16 1:39 ` [PATCH v4 05/10] ext4: refactor ext4_zero_range() Zhang Yi
2024-12-16 15:24 ` Jan Kara
@ 2024-12-19 7:12 ` Ojaswin Mujoo
1 sibling, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-19 7:12 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:10AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> The current implementation of ext4_zero_range() contains complex
> position calculations and stale error tags. To improve the code's
> clarity and maintainability, it is essential to clean up the code and
> improve its readability, this can be achieved by: a) simplifying and
> renaming variables, making the style the same as ext4_punch_hole(); b)
> eliminating unnecessary position calculations, writing back all data in
> data=journal mode, and drop page cache from the original offset to the
> end, rather than using aligned blocks; c) renaming the stale out_mutex
> tags.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good Zhang, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Regards,
ojaswin
> ---
> fs/ext4/extents.c | 142 +++++++++++++++++++---------------------------
> 1 file changed, 57 insertions(+), 85 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 7fb38aab241d..97ad6fea58d3 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4570,40 +4570,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> struct inode *inode = file_inode(file);
> struct address_space *mapping = file->f_mapping;
> handle_t *handle = NULL;
> - unsigned int max_blocks;
> loff_t new_size = 0;
> - int ret = 0;
> - int flags;
> - int credits;
> - int partial_begin, partial_end;
> - loff_t start, end;
> - ext4_lblk_t lblk;
> + loff_t end = offset + len;
> + ext4_lblk_t start_lblk, end_lblk;
> + unsigned int blocksize = i_blocksize(inode);
> unsigned int blkbits = inode->i_blkbits;
> + int ret, flags, credits;
>
> trace_ext4_zero_range(inode, offset, len, mode);
>
> - /*
> - * Round up offset. This is not fallocate, we need to zero out
> - * blocks, so convert interior block aligned part of the range to
> - * unwritten and possibly manually zero out unaligned parts of the
> - * range. Here, start and partial_begin are inclusive, end and
> - * partial_end are exclusive.
> - */
> - start = round_up(offset, 1 << blkbits);
> - end = round_down((offset + len), 1 << blkbits);
> -
> - if (start < offset || end > offset + len)
> - return -EINVAL;
> - partial_begin = offset & ((1 << blkbits) - 1);
> - partial_end = (offset + len) & ((1 << blkbits) - 1);
> -
> - lblk = start >> blkbits;
> - max_blocks = (end >> blkbits);
> - if (max_blocks < lblk)
> - max_blocks = 0;
> - else
> - max_blocks -= lblk;
> -
> inode_lock(inode);
>
> /*
> @@ -4611,77 +4586,70 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> */
> if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> ret = -EOPNOTSUPP;
> - goto out_mutex;
> + goto out;
> }
>
> if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> - (offset + len > inode->i_size ||
> - offset + len > EXT4_I(inode)->i_disksize)) {
> - new_size = offset + len;
> + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
> + new_size = end;
> ret = inode_newsize_ok(inode, new_size);
> if (ret)
> - goto out_mutex;
> + goto out;
> }
>
> - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> -
> /* Wait all existing dio workers, newcomers will block on i_rwsem */
> inode_dio_wait(inode);
>
> ret = file_modified(file);
> if (ret)
> - goto out_mutex;
> -
> - /* Preallocate the range including the unaligned edges */
> - if (partial_begin || partial_end) {
> - ret = ext4_alloc_file_blocks(file,
> - round_down(offset, 1 << blkbits) >> blkbits,
> - (round_up((offset + len), 1 << blkbits) -
> - round_down(offset, 1 << blkbits)) >> blkbits,
> - new_size, flags);
> - if (ret)
> - goto out_mutex;
> + goto out;
>
> - }
> + /*
> + * Prevent page faults from reinstantiating pages we have released
> + * from page cache.
> + */
> + filemap_invalidate_lock(mapping);
>
> - /* Zero range excluding the unaligned edges */
> - if (max_blocks > 0) {
> - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
> - EXT4_EX_NOCACHE);
> + ret = ext4_break_layouts(inode);
> + if (ret)
> + goto out_invalidate_lock;
>
> - /*
> - * Prevent page faults from reinstantiating pages we have
> - * released from page cache.
> - */
> - filemap_invalidate_lock(mapping);
> + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> + /* Preallocate the range including the unaligned edges */
> + if (!IS_ALIGNED(offset | end, blocksize)) {
> + ext4_lblk_t alloc_lblk = offset >> blkbits;
> + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
>
> - ret = ext4_break_layouts(inode);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
> + new_size, flags);
> + if (ret)
> + goto out_invalidate_lock;
> + }
>
> - ret = ext4_update_disksize_before_punch(inode, offset, len);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + ret = ext4_update_disksize_before_punch(inode, offset, len);
> + if (ret)
> + goto out_invalidate_lock;
>
> - /* Now release the pages and zero block aligned part of pages */
> - ret = ext4_truncate_page_cache_block_range(inode, start, end);
> - if (ret) {
> - filemap_invalidate_unlock(mapping);
> - goto out_mutex;
> - }
> + /* Now release the pages and zero block aligned part of pages */
> + ret = ext4_truncate_page_cache_block_range(inode, offset, end);
> + if (ret)
> + goto out_invalidate_lock;
>
> - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> - flags);
> - filemap_invalidate_unlock(mapping);
> + /* Zero range excluding the unaligned edges */
> + start_lblk = EXT4_B_TO_LBLK(inode, offset);
> + end_lblk = end >> blkbits;
> + if (end_lblk > start_lblk) {
> + ext4_lblk_t zero_blks = end_lblk - start_lblk;
> +
> + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
> + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
> + new_size, flags);
> if (ret)
> - goto out_mutex;
> + goto out_invalidate_lock;
> }
> - if (!partial_begin && !partial_end)
> - goto out_mutex;
> + /* Finish zeroing out if it doesn't contain partial block */
> + if (IS_ALIGNED(offset | end, blocksize))
> + goto out_invalidate_lock;
>
> /*
> * In worst case we have to writeout two nonadjacent unwritten
> @@ -4694,25 +4662,29 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> ext4_std_error(inode->i_sb, ret);
> - goto out_mutex;
> + goto out_invalidate_lock;
> }
>
> + /* Zero out partial block at the edges of the range */
> + ret = ext4_zero_partial_blocks(handle, inode, offset, len);
> + if (ret)
> + goto out_handle;
> +
> if (new_size)
> ext4_update_inode_size(inode, new_size);
> ret = ext4_mark_inode_dirty(handle, inode);
> if (unlikely(ret))
> goto out_handle;
> - /* Zero out partial block at the edges of the range */
> - ret = ext4_zero_partial_blocks(handle, inode, offset, len);
> - if (ret >= 0)
> - ext4_update_inode_fsync_trans(handle, inode, 1);
>
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> if (file->f_flags & O_SYNC)
> ext4_handle_sync(handle);
>
> out_handle:
> ext4_journal_stop(handle);
> -out_mutex:
> +out_invalidate_lock:
> + filemap_invalidate_unlock(mapping);
> +out:
> inode_unlock(inode);
> return ret;
> }
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 06/10] ext4: refactor ext4_collapse_range()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (4 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 05/10] ext4: refactor ext4_zero_range() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-18 10:18 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 07/10] ext4: refactor ext4_insert_range() Zhang Yi
` (3 subsequent siblings)
9 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
Simplify ext4_collapse_range() and align its code style with that of
ext4_zero_range() and ext4_punch_hole(). Refactor it by: a) renaming
variables, b) removing redundant input parameter checks and moving
the remaining checks under i_rwsem in preparation for future
refactoring, and c) renaming the three stale error tags.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 103 +++++++++++++++++++++-------------------------
1 file changed, 48 insertions(+), 55 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 97ad6fea58d3..8a0a720803a8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5292,43 +5292,36 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t punch_start, punch_stop;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t new_size, ioffset;
+ loff_t start, new_size;
int ret;
- /*
- * We need to test this early because xfstests assumes that a
- * collapse range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support collapse range.
- */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- return -EOPNOTSUPP;
+ trace_ext4_collapse_range(inode, offset, len);
- /* Collapse range works only on fs cluster size aligned regions. */
- if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
- return -EINVAL;
+ inode_lock(inode);
- trace_ext4_collapse_range(inode, offset, len);
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
- punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+ /* Collapse range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
+ ret = -EINVAL;
+ goto out;
+ }
- inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= inode->i_size) {
+ if (end >= inode->i_size) {
ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
+ goto out;
}
/* Wait for existing dio to complete */
@@ -5336,7 +5329,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
ret = file_modified(file);
if (ret)
- goto out_mutex;
+ goto out;
/*
* Prevent page faults from reinstantiating pages we have released from
@@ -5346,55 +5339,52 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
ret = ext4_break_layouts(inode);
if (ret)
- goto out_mmap;
+ goto out_invalidate_lock;
/*
+ * Write tail of the last page before removed range and data that
+ * will be shifted since they will get removed from the page cache
+ * below. We are also protected from pages becoming dirty by
+ * i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /*
- * Write tail of the last page before removed range since it will get
- * removed from the page cache below.
- */
- ret = filemap_write_and_wait_range(mapping, ioffset, offset);
- if (ret)
- goto out_mmap;
- /*
- * Write data that will be shifted to preserve them when discarding
- * page cache below. We are also protected from pages becoming dirty
- * by i_rwsem and invalidate_lock.
- */
- ret = filemap_write_and_wait_range(mapping, offset + len,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, offset);
+ if (!ret)
+ ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ goto out_invalidate_lock;
+
+ truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_mmap;
+ goto out_invalidate_lock;
}
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+ start_lblk = offset >> inode->i_blkbits;
+ end_lblk = (offset + len) >> inode->i_blkbits;
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
ext4_discard_preallocations(inode);
- ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start, SHIFT_LEFT);
+ ret = ext4_ext_shift_extents(inode, handle, end_lblk,
+ end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
new_size = inode->i_size - len;
@@ -5402,16 +5392,19 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_handle;
+
ext4_update_inode_fsync_trans(handle, inode, 1);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
+out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out_mutex:
+out:
inode_unlock(inode);
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 06/10] ext4: refactor ext4_collapse_range()
2024-12-16 1:39 ` [PATCH v4 06/10] ext4: refactor ext4_collapse_range() Zhang Yi
@ 2024-12-18 10:18 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:18 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:11AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Simplify ext4_collapse_range() and align its code style with that of
> ext4_zero_range() and ext4_punch_hole(). Refactor it by: a) renaming
> variables, b) removing redundant input parameter checks and moving
> the remaining checks under i_rwsem in preparation for future
> refactoring, and c) renaming the three stale error tags.
Looks good, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
> fs/ext4/extents.c | 103 +++++++++++++++++++++-------------------------
> 1 file changed, 48 insertions(+), 55 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 97ad6fea58d3..8a0a720803a8 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -5292,43 +5292,36 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> struct inode *inode = file_inode(file);
> struct super_block *sb = inode->i_sb;
> struct address_space *mapping = inode->i_mapping;
> - ext4_lblk_t punch_start, punch_stop;
> + loff_t end = offset + len;
> + ext4_lblk_t start_lblk, end_lblk;
> handle_t *handle;
> unsigned int credits;
> - loff_t new_size, ioffset;
> + loff_t start, new_size;
> int ret;
>
> - /*
> - * We need to test this early because xfstests assumes that a
> - * collapse range of (0, 1) will return EOPNOTSUPP if the file
> - * system does not support collapse range.
> - */
> - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> - return -EOPNOTSUPP;
> + trace_ext4_collapse_range(inode, offset, len);
>
> - /* Collapse range works only on fs cluster size aligned regions. */
> - if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
> - return -EINVAL;
> + inode_lock(inode);
>
> - trace_ext4_collapse_range(inode, offset, len);
> + /* Currently just for extent based files */
> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
>
> - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
> - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
> + /* Collapse range works only on fs cluster size aligned regions. */
> + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
> + ret = -EINVAL;
> + goto out;
> + }
>
> - inode_lock(inode);
> /*
> * There is no need to overlap collapse range with EOF, in which case
> * it is effectively a truncate operation
> */
> - if (offset + len >= inode->i_size) {
> + if (end >= inode->i_size) {
> ret = -EINVAL;
> - goto out_mutex;
> - }
> -
> - /* Currently just for extent based files */
> - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
> - ret = -EOPNOTSUPP;
> - goto out_mutex;
> + goto out;
> }
>
> /* Wait for existing dio to complete */
> @@ -5336,7 +5329,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>
> ret = file_modified(file);
> if (ret)
> - goto out_mutex;
> + goto out;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -5346,55 +5339,52 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>
> ret = ext4_break_layouts(inode);
> if (ret)
> - goto out_mmap;
> + goto out_invalidate_lock;
>
> /*
> + * Write tail of the last page before removed range and data that
> + * will be shifted since they will get removed from the page cache
> + * below. We are also protected from pages becoming dirty by
> + * i_rwsem and invalidate_lock.
> * Need to round down offset to be aligned with page size boundary
> * for page size > block size.
> */
> - ioffset = round_down(offset, PAGE_SIZE);
> - /*
> - * Write tail of the last page before removed range since it will get
> - * removed from the page cache below.
> - */
> - ret = filemap_write_and_wait_range(mapping, ioffset, offset);
> - if (ret)
> - goto out_mmap;
> - /*
> - * Write data that will be shifted to preserve them when discarding
> - * page cache below. We are also protected from pages becoming dirty
> - * by i_rwsem and invalidate_lock.
> - */
> - ret = filemap_write_and_wait_range(mapping, offset + len,
> - LLONG_MAX);
> + start = round_down(offset, PAGE_SIZE);
> + ret = filemap_write_and_wait_range(mapping, start, offset);
> + if (!ret)
> + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
> if (ret)
> - goto out_mmap;
> - truncate_pagecache(inode, ioffset);
> + goto out_invalidate_lock;
> +
> + truncate_pagecache(inode, start);
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> - goto out_mmap;
> + goto out_invalidate_lock;
> }
> ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
>
> + start_lblk = offset >> inode->i_blkbits;
> + end_lblk = (offset + len) >> inode->i_blkbits;
> +
> down_write(&EXT4_I(inode)->i_data_sem);
> ext4_discard_preallocations(inode);
> - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
> + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
>
> - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
> + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
> if (ret) {
> up_write(&EXT4_I(inode)->i_data_sem);
> - goto out_stop;
> + goto out_handle;
> }
> ext4_discard_preallocations(inode);
>
> - ret = ext4_ext_shift_extents(inode, handle, punch_stop,
> - punch_stop - punch_start, SHIFT_LEFT);
> + ret = ext4_ext_shift_extents(inode, handle, end_lblk,
> + end_lblk - start_lblk, SHIFT_LEFT);
> if (ret) {
> up_write(&EXT4_I(inode)->i_data_sem);
> - goto out_stop;
> + goto out_handle;
> }
>
> new_size = inode->i_size - len;
> @@ -5402,16 +5392,19 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> EXT4_I(inode)->i_disksize = new_size;
>
> up_write(&EXT4_I(inode)->i_data_sem);
> - if (IS_SYNC(inode))
> - ext4_handle_sync(handle);
> ret = ext4_mark_inode_dirty(handle, inode);
> + if (ret)
> + goto out_handle;
> +
> ext4_update_inode_fsync_trans(handle, inode, 1);
> + if (IS_SYNC(inode))
> + ext4_handle_sync(handle);
>
> -out_stop:
> +out_handle:
> ext4_journal_stop(handle);
> -out_mmap:
> +out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out_mutex:
> +out:
> inode_unlock(inode);
> return ret;
> }
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 07/10] ext4: refactor ext4_insert_range()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (5 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 06/10] ext4: refactor ext4_collapse_range() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-18 10:18 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 08/10] ext4: factor out ext4_do_fallocate() Zhang Yi
` (2 subsequent siblings)
9 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
Simplify ext4_insert_range() and align its code style with that of
ext4_collapse_range(). Refactor it by: a) renaming variables, b)
removing redundant input parameter checks and moving the remaining
checks under i_rwsem in preparation for future refactoring, and c)
renaming the three stale error tags.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 101 ++++++++++++++++++++++------------------------
1 file changed, 48 insertions(+), 53 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8a0a720803a8..be44dd7aacdb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5425,45 +5425,37 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
- ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret = 0, depth, split_flag = 0;
- loff_t ioffset;
-
- /*
- * We need to test this early because xfstests assumes that an
- * insert range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support insert range.
- */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- return -EOPNOTSUPP;
-
- /* Insert range works only on fs cluster size aligned regions. */
- if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
- return -EINVAL;
+ int ret, depth, split_flag = 0;
+ loff_t start;
trace_ext4_insert_range(inode, offset, len);
- offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
-
inode_lock(inode);
+
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ret = -EOPNOTSUPP;
- goto out_mutex;
+ goto out;
}
- /* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out_mutex;
+ /* Insert range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
+ ret = -EINVAL;
+ goto out;
}
/* Offset must be less than i_size */
if (offset >= inode->i_size) {
ret = -EINVAL;
- goto out_mutex;
+ goto out;
+ }
+
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size) {
+ ret = -EFBIG;
+ goto out;
}
/* Wait for existing dio to complete */
@@ -5471,7 +5463,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ret = file_modified(file);
if (ret)
- goto out_mutex;
+ goto out;
/*
* Prevent page faults from reinstantiating pages we have released from
@@ -5481,25 +5473,24 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ret = ext4_break_layouts(inode);
if (ret)
- goto out_mmap;
+ goto out_invalidate_lock;
/*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
+ * Write out all dirty pages. Need to round down to align start offset
+ * to page size boundary for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ goto out_invalidate_lock;
+
+ truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_mmap;
+ goto out_invalidate_lock;
}
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
@@ -5508,16 +5499,19 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize += len;
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
- goto out_stop;
+ goto out_handle;
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = len >> inode->i_blkbits;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
depth = ext_depth(inode);
@@ -5527,16 +5521,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
- * If offset_lblk is not the starting block of extent, split
- * the extent @offset_lblk
+ * If start_lblk is not the starting block of extent, split
+ * the extent @start_lblk
*/
- if ((offset_lblk > ee_start_lblk) &&
- (offset_lblk < (ee_start_lblk + ee_len))) {
+ if ((start_lblk > ee_start_lblk) &&
+ (start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
- offset_lblk, split_flag,
+ start_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -5545,31 +5539,32 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
}
ext4_free_ext_path(path);
- ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
+out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out_mutex:
+out:
inode_unlock(inode);
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 07/10] ext4: refactor ext4_insert_range()
2024-12-16 1:39 ` [PATCH v4 07/10] ext4: refactor ext4_insert_range() Zhang Yi
@ 2024-12-18 10:18 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:18 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:12AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Simplify ext4_insert_range() and align its code style with that of
> ext4_collapse_range(). Refactor it by: a) renaming variables, b)
> removing redundant input parameter checks and moving the remaining
> checks under i_rwsem in preparation for future refactoring, and c)
> renaming the three stale error tags.
Looks good, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
> fs/ext4/extents.c | 101 ++++++++++++++++++++++------------------------
> 1 file changed, 48 insertions(+), 53 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 8a0a720803a8..be44dd7aacdb 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -5425,45 +5425,37 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> handle_t *handle;
> struct ext4_ext_path *path;
> struct ext4_extent *extent;
> - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
> + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
> unsigned int credits, ee_len;
> - int ret = 0, depth, split_flag = 0;
> - loff_t ioffset;
> -
> - /*
> - * We need to test this early because xfstests assumes that an
> - * insert range of (0, 1) will return EOPNOTSUPP if the file
> - * system does not support insert range.
> - */
> - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> - return -EOPNOTSUPP;
> -
> - /* Insert range works only on fs cluster size aligned regions. */
> - if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
> - return -EINVAL;
> + int ret, depth, split_flag = 0;
> + loff_t start;
>
> trace_ext4_insert_range(inode, offset, len);
>
> - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
> - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
> -
> inode_lock(inode);
> +
> /* Currently just for extent based files */
> if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
> ret = -EOPNOTSUPP;
> - goto out_mutex;
> + goto out;
> }
>
> - /* Check whether the maximum file size would be exceeded */
> - if (len > inode->i_sb->s_maxbytes - inode->i_size) {
> - ret = -EFBIG;
> - goto out_mutex;
> + /* Insert range works only on fs cluster size aligned regions. */
> + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
> + ret = -EINVAL;
> + goto out;
> }
>
> /* Offset must be less than i_size */
> if (offset >= inode->i_size) {
> ret = -EINVAL;
> - goto out_mutex;
> + goto out;
> + }
> +
> + /* Check whether the maximum file size would be exceeded */
> + if (len > inode->i_sb->s_maxbytes - inode->i_size) {
> + ret = -EFBIG;
> + goto out;
> }
>
> /* Wait for existing dio to complete */
> @@ -5471,7 +5463,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
>
> ret = file_modified(file);
> if (ret)
> - goto out_mutex;
> + goto out;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -5481,25 +5473,24 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
>
> ret = ext4_break_layouts(inode);
> if (ret)
> - goto out_mmap;
> + goto out_invalidate_lock;
>
> /*
> - * Need to round down to align start offset to page size boundary
> - * for page size > block size.
> + * Write out all dirty pages. Need to round down to align start offset
> + * to page size boundary for page size > block size.
> */
> - ioffset = round_down(offset, PAGE_SIZE);
> - /* Write out all dirty pages */
> - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
> - LLONG_MAX);
> + start = round_down(offset, PAGE_SIZE);
> + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
> if (ret)
> - goto out_mmap;
> - truncate_pagecache(inode, ioffset);
> + goto out_invalidate_lock;
> +
> + truncate_pagecache(inode, start);
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> - goto out_mmap;
> + goto out_invalidate_lock;
> }
> ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
>
> @@ -5508,16 +5499,19 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> EXT4_I(inode)->i_disksize += len;
> ret = ext4_mark_inode_dirty(handle, inode);
> if (ret)
> - goto out_stop;
> + goto out_handle;
> +
> + start_lblk = offset >> inode->i_blkbits;
> + len_lblk = len >> inode->i_blkbits;
>
> down_write(&EXT4_I(inode)->i_data_sem);
> ext4_discard_preallocations(inode);
>
> - path = ext4_find_extent(inode, offset_lblk, NULL, 0);
> + path = ext4_find_extent(inode, start_lblk, NULL, 0);
> if (IS_ERR(path)) {
> up_write(&EXT4_I(inode)->i_data_sem);
> ret = PTR_ERR(path);
> - goto out_stop;
> + goto out_handle;
> }
>
> depth = ext_depth(inode);
> @@ -5527,16 +5521,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> ee_len = ext4_ext_get_actual_len(extent);
>
> /*
> - * If offset_lblk is not the starting block of extent, split
> - * the extent @offset_lblk
> + * If start_lblk is not the starting block of extent, split
> + * the extent @start_lblk
> */
> - if ((offset_lblk > ee_start_lblk) &&
> - (offset_lblk < (ee_start_lblk + ee_len))) {
> + if ((start_lblk > ee_start_lblk) &&
> + (start_lblk < (ee_start_lblk + ee_len))) {
> if (ext4_ext_is_unwritten(extent))
> split_flag = EXT4_EXT_MARK_UNWRIT1 |
> EXT4_EXT_MARK_UNWRIT2;
> path = ext4_split_extent_at(handle, inode, path,
> - offset_lblk, split_flag,
> + start_lblk, split_flag,
> EXT4_EX_NOCACHE |
> EXT4_GET_BLOCKS_PRE_IO |
> EXT4_GET_BLOCKS_METADATA_NOFAIL);
> @@ -5545,31 +5539,32 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> if (IS_ERR(path)) {
> up_write(&EXT4_I(inode)->i_data_sem);
> ret = PTR_ERR(path);
> - goto out_stop;
> + goto out_handle;
> }
> }
>
> ext4_free_ext_path(path);
> - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
> + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
>
> /*
> - * if offset_lblk lies in a hole which is at start of file, use
> + * if start_lblk lies in a hole which is at start of file, use
> * ee_start_lblk to shift extents
> */
> ret = ext4_ext_shift_extents(inode, handle,
> - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
> -
> + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
> up_write(&EXT4_I(inode)->i_data_sem);
> + if (ret)
> + goto out_handle;
> +
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> if (IS_SYNC(inode))
> ext4_handle_sync(handle);
> - if (ret >= 0)
> - ext4_update_inode_fsync_trans(handle, inode, 1);
>
> -out_stop:
> +out_handle:
> ext4_journal_stop(handle);
> -out_mmap:
> +out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out_mutex:
> +out:
> inode_unlock(inode);
> return ret;
> }
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 08/10] ext4: factor out ext4_do_fallocate()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (6 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 07/10] ext4: refactor ext4_insert_range() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-18 10:18 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 09/10] ext4: move out inode_lock into ext4_fallocate() Zhang Yi
2024-12-16 1:39 ` [PATCH v4 10/10] ext4: move out common parts " Zhang Yi
9 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
Now the real job of normal fallocate are open coded in ext4_fallocate(),
factor out a new helper ext4_do_fallocate() to do the real job, like
others functions (e.g. ext4_zero_range()) in ext4_fallocate() do, this
can make the code more clear, no functional changes.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 125 ++++++++++++++++++++++------------------------
1 file changed, 60 insertions(+), 65 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index be44dd7aacdb..a8bbbf8a9950 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4689,6 +4689,58 @@ static long ext4_zero_range(struct file *file, loff_t offset,
return ret;
}
+static long ext4_do_fallocate(struct file *file, loff_t offset,
+ loff_t len, int mode)
+{
+ struct inode *inode = file_inode(file);
+ loff_t end = offset + len;
+ loff_t new_size = 0;
+ ext4_lblk_t start_lblk, len_lblk;
+ int ret;
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
+
+ inode_lock(inode);
+
+ /* We only support preallocation for extent-based files only. */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
+
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
+ ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ if (ret)
+ goto out;
+
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
+ }
+out:
+ inode_unlock(inode);
+ trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
+ return ret;
+}
+
/*
* preallocate space for a file. This implements ext4's fallocate file
* operation, which gets called from sys_fallocate system call.
@@ -4699,12 +4751,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- loff_t new_size = 0;
- unsigned int max_blocks;
- int ret = 0;
- int flags;
- ext4_lblk_t lblk;
- unsigned int blkbits = inode->i_blkbits;
+ int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@@ -4726,71 +4773,19 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
ret = ext4_convert_inline_data(inode);
inode_unlock(inode);
if (ret)
- goto exit;
+ return ret;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (mode & FALLOC_FL_PUNCH_HOLE)
ret = ext4_punch_hole(file, offset, len);
- goto exit;
- }
-
- if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ else if (mode & FALLOC_FL_COLLAPSE_RANGE)
ret = ext4_collapse_range(file, offset, len);
- goto exit;
- }
-
- if (mode & FALLOC_FL_INSERT_RANGE) {
+ else if (mode & FALLOC_FL_INSERT_RANGE)
ret = ext4_insert_range(file, offset, len);
- goto exit;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
+ else if (mode & FALLOC_FL_ZERO_RANGE)
ret = ext4_zero_range(file, offset, len, mode);
- goto exit;
- }
- trace_ext4_fallocate_enter(inode, offset, len, mode);
- lblk = offset >> blkbits;
-
- max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- inode_lock(inode);
-
- /*
- * We only support preallocation for extent-based files only
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto out;
- }
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out;
-
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- if (ret)
- goto out;
+ else
+ ret = ext4_do_fallocate(file, offset, len, mode);
- if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
- }
-out:
- inode_unlock(inode);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-exit:
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 08/10] ext4: factor out ext4_do_fallocate()
2024-12-16 1:39 ` [PATCH v4 08/10] ext4: factor out ext4_do_fallocate() Zhang Yi
@ 2024-12-18 10:18 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:18 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:13AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Now the real job of normal fallocate are open coded in ext4_fallocate(),
> factor out a new helper ext4_do_fallocate() to do the real job, like
> others functions (e.g. ext4_zero_range()) in ext4_fallocate() do, this
> can make the code more clear, no functional changes.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
Looks good, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> ---
> fs/ext4/extents.c | 125 ++++++++++++++++++++++------------------------
> 1 file changed, 60 insertions(+), 65 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index be44dd7aacdb..a8bbbf8a9950 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4689,6 +4689,58 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> return ret;
> }
>
> +static long ext4_do_fallocate(struct file *file, loff_t offset,
> + loff_t len, int mode)
> +{
> + struct inode *inode = file_inode(file);
> + loff_t end = offset + len;
> + loff_t new_size = 0;
> + ext4_lblk_t start_lblk, len_lblk;
> + int ret;
> +
> + trace_ext4_fallocate_enter(inode, offset, len, mode);
> +
> + start_lblk = offset >> inode->i_blkbits;
> + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
> +
> + inode_lock(inode);
> +
> + /* We only support preallocation for extent-based files only. */
> + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
> +
> + if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
> + new_size = end;
> + ret = inode_newsize_ok(inode, new_size);
> + if (ret)
> + goto out;
> + }
> +
> + /* Wait all existing dio workers, newcomers will block on i_rwsem */
> + inode_dio_wait(inode);
> +
> + ret = file_modified(file);
> + if (ret)
> + goto out;
> +
> + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
> + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> + if (ret)
> + goto out;
> +
> + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
> + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
> + EXT4_I(inode)->i_sync_tid);
> + }
> +out:
> + inode_unlock(inode);
> + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
> + return ret;
> +}
> +
> /*
> * preallocate space for a file. This implements ext4's fallocate file
> * operation, which gets called from sys_fallocate system call.
> @@ -4699,12 +4751,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> {
> struct inode *inode = file_inode(file);
> - loff_t new_size = 0;
> - unsigned int max_blocks;
> - int ret = 0;
> - int flags;
> - ext4_lblk_t lblk;
> - unsigned int blkbits = inode->i_blkbits;
> + int ret;
>
> /*
> * Encrypted inodes can't handle collapse range or insert
> @@ -4726,71 +4773,19 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> ret = ext4_convert_inline_data(inode);
> inode_unlock(inode);
> if (ret)
> - goto exit;
> + return ret;
>
> - if (mode & FALLOC_FL_PUNCH_HOLE) {
> + if (mode & FALLOC_FL_PUNCH_HOLE)
> ret = ext4_punch_hole(file, offset, len);
> - goto exit;
> - }
> -
> - if (mode & FALLOC_FL_COLLAPSE_RANGE) {
> + else if (mode & FALLOC_FL_COLLAPSE_RANGE)
> ret = ext4_collapse_range(file, offset, len);
> - goto exit;
> - }
> -
> - if (mode & FALLOC_FL_INSERT_RANGE) {
> + else if (mode & FALLOC_FL_INSERT_RANGE)
> ret = ext4_insert_range(file, offset, len);
> - goto exit;
> - }
> -
> - if (mode & FALLOC_FL_ZERO_RANGE) {
> + else if (mode & FALLOC_FL_ZERO_RANGE)
> ret = ext4_zero_range(file, offset, len, mode);
> - goto exit;
> - }
> - trace_ext4_fallocate_enter(inode, offset, len, mode);
> - lblk = offset >> blkbits;
> -
> - max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
> - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> -
> - inode_lock(inode);
> -
> - /*
> - * We only support preallocation for extent-based files only
> - */
> - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> - ret = -EOPNOTSUPP;
> - goto out;
> - }
> -
> - if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> - (offset + len > inode->i_size ||
> - offset + len > EXT4_I(inode)->i_disksize)) {
> - new_size = offset + len;
> - ret = inode_newsize_ok(inode, new_size);
> - if (ret)
> - goto out;
> - }
> -
> - /* Wait all existing dio workers, newcomers will block on i_rwsem */
> - inode_dio_wait(inode);
> -
> - ret = file_modified(file);
> - if (ret)
> - goto out;
> -
> - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
> - if (ret)
> - goto out;
> + else
> + ret = ext4_do_fallocate(file, offset, len, mode);
>
> - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
> - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
> - EXT4_I(inode)->i_sync_tid);
> - }
> -out:
> - inode_unlock(inode);
> - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
> -exit:
> return ret;
> }
>
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 09/10] ext4: move out inode_lock into ext4_fallocate()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (7 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 08/10] ext4: factor out ext4_do_fallocate() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-18 10:19 ` Ojaswin Mujoo
2024-12-16 1:39 ` [PATCH v4 10/10] ext4: move out common parts " Zhang Yi
9 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
Currently, all five sub-functions of ext4_fallocate() acquire the
inode's i_rwsem at the beginning and release it before exiting. This
process can be simplified by factoring out the management of i_rwsem
into the ext4_fallocate() function.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 90 +++++++++++++++--------------------------------
fs/ext4/inode.c | 13 +++----
2 files changed, 33 insertions(+), 70 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a8bbbf8a9950..85f0de1abe78 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4578,23 +4578,18 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
- inode_lock(inode);
-
- /*
- * Indirect files do not support unwritten extents
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out;
- }
+ /* Indirect files do not support unwritten extents */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out;
+ return ret;
}
/* Wait all existing dio workers, newcomers will block on i_rwsem */
@@ -4602,7 +4597,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = file_modified(file);
if (ret)
- goto out;
+ return ret;
/*
* Prevent page faults from reinstantiating pages we have released
@@ -4684,8 +4679,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ext4_journal_stop(handle);
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out:
- inode_unlock(inode);
return ret;
}
@@ -4699,12 +4692,11 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
int ret;
trace_ext4_fallocate_enter(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
start_lblk = offset >> inode->i_blkbits;
len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
- inode_lock(inode);
-
/* We only support preallocation for extent-based files only. */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
@@ -4736,7 +4728,6 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
EXT4_I(inode)->i_sync_tid);
}
out:
- inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@@ -4771,9 +4762,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
inode_lock(inode);
ret = ext4_convert_inline_data(inode);
- inode_unlock(inode);
if (ret)
- return ret;
+ goto out_inode_lock;
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = ext4_punch_hole(file, offset, len);
@@ -4785,7 +4775,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
ret = ext4_zero_range(file, offset, len, mode);
else
ret = ext4_do_fallocate(file, offset, len, mode);
-
+out_inode_lock:
+ inode_unlock(inode);
return ret;
}
@@ -5295,36 +5286,27 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
int ret;
trace_ext4_collapse_range(inode, offset, len);
-
- inode_lock(inode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return -EOPNOTSUPP;
/* Collapse range works only on fs cluster size aligned regions. */
- if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
- ret = -EINVAL;
- goto out;
- }
-
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
+ return -EINVAL;
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (end >= inode->i_size) {
- ret = -EINVAL;
- goto out;
- }
+ if (end >= inode->i_size)
+ return -EINVAL;
/* Wait for existing dio to complete */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
- goto out;
+ return ret;
/*
* Prevent page faults from reinstantiating pages we have released from
@@ -5399,8 +5381,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
ext4_journal_stop(handle);
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out:
- inode_unlock(inode);
return ret;
}
@@ -5426,39 +5406,27 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
loff_t start;
trace_ext4_insert_range(inode, offset, len);
-
- inode_lock(inode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return -EOPNOTSUPP;
/* Insert range works only on fs cluster size aligned regions. */
- if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
- ret = -EINVAL;
- goto out;
- }
-
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
+ return -EINVAL;
/* Offset must be less than i_size */
- if (offset >= inode->i_size) {
- ret = -EINVAL;
- goto out;
- }
-
+ if (offset >= inode->i_size)
+ return -EINVAL;
/* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out;
- }
+ if (len > inode->i_sb->s_maxbytes - inode->i_size)
+ return -EFBIG;
/* Wait for existing dio to complete */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
- goto out;
+ return ret;
/*
* Prevent page faults from reinstantiating pages we have released from
@@ -5559,8 +5527,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ext4_journal_stop(handle);
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out:
- inode_unlock(inode);
return ret;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7720d3700b27..2e1f070ab449 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4014,15 +4014,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
-
- inode_lock(inode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- goto out;
+ return 0;
/*
* If the hole extends beyond i_size, set the hole to end after
@@ -4042,7 +4041,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out;
+ return ret;
}
/* Wait all existing dio workers, newcomers will block on i_rwsem */
@@ -4050,7 +4049,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = file_modified(file);
if (ret)
- goto out;
+ return ret;
/*
* Prevent page faults from reinstantiating pages we have released from
@@ -4126,8 +4125,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ext4_journal_stop(handle);
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
-out:
- inode_unlock(inode);
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 09/10] ext4: move out inode_lock into ext4_fallocate()
2024-12-16 1:39 ` [PATCH v4 09/10] ext4: move out inode_lock into ext4_fallocate() Zhang Yi
@ 2024-12-18 10:19 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:19 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:14AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Currently, all five sub-functions of ext4_fallocate() acquire the
> inode's i_rwsem at the beginning and release it before exiting. This
> process can be simplified by factoring out the management of i_rwsem
> into the ext4_fallocate() function.
>
Looks good, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
> fs/ext4/extents.c | 90 +++++++++++++++--------------------------------
> fs/ext4/inode.c | 13 +++----
> 2 files changed, 33 insertions(+), 70 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index a8bbbf8a9950..85f0de1abe78 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4578,23 +4578,18 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> int ret, flags, credits;
>
> trace_ext4_zero_range(inode, offset, len, mode);
> + WARN_ON_ONCE(!inode_is_locked(inode));
>
> - inode_lock(inode);
> -
> - /*
> - * Indirect files do not support unwritten extents
> - */
> - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> - ret = -EOPNOTSUPP;
> - goto out;
> - }
> + /* Indirect files do not support unwritten extents */
> + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
> + return -EOPNOTSUPP;
>
> if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
> new_size = end;
> ret = inode_newsize_ok(inode, new_size);
> if (ret)
> - goto out;
> + return ret;
> }
>
> /* Wait all existing dio workers, newcomers will block on i_rwsem */
> @@ -4602,7 +4597,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>
> ret = file_modified(file);
> if (ret)
> - goto out;
> + return ret;
>
> /*
> * Prevent page faults from reinstantiating pages we have released
> @@ -4684,8 +4679,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> ext4_journal_stop(handle);
> out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out:
> - inode_unlock(inode);
> return ret;
> }
>
> @@ -4699,12 +4692,11 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
> int ret;
>
> trace_ext4_fallocate_enter(inode, offset, len, mode);
> + WARN_ON_ONCE(!inode_is_locked(inode));
>
> start_lblk = offset >> inode->i_blkbits;
> len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
>
> - inode_lock(inode);
> -
> /* We only support preallocation for extent-based files only. */
> if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> ret = -EOPNOTSUPP;
> @@ -4736,7 +4728,6 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
> EXT4_I(inode)->i_sync_tid);
> }
> out:
> - inode_unlock(inode);
> trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
> return ret;
> }
> @@ -4771,9 +4762,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>
> inode_lock(inode);
> ret = ext4_convert_inline_data(inode);
> - inode_unlock(inode);
> if (ret)
> - return ret;
> + goto out_inode_lock;
>
> if (mode & FALLOC_FL_PUNCH_HOLE)
> ret = ext4_punch_hole(file, offset, len);
> @@ -4785,7 +4775,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> ret = ext4_zero_range(file, offset, len, mode);
> else
> ret = ext4_do_fallocate(file, offset, len, mode);
> -
> +out_inode_lock:
> + inode_unlock(inode);
> return ret;
> }
>
> @@ -5295,36 +5286,27 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> int ret;
>
> trace_ext4_collapse_range(inode, offset, len);
> -
> - inode_lock(inode);
> + WARN_ON_ONCE(!inode_is_locked(inode));
>
> /* Currently just for extent based files */
> - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
> - ret = -EOPNOTSUPP;
> - goto out;
> - }
> -
> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> + return -EOPNOTSUPP;
> /* Collapse range works only on fs cluster size aligned regions. */
> - if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
> - ret = -EINVAL;
> - goto out;
> - }
> -
> + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
> + return -EINVAL;
> /*
> * There is no need to overlap collapse range with EOF, in which case
> * it is effectively a truncate operation
> */
> - if (end >= inode->i_size) {
> - ret = -EINVAL;
> - goto out;
> - }
> + if (end >= inode->i_size)
> + return -EINVAL;
>
> /* Wait for existing dio to complete */
> inode_dio_wait(inode);
>
> ret = file_modified(file);
> if (ret)
> - goto out;
> + return ret;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -5399,8 +5381,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> ext4_journal_stop(handle);
> out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out:
> - inode_unlock(inode);
> return ret;
> }
>
> @@ -5426,39 +5406,27 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> loff_t start;
>
> trace_ext4_insert_range(inode, offset, len);
> -
> - inode_lock(inode);
> + WARN_ON_ONCE(!inode_is_locked(inode));
>
> /* Currently just for extent based files */
> - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
> - ret = -EOPNOTSUPP;
> - goto out;
> - }
> -
> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> + return -EOPNOTSUPP;
> /* Insert range works only on fs cluster size aligned regions. */
> - if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) {
> - ret = -EINVAL;
> - goto out;
> - }
> -
> + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
> + return -EINVAL;
> /* Offset must be less than i_size */
> - if (offset >= inode->i_size) {
> - ret = -EINVAL;
> - goto out;
> - }
> -
> + if (offset >= inode->i_size)
> + return -EINVAL;
> /* Check whether the maximum file size would be exceeded */
> - if (len > inode->i_sb->s_maxbytes - inode->i_size) {
> - ret = -EFBIG;
> - goto out;
> - }
> + if (len > inode->i_sb->s_maxbytes - inode->i_size)
> + return -EFBIG;
>
> /* Wait for existing dio to complete */
> inode_dio_wait(inode);
>
> ret = file_modified(file);
> if (ret)
> - goto out;
> + return ret;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -5559,8 +5527,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> ext4_journal_stop(handle);
> out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out:
> - inode_unlock(inode);
> return ret;
> }
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 7720d3700b27..2e1f070ab449 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4014,15 +4014,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> loff_t end = offset + length;
> handle_t *handle;
> unsigned int credits;
> - int ret = 0;
> + int ret;
>
> trace_ext4_punch_hole(inode, offset, length, 0);
> -
> - inode_lock(inode);
> + WARN_ON_ONCE(!inode_is_locked(inode));
>
> /* No need to punch hole beyond i_size */
> if (offset >= inode->i_size)
> - goto out;
> + return 0;
>
> /*
> * If the hole extends beyond i_size, set the hole to end after
> @@ -4042,7 +4041,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
> ret = ext4_inode_attach_jinode(inode);
> if (ret < 0)
> - goto out;
> + return ret;
> }
>
> /* Wait all existing dio workers, newcomers will block on i_rwsem */
> @@ -4050,7 +4049,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> ret = file_modified(file);
> if (ret)
> - goto out;
> + return ret;
>
> /*
> * Prevent page faults from reinstantiating pages we have released from
> @@ -4126,8 +4125,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> ext4_journal_stop(handle);
> out_invalidate_lock:
> filemap_invalidate_unlock(mapping);
> -out:
> - inode_unlock(inode);
> return ret;
> }
>
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH v4 10/10] ext4: move out common parts into ext4_fallocate()
2024-12-16 1:39 [PATCH v4 00/10] ext4: clean up and refactor fallocate Zhang Yi
` (8 preceding siblings ...)
2024-12-16 1:39 ` [PATCH v4 09/10] ext4: move out inode_lock into ext4_fallocate() Zhang Yi
@ 2024-12-16 1:39 ` Zhang Yi
2024-12-18 10:20 ` Ojaswin Mujoo
9 siblings, 1 reply; 35+ messages in thread
From: Zhang Yi @ 2024-12-16 1:39 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack,
yi.zhang, yi.zhang, chengzhihao1, yukuai3, yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
Currently, all zeroing ranges, punch holes, collapse ranges, and insert
ranges first wait for all existing direct I/O workers to complete, and
then they acquire the mapping's invalidate lock before performing the
actual work. These common components are nearly identical, so we can
simplify the code by factoring them out into the ext4_fallocate().
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 124 ++++++++++++++++------------------------------
fs/ext4/inode.c | 25 ++--------
2 files changed, 45 insertions(+), 104 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 85f0de1abe78..1b028be19193 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4568,7 +4568,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
loff_t new_size = 0;
loff_t end = offset + len;
@@ -4592,23 +4591,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- return ret;
-
- /*
- * Prevent page faults from reinstantiating pages we have released
- * from page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_invalidate_lock;
-
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
/* Preallocate the range including the unaligned edges */
if (!IS_ALIGNED(offset | end, blocksize)) {
@@ -4618,17 +4600,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
new_size, flags);
if (ret)
- goto out_invalidate_lock;
+ return ret;
}
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret)
- goto out_invalidate_lock;
+ return ret;
/* Now release the pages and zero block aligned part of pages */
ret = ext4_truncate_page_cache_block_range(inode, offset, end);
if (ret)
- goto out_invalidate_lock;
+ return ret;
/* Zero range excluding the unaligned edges */
start_lblk = EXT4_B_TO_LBLK(inode, offset);
@@ -4640,11 +4622,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
new_size, flags);
if (ret)
- goto out_invalidate_lock;
+ return ret;
}
/* Finish zeroing out if it doesn't contain partial block */
if (IS_ALIGNED(offset | end, blocksize))
- goto out_invalidate_lock;
+ return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4657,7 +4639,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_invalidate_lock;
+ return ret;
}
/* Zero out partial block at the edges of the range */
@@ -4677,8 +4659,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
out_handle:
ext4_journal_stop(handle);
-out_invalidate_lock:
- filemap_invalidate_unlock(mapping);
return ret;
}
@@ -4711,13 +4691,6 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
goto out;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out;
-
ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
if (ret)
@@ -4742,6 +4715,7 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
int ret;
/*
@@ -4765,6 +4739,29 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
goto out_inode_lock;
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
+
+ ret = file_modified(file);
+ if (ret)
+ return ret;
+
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
+ ret = ext4_do_fallocate(file, offset, len, mode);
+ goto out_inode_lock;
+ }
+
+ /*
+ * Follow-up operations will drop page cache, hold invalidate lock
+ * to prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ filemap_invalidate_lock(mapping);
+
+ ret = ext4_break_layouts(inode);
+ if (ret)
+ goto out_invalidate_lock;
+
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = ext4_punch_hole(file, offset, len);
else if (mode & FALLOC_FL_COLLAPSE_RANGE)
@@ -4774,7 +4771,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
else if (mode & FALLOC_FL_ZERO_RANGE)
ret = ext4_zero_range(file, offset, len, mode);
else
- ret = ext4_do_fallocate(file, offset, len, mode);
+ ret = -EOPNOTSUPP;
+
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
out_inode_lock:
inode_unlock(inode);
return ret;
@@ -5301,23 +5301,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
if (end >= inode->i_size)
return -EINVAL;
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- return ret;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_invalidate_lock;
-
/*
* Write tail of the last page before removed range and data that
* will be shifted since they will get removed from the page cache
@@ -5331,16 +5314,15 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
if (!ret)
ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_invalidate_lock;
+ return ret;
truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_invalidate_lock;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
start_lblk = offset >> inode->i_blkbits;
@@ -5379,8 +5361,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
out_handle:
ext4_journal_stop(handle);
-out_invalidate_lock:
- filemap_invalidate_unlock(mapping);
return ret;
}
@@ -5421,23 +5401,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (len > inode->i_sb->s_maxbytes - inode->i_size)
return -EFBIG;
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- return ret;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_invalidate_lock;
-
/*
* Write out all dirty pages. Need to round down to align start offset
* to page size boundary for page size > block size.
@@ -5445,16 +5408,15 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
start = round_down(offset, PAGE_SIZE);
ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_invalidate_lock;
+ return ret;
truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_invalidate_lock;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
@@ -5525,8 +5487,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
out_handle:
ext4_journal_stop(handle);
-out_invalidate_lock:
- filemap_invalidate_unlock(mapping);
return ret;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2e1f070ab449..2677a2cace58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4009,7 +4009,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t start_lblk, end_lblk;
- struct address_space *mapping = inode->i_mapping;
loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
loff_t end = offset + length;
handle_t *handle;
@@ -4044,31 +4043,15 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- return ret;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_invalidate_lock;
ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_invalidate_lock;
+ return ret;
/* Now release the pages and zero block aligned part of pages*/
ret = ext4_truncate_page_cache_block_range(inode, offset, end);
if (ret)
- goto out_invalidate_lock;
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -4078,7 +4061,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_invalidate_lock;
+ return ret;
}
ret = ext4_zero_partial_blocks(handle, inode, offset, length);
@@ -4123,8 +4106,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_invalidate_lock:
- filemap_invalidate_unlock(mapping);
return ret;
}
--
2.46.1
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH v4 10/10] ext4: move out common parts into ext4_fallocate()
2024-12-16 1:39 ` [PATCH v4 10/10] ext4: move out common parts " Zhang Yi
@ 2024-12-18 10:20 ` Ojaswin Mujoo
0 siblings, 0 replies; 35+ messages in thread
From: Ojaswin Mujoo @ 2024-12-18 10:20 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, yi.zhang, chengzhihao1, yukuai3, yangerkun
On Mon, Dec 16, 2024 at 09:39:15AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> Currently, all zeroing ranges, punch holes, collapse ranges, and insert
> ranges first wait for all existing direct I/O workers to complete, and
> then they acquire the mapping's invalidate lock before performing the
> actual work. These common components are nearly identical, so we can
> simplify the code by factoring them out into the ext4_fallocate().
>
I really like the long due cleanup of fallocate paths. Thanks for
taking it up! Feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
> fs/ext4/extents.c | 124 ++++++++++++++++------------------------------
> fs/ext4/inode.c | 25 ++--------
> 2 files changed, 45 insertions(+), 104 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 85f0de1abe78..1b028be19193 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4568,7 +4568,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> loff_t len, int mode)
> {
> struct inode *inode = file_inode(file);
> - struct address_space *mapping = file->f_mapping;
> handle_t *handle = NULL;
> loff_t new_size = 0;
> loff_t end = offset + len;
> @@ -4592,23 +4591,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> return ret;
> }
>
> - /* Wait all existing dio workers, newcomers will block on i_rwsem */
> - inode_dio_wait(inode);
> -
> - ret = file_modified(file);
> - if (ret)
> - return ret;
> -
> - /*
> - * Prevent page faults from reinstantiating pages we have released
> - * from page cache.
> - */
> - filemap_invalidate_lock(mapping);
> -
> - ret = ext4_break_layouts(inode);
> - if (ret)
> - goto out_invalidate_lock;
> -
> flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> /* Preallocate the range including the unaligned edges */
> if (!IS_ALIGNED(offset | end, blocksize)) {
> @@ -4618,17 +4600,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
> new_size, flags);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
> }
>
> ret = ext4_update_disksize_before_punch(inode, offset, len);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
>
> /* Now release the pages and zero block aligned part of pages */
> ret = ext4_truncate_page_cache_block_range(inode, offset, end);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
>
> /* Zero range excluding the unaligned edges */
> start_lblk = EXT4_B_TO_LBLK(inode, offset);
> @@ -4640,11 +4622,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
> new_size, flags);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
> }
> /* Finish zeroing out if it doesn't contain partial block */
> if (IS_ALIGNED(offset | end, blocksize))
> - goto out_invalidate_lock;
> + return ret;
>
> /*
> * In worst case we have to writeout two nonadjacent unwritten
> @@ -4657,7 +4639,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> ext4_std_error(inode->i_sb, ret);
> - goto out_invalidate_lock;
> + return ret;
> }
>
> /* Zero out partial block at the edges of the range */
> @@ -4677,8 +4659,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>
> out_handle:
> ext4_journal_stop(handle);
> -out_invalidate_lock:
> - filemap_invalidate_unlock(mapping);
> return ret;
> }
>
> @@ -4711,13 +4691,6 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
> goto out;
> }
>
> - /* Wait all existing dio workers, newcomers will block on i_rwsem */
> - inode_dio_wait(inode);
> -
> - ret = file_modified(file);
> - if (ret)
> - goto out;
> -
> ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
> EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> if (ret)
> @@ -4742,6 +4715,7 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
> long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> {
> struct inode *inode = file_inode(file);
> + struct address_space *mapping = file->f_mapping;
> int ret;
>
> /*
> @@ -4765,6 +4739,29 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> if (ret)
> goto out_inode_lock;
>
> + /* Wait all existing dio workers, newcomers will block on i_rwsem */
> + inode_dio_wait(inode);
> +
> + ret = file_modified(file);
> + if (ret)
> + return ret;
> +
> + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
> + ret = ext4_do_fallocate(file, offset, len, mode);
> + goto out_inode_lock;
> + }
> +
> + /*
> + * Follow-up operations will drop page cache, hold invalidate lock
> + * to prevent page faults from reinstantiating pages we have
> + * released from page cache.
> + */
> + filemap_invalidate_lock(mapping);
> +
> + ret = ext4_break_layouts(inode);
> + if (ret)
> + goto out_invalidate_lock;
> +
> if (mode & FALLOC_FL_PUNCH_HOLE)
> ret = ext4_punch_hole(file, offset, len);
> else if (mode & FALLOC_FL_COLLAPSE_RANGE)
> @@ -4774,7 +4771,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> else if (mode & FALLOC_FL_ZERO_RANGE)
> ret = ext4_zero_range(file, offset, len, mode);
> else
> - ret = ext4_do_fallocate(file, offset, len, mode);
> + ret = -EOPNOTSUPP;
> +
> +out_invalidate_lock:
> + filemap_invalidate_unlock(mapping);
> out_inode_lock:
> inode_unlock(inode);
> return ret;
> @@ -5301,23 +5301,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> if (end >= inode->i_size)
> return -EINVAL;
>
> - /* Wait for existing dio to complete */
> - inode_dio_wait(inode);
> -
> - ret = file_modified(file);
> - if (ret)
> - return ret;
> -
> - /*
> - * Prevent page faults from reinstantiating pages we have released from
> - * page cache.
> - */
> - filemap_invalidate_lock(mapping);
> -
> - ret = ext4_break_layouts(inode);
> - if (ret)
> - goto out_invalidate_lock;
> -
> /*
> * Write tail of the last page before removed range and data that
> * will be shifted since they will get removed from the page cache
> @@ -5331,16 +5314,15 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> if (!ret)
> ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
>
> truncate_pagecache(inode, start);
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out_invalidate_lock;
> - }
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
> +
> ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
>
> start_lblk = offset >> inode->i_blkbits;
> @@ -5379,8 +5361,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>
> out_handle:
> ext4_journal_stop(handle);
> -out_invalidate_lock:
> - filemap_invalidate_unlock(mapping);
> return ret;
> }
>
> @@ -5421,23 +5401,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> if (len > inode->i_sb->s_maxbytes - inode->i_size)
> return -EFBIG;
>
> - /* Wait for existing dio to complete */
> - inode_dio_wait(inode);
> -
> - ret = file_modified(file);
> - if (ret)
> - return ret;
> -
> - /*
> - * Prevent page faults from reinstantiating pages we have released from
> - * page cache.
> - */
> - filemap_invalidate_lock(mapping);
> -
> - ret = ext4_break_layouts(inode);
> - if (ret)
> - goto out_invalidate_lock;
> -
> /*
> * Write out all dirty pages. Need to round down to align start offset
> * to page size boundary for page size > block size.
> @@ -5445,16 +5408,15 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
> start = round_down(offset, PAGE_SIZE);
> ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
>
> truncate_pagecache(inode, start);
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out_invalidate_lock;
> - }
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
> +
> ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
>
> /* Expand file to avoid data loss if there is error while shifting */
> @@ -5525,8 +5487,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
>
> out_handle:
> ext4_journal_stop(handle);
> -out_invalidate_lock:
> - filemap_invalidate_unlock(mapping);
> return ret;
> }
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 2e1f070ab449..2677a2cace58 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4009,7 +4009,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> struct inode *inode = file_inode(file);
> struct super_block *sb = inode->i_sb;
> ext4_lblk_t start_lblk, end_lblk;
> - struct address_space *mapping = inode->i_mapping;
> loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
> loff_t end = offset + length;
> handle_t *handle;
> @@ -4044,31 +4043,15 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> return ret;
> }
>
> - /* Wait all existing dio workers, newcomers will block on i_rwsem */
> - inode_dio_wait(inode);
> -
> - ret = file_modified(file);
> - if (ret)
> - return ret;
> -
> - /*
> - * Prevent page faults from reinstantiating pages we have released from
> - * page cache.
> - */
> - filemap_invalidate_lock(mapping);
> -
> - ret = ext4_break_layouts(inode);
> - if (ret)
> - goto out_invalidate_lock;
>
> ret = ext4_update_disksize_before_punch(inode, offset, length);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
>
> /* Now release the pages and zero block aligned part of pages*/
> ret = ext4_truncate_page_cache_block_range(inode, offset, end);
> if (ret)
> - goto out_invalidate_lock;
> + return ret;
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> credits = ext4_writepage_trans_blocks(inode);
> @@ -4078,7 +4061,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> ext4_std_error(sb, ret);
> - goto out_invalidate_lock;
> + return ret;
> }
>
> ret = ext4_zero_partial_blocks(handle, inode, offset, length);
> @@ -4123,8 +4106,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
> ext4_handle_sync(handle);
> out_handle:
> ext4_journal_stop(handle);
> -out_invalidate_lock:
> - filemap_invalidate_unlock(mapping);
> return ret;
> }
>
> --
> 2.46.1
>
^ permalink raw reply [flat|nested] 35+ messages in thread