linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <djwong@kernel.org>
To: John Garry <john.g.garry@oracle.com>
Cc: axboe@kernel.dk, tytso@mit.edu, dchinner@redhat.com,
	viro@zeniv.linux.org.uk, brauner@kernel.org, jack@suse.com,
	chandan.babu@oracle.com, hch@lst.de, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-btrfs@vger.kernel.org,
	linux-erofs@lists.ozlabs.org, linux-ext4@vger.kernel.org,
	linux-f2fs-devel@lists.sourceforge.net,
	linux-fsdevel@vger.kernel.org, gfs2@lists.linux.dev,
	linux-xfs@vger.kernel.org, catherine.hoang@oracle.com,
	ritesh.list@gmail.com, mcgrof@kernel.org,
	mikulas@artax.karlin.mff.cuni.cz, agruenba@redhat.com,
	miklos@szeredi.hu, martin.petersen@oracle.com
Subject: Re: [PATCH v4 02/22] iomap: Allow filesystems set IO block zeroing size
Date: Wed, 12 Jun 2024 14:32:35 -0700	[thread overview]
Message-ID: <20240612213235.GK2764752@frogsfrogsfrogs> (raw)
In-Reply-To: <20240607143919.2622319-3-john.g.garry@oracle.com>

On Fri, Jun 07, 2024 at 02:38:59PM +0000, John Garry wrote:
> Allow filesystems to set the io_block_size for sub-fs block size zeroing,
> as in future we will want to extend this feature to support zeroing of
> block sizes of larger than the inode block size.
> 
> The value in io_block_size does not have to be a power-of-2, so fix up
> zeroing code to handle that.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  block/fops.c          |  1 +
>  fs/btrfs/inode.c      |  1 +
>  fs/erofs/data.c       |  1 +
>  fs/erofs/zmap.c       |  1 +
>  fs/ext2/inode.c       |  1 +
>  fs/ext4/extents.c     |  1 +
>  fs/ext4/inode.c       |  1 +
>  fs/f2fs/data.c        |  1 +
>  fs/fuse/dax.c         |  1 +
>  fs/gfs2/bmap.c        |  1 +
>  fs/hpfs/file.c        |  1 +
>  fs/iomap/direct-io.c  | 23 +++++++++++++++++++----
>  fs/xfs/xfs_iomap.c    |  1 +
>  fs/zonefs/file.c      |  2 ++
>  include/linux/iomap.h |  2 ++
>  15 files changed, 35 insertions(+), 4 deletions(-)
> 
> diff --git a/block/fops.c b/block/fops.c
> index 9d6d86ebefb9..020443078630 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -402,6 +402,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  	iomap->addr = iomap->offset;
>  	iomap->length = isize - iomap->offset;
>  	iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
> +	iomap->io_block_size = i_blocksize(inode);
>  	return 0;
>  }
>  

<snip a bunch of filesystems setting io_block_size to i_blocksize>

> diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
> index 1bb8d97cd9ae..5d2718faf520 100644
> --- a/fs/hpfs/file.c
> +++ b/fs/hpfs/file.c
> @@ -149,6 +149,7 @@ static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  		iomap->addr = IOMAP_NULL_ADDR;
>  		iomap->length = 1 << blkbits;
>  	}
> +	iomap->io_block_size = i_blocksize(inode);

HPFS does iomap now?  Yikes.

>  
>  	hpfs_unlock(sb);
>  	return 0;
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..5be8d886ab4a 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  {
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	u64 io_block_size = iomap->io_block_size;

I wonder, should iomap be nice and not require filesystems to set
io_block_size themselves unless they really need it?  Anyone working on
an iomap port while this patchset is in progress may or may not remember
to add this bit if they get their port merged after atomicwrites is
merged; and you might not remember to prevent the bitrot if the reverse
order happens.

	u64 io_block_size = iomap->io_block_size ?: i_blocksize(inode);

>  	loff_t length = iomap_length(iter);
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
> @@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	int nr_pages, ret = 0;
>  	size_t copied = 0;
>  	size_t orig_count;
> +	unsigned int pad;
>  
>  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>  	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> @@ -355,7 +356,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  
>  	if (need_zeroout) {
>  		/* zero out from the start of the block to the write offset */
> -		pad = pos & (fs_block_size - 1);
> +		if (is_power_of_2(io_block_size)) {
> +			pad = pos & (io_block_size - 1);
> +		} else {
> +			loff_t _pos = pos;
> +
> +			pad = do_div(_pos, io_block_size);
> +		}

Please don't opencode this twice.

static unsigned int offset_in_block(loff_t pos, u64 blocksize)
{
	if (likely(is_power_of_2(blocksize)))
		return pos & (blocksize - 1);
	return do_div(pos, blocksize);
}

		pad = offset_in_block(pos, io_block_size);
		if (pad)
			...

Also, what happens if pos-pad points to a byte before the mapping?

> +
>  		if (pad)
>  			iomap_dio_zero(iter, dio, pos - pad, pad);
>  	}
> @@ -429,9 +437,16 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	if (need_zeroout ||
>  	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
>  		/* zero out from the end of the write to the end of the block */
> -		pad = pos & (fs_block_size - 1);
> +		if (is_power_of_2(io_block_size)) {
> +			pad = pos & (io_block_size - 1);
> +		} else {
> +			loff_t _pos = pos;
> +
> +			pad = do_div(_pos, io_block_size);
> +		}
> +
>  		if (pad)
> -			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
> +			iomap_dio_zero(iter, dio, pos, io_block_size - pad);

What if pos + io_block_size - pad points to a byte after the end of the
mapping?

>  	}
>  out:
>  	/* Undo iter limitation to current extent */
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 378342673925..ecb4cae88248 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -127,6 +127,7 @@ xfs_bmbt_to_iomap(
>  	}
>  	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
>  	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
> +	iomap->io_block_size = i_blocksize(VFS_I(ip));
>  	if (mapping_flags & IOMAP_DAX)
>  		iomap->dax_dev = target->bt_daxdev;
>  	else
> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
> index 3b103715acc9..bf2cc4bee309 100644
> --- a/fs/zonefs/file.c
> +++ b/fs/zonefs/file.c
> @@ -50,6 +50,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
>  		iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
>  		iomap->length = isize - iomap->offset;
>  	}
> +	iomap->io_block_size = i_blocksize(inode);
>  	mutex_unlock(&zi->i_truncate_mutex);
>  
>  	trace_zonefs_iomap_begin(inode, iomap);
> @@ -99,6 +100,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
>  		iomap->type = IOMAP_MAPPED;
>  		iomap->length = isize - iomap->offset;
>  	}
> +	iomap->io_block_size = i_blocksize(inode);
>  	mutex_unlock(&zi->i_truncate_mutex);
>  
>  	trace_zonefs_iomap_begin(inode, iomap);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6fc1c858013d..d63a35b77907 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -103,6 +103,8 @@ struct iomap {
>  	void			*private; /* filesystem private */
>  	const struct iomap_folio_ops *folio_ops;
>  	u64			validity_cookie; /* used with .iomap_valid() */
> +	/* io block zeroing size, not necessarily a power-of-2  */

size in bytes?

I'm not sure what "io block zeroing" means.  What are you trying to
accomplish here?  Let's say the fsblock size is 4k and the allocation
unit (aka the atomic write size) is 16k.  Userspace wants a direct write
to file offset 8192-12287, and that space is unwritten:

uuuu
  ^

Currently we'd just write the 4k and run the io completion handler, so
the final state is:

uuWu

Instead, if the fs sets io_block_size to 16384, does this direct write
now amplify into a full 16k write?  With the end result being:

ZZWZ

only.... I don't see the unwritten areas being converted to written?
I guess for an atomic write you'd require the user to write 0-16383?

<still confused about why we need to do this, maybe i'll figure it out
as I go along>

--D

> +	u64			io_block_size;
>  };
>  
>  static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
> -- 
> 2.31.1
> 
> 

  reply	other threads:[~2024-06-12 21:32 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-07 14:38 [PATCH v4 00/22] block atomic writes for xfs John Garry
2024-06-07 14:38 ` [PATCH v4 01/22] fs: Add generic_atomic_write_valid_size() John Garry
2024-06-12 21:10   ` Darrick J. Wong
2024-06-13  7:35     ` John Garry
2024-06-20 21:24       ` Darrick J. Wong
2024-06-07 14:38 ` [PATCH v4 02/22] iomap: Allow filesystems set IO block zeroing size John Garry
2024-06-12 21:32   ` Darrick J. Wong [this message]
2024-06-13 10:31     ` John Garry
2024-06-21 21:18       ` Darrick J. Wong
2024-06-24 13:58         ` John Garry
2024-06-07 14:39 ` [PATCH v4 03/22] xfs: Use extent size granularity for iomap->io_block_size John Garry
2024-06-12 21:47   ` Darrick J. Wong
2024-06-13 11:13     ` John Garry
2024-06-07 14:39 ` [PATCH v4 04/22] xfs: only allow minlen allocations when near ENOSPC John Garry
2024-06-07 14:39 ` [PATCH v4 05/22] xfs: always tail align maxlen allocations John Garry
2024-06-07 14:39 ` [PATCH v4 06/22] xfs: simplify extent allocation alignment John Garry
2024-06-07 14:39 ` [PATCH v4 07/22] xfs: make EOF allocation simpler John Garry
2024-06-07 14:39 ` [PATCH v4 08/22] xfs: introduce forced allocation alignment John Garry
2024-06-07 14:39 ` [PATCH v4 09/22] xfs: align args->minlen for " John Garry
2024-06-07 14:39 ` [PATCH v4 10/22] xfs: Introduce FORCEALIGN inode flag John Garry
2024-06-07 14:39 ` [PATCH v4 11/22] xfs: Do not free EOF blocks for forcealign John Garry
2024-06-07 14:39 ` [PATCH v4 12/22] xfs: Update xfs_inode_alloc_unitsize_fsb() " John Garry
2024-06-07 14:39 ` [PATCH v4 13/22] xfs: Unmap blocks according to forcealign John Garry
2024-06-11 10:08   ` John Garry
2024-06-07 14:39 ` [PATCH v4 14/22] xfs: Only free full extents for forcealign John Garry
2024-06-07 14:39 ` [PATCH v4 15/22] xfs: Don't revert allocated offset " John Garry
2024-06-07 14:39 ` [PATCH v4 16/22] xfs: Enable file data forcealign feature John Garry
2024-06-07 14:39 ` [PATCH v4 17/22] fs: Add FS_XFLAG_ATOMICWRITES flag John Garry
2024-06-07 14:39 ` [PATCH v4 18/22] iomap: Atomic write support John Garry
2024-06-07 14:39 ` [PATCH v4 19/22] xfs: Support FS_XFLAG_ATOMICWRITES for forcealign John Garry
2024-06-07 14:39 ` [PATCH v4 20/22] xfs: Support atomic write for statx John Garry
2024-06-07 14:39 ` [PATCH v4 21/22] xfs: Validate atomic writes John Garry
2024-06-07 14:39 ` [PATCH v4 22/22] xfs: Support setting FMODE_CAN_ATOMIC_WRITE John Garry

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240612213235.GK2764752@frogsfrogsfrogs \
    --to=djwong@kernel.org \
    --cc=agruenba@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=catherine.hoang@oracle.com \
    --cc=chandan.babu@oracle.com \
    --cc=dchinner@redhat.com \
    --cc=gfs2@lists.linux.dev \
    --cc=hch@lst.de \
    --cc=jack@suse.com \
    --cc=john.g.garry@oracle.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-erofs@lists.ozlabs.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-f2fs-devel@lists.sourceforge.net \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=mcgrof@kernel.org \
    --cc=miklos@szeredi.hu \
    --cc=mikulas@artax.karlin.mff.cuni.cz \
    --cc=ritesh.list@gmail.com \
    --cc=tytso@mit.edu \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).