From: David Sterba <dsterba@suse.cz>
To: Mark Harmstone <maharmstone@fb.com>
Cc: linux-btrfs@vger.kernel.org, io-uring@vger.kernel.org
Subject: Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
Date: Mon, 21 Oct 2024 15:50:05 +0200 [thread overview]
Message-ID: <20241021135005.GC17835@twin.jikos.cz> (raw)
In-Reply-To: <20241014171838.304953-6-maharmstone@fb.com>
On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote:
> Adds an io_uring command for encoded reads, using the same interface as
Add ...
> the existing BTRFS_IOC_ENCODED_READ ioctl.
This is probably a good summary in a changelog but the patch is quite
long so it feels like this should be described in a more detail how it's
done. Connecting two interfaces can be done in various ways, so at least
mention that it's a simple pass through, or if there are any
complications regardign locking, object lifetime and such.
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
> fs/btrfs/file.c | 1 +
> fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/ioctl.h | 1 +
> 3 files changed, 285 insertions(+)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 2aeb8116549c..e33ca73fef8c 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -3774,6 +3774,7 @@ const struct file_operations btrfs_file_operations = {
> .compat_ioctl = btrfs_compat_ioctl,
> #endif
> .remap_file_range = btrfs_remap_file_range,
> + .uring_cmd = btrfs_uring_cmd,
> .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
> };
>
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 8c9ff4898ab0..c0393575cf5e 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -29,6 +29,7 @@
> #include <linux/fileattr.h>
> #include <linux/fsverity.h>
> #include <linux/sched/xacct.h>
> +#include <linux/io_uring/cmd.h>
> #include "ctree.h"
> #include "disk-io.h"
> #include "export.h"
> @@ -4723,6 +4724,288 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
> return ret;
> }
>
> +struct btrfs_uring_priv {
> + struct io_uring_cmd *cmd;
> + struct page **pages;
> + unsigned long nr_pages;
> + struct kiocb iocb;
> + struct iovec *iov;
> + struct iov_iter iter;
> + struct extent_state *cached_state;
> + u64 count;
> + bool compressed;
This leaves a 7 byte hole.
> + u64 start;
> + u64 lockend;
> +};
The whole structure should be documented and the members too if it's not
obvious what they are used for.
> +
> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
> + unsigned int issue_flags)
> +{
> + struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu;
> + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + unsigned long i;
Why is this long?
> + u64 cur;
> + size_t page_offset;
> + ssize_t ret;
> +
> + if (priv->compressed) {
> + i = 0;
> + page_offset = 0;
> + } else {
> + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
> + page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1);
Please don't open code page_offset()
> + }
> + cur = 0;
> + while (cur < priv->count) {
> + size_t bytes = min_t(size_t, priv->count - cur,
> + PAGE_SIZE - page_offset);
> +
> + if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
> + &priv->iter) != bytes) {
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + i++;
> + cur += bytes;
> + page_offset = 0;
> + }
> + ret = priv->count;
> +
> +out:
> + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +
> + io_uring_cmd_done(cmd, ret, 0, issue_flags);
> + add_rchar(current, ret);
> +
> + for (unsigned long i = 0; i < priv->nr_pages; i++) {
Shadowing 'i' of the same type as is declared in the function. Maybe
don't call it just 'i' but index as it's used outside of a loop.
> + __free_page(priv->pages[i]);
> + }
Please drop the outer { } for a single statement block.
> +
> + kfree(priv->pages);
> + kfree(priv->iov);
> + kfree(priv);
> +}
> +
> +static void btrfs_uring_read_extent_cb(void *ctx, int err)
> +{
> + struct btrfs_uring_priv *priv = ctx;
> +
> + *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;
Isn't there a helper for that? Type casting should be done in justified
cases and as an exception.
> + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
> +}
> +
> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
> + u64 start, u64 lockend,
> + struct extent_state *cached_state,
> + u64 disk_bytenr, u64 disk_io_size,
> + size_t count, bool compressed,
> + struct iovec *iov,
> + struct io_uring_cmd *cmd)
> +{
> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + struct page **pages;
> + struct btrfs_uring_priv *priv = NULL;
> + unsigned long nr_pages;
> + int ret;
> +
> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> + if (!pages)
> + return -ENOMEM;
> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
The allocation sizes are derived from disk_io_size that comes from the
outside, potentially making large allocatoins. Or is there some inherent
limit on the maximu size?
> + if (ret) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + priv = kmalloc(sizeof(*priv), GFP_NOFS);
> + if (!priv) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + priv->iocb = *iocb;
> + priv->iov = iov;
> + priv->iter = *iter;
> + priv->count = count;
> + priv->cmd = cmd;
> + priv->cached_state = cached_state;
> + priv->compressed = compressed;
> + priv->nr_pages = nr_pages;
> + priv->pages = pages;
> + priv->start = start;
> + priv->lockend = lockend;
> +
> + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> + disk_io_size, pages,
> + btrfs_uring_read_extent_cb,
> + priv);
> + if (ret)
> + goto fail;
> +
> + return -EIOCBQUEUED;
> +
> +fail:
> + unlock_extent(io_tree, start, lockend, &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + kfree(priv);
Does this leak pages and priv->pages?
> + return ret;
> +}
> +
> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> + unsigned int issue_flags)
> +{
> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> + flags);
> + size_t copy_end;
> + struct btrfs_ioctl_encoded_io_args args = {0};
= { 0 }
> + int ret;
> + u64 disk_bytenr, disk_io_size;
> + struct file *file = cmd->file;
> + struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> + struct btrfs_fs_info *fs_info = inode->root->fs_info;
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + struct iovec iovstack[UIO_FASTIOV];
> + struct iovec *iov = iovstack;
> + struct iov_iter iter;
> + loff_t pos;
> + struct kiocb kiocb;
> + struct extent_state *cached_state = NULL;
> + u64 start, lockend;
The stack consumption looks quite high.
> +
> + if (!capable(CAP_SYS_ADMIN)) {
> + ret = -EPERM;
> + goto out_acct;
> + }
> +
> + if (issue_flags & IO_URING_F_COMPAT) {
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> + struct btrfs_ioctl_encoded_io_args_32 args32;
> +
> + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
> + flags);
> + if (copy_from_user(&args32, (const void *)cmd->sqe->addr,
> + copy_end)) {
> + ret = -EFAULT;
> + goto out_acct;
> + }
> + args.iov = compat_ptr(args32.iov);
> + args.iovcnt = args32.iovcnt;
> + args.offset = args32.offset;
> + args.flags = args32.flags;
> +#else
> + return -ENOTTY;
> +#endif
> + } else {
> + copy_end = copy_end_kernel;
> + if (copy_from_user(&args, (const void *)cmd->sqe->addr,
> + copy_end)) {
> + ret = -EFAULT;
> + goto out_acct;
> + }
> + }
> +
> + if (args.flags != 0)
> + return -EINVAL;
> +
> + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
> + &iov, &iter);
> + if (ret < 0)
> + goto out_acct;
> +
> + if (iov_iter_count(&iter) == 0) {
> + ret = 0;
> + goto out_free;
> + }
> +
> + pos = args.offset;
> + ret = rw_verify_area(READ, file, &pos, args.len);
> + if (ret < 0)
> + goto out_free;
> +
> + init_sync_kiocb(&kiocb, file);
> + kiocb.ki_pos = pos;
> +
> + start = ALIGN_DOWN(pos, fs_info->sectorsize);
> + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> +
> + ret = btrfs_encoded_read(&kiocb, &iter, &args,
> + issue_flags & IO_URING_F_NONBLOCK,
> + &cached_state, &disk_bytenr, &disk_io_size);
> + if (ret < 0 && ret != -EIOCBQUEUED)
> + goto out_free;
> +
> + file_accessed(file);
> +
> + if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
> + (char *)&args + copy_end_kernel,
So many type casts again
> + sizeof(args) - copy_end_kernel)) {
> + if (ret == -EIOCBQUEUED) {
> + unlock_extent(io_tree, start, lockend, &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + }
> + ret = -EFAULT;
> + goto out_free;
> + }
> +
> + if (ret == -EIOCBQUEUED) {
> + u64 count;
> +
> + /* If we've optimized things by storing the iovecs on the stack,
> + * undo this. */
This is not proper comment formatting.
> + if (!iov) {
> + iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
> + GFP_NOFS);
> + if (!iov) {
> + unlock_extent(io_tree, start, lockend,
> + &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + ret = -ENOMEM;
> + goto out_acct;
> + }
> +
> + memcpy(iov, iovstack,
> + sizeof(struct iovec) * args.iovcnt);
> + }
> +
> + count = min_t(u64, iov_iter_count(&iter), disk_io_size);
> +
> + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
> + cached_state, disk_bytenr,
> + disk_io_size, count,
> + args.compression, iov, cmd);
> +
> + goto out_acct;
> + }
> +
> +out_free:
> + kfree(iov);
> +
> +out_acct:
> + if (ret > 0)
> + add_rchar(current, ret);
> + inc_syscr(current);
> +
> + return ret;
> +}
> +
> +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
> +{
> + switch (cmd->cmd_op) {
> + case BTRFS_IOC_ENCODED_READ:
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> + case BTRFS_IOC_ENCODED_READ_32:
> +#endif
> + return btrfs_uring_encoded_read(cmd, issue_flags);
> + }
> +
> + return -EINVAL;
> +}
> +
> long btrfs_ioctl(struct file *file, unsigned int
> cmd, unsigned long arg)
> {
next prev parent reply other threads:[~2024-10-21 13:50 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
2024-10-14 17:18 ` [PATCH 1/5] btrfs: remove pointless addition in btrfs_encoded_read Mark Harmstone
2024-10-14 17:18 ` [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback Mark Harmstone
2024-10-15 15:23 ` David Sterba
2024-10-21 13:21 ` David Sterba
2024-10-14 17:18 ` [PATCH 3/5] btrfs: change btrfs_encoded_read so that reading of extent is done by caller Mark Harmstone
2024-10-14 17:18 ` [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read Mark Harmstone
2024-10-14 22:12 ` Jens Axboe
2024-10-15 8:48 ` Mark Harmstone
2024-10-14 17:18 ` [PATCH 5/5] btrfs: add io_uring command for encoded reads Mark Harmstone
2024-10-21 13:50 ` David Sterba [this message]
2024-10-21 16:15 ` Pavel Begunkov
2024-10-21 17:05 ` Mark Harmstone
2024-10-21 18:23 ` David Sterba
2024-10-22 9:12 ` Mark Harmstone
2024-10-14 17:44 ` [PATCH v3 0/5] btrfs: encoded reads via io_uring Boris Burkov
2024-10-15 8:50 ` Mark Harmstone
-- strict thread matches above, loose matches on Subject: below --
2024-10-22 14:50 [PATCH v4 0/5] btrfs: io_uring interface for encoded reads Mark Harmstone
2024-10-22 14:50 ` [PATCH 5/5] btrfs: add io_uring command " Mark Harmstone
2024-10-29 21:51 ` David Sterba
2024-10-30 0:59 ` Pavel Begunkov
2024-10-30 1:24 ` David Sterba
2024-10-30 2:32 ` Pavel Begunkov
2024-10-31 17:08 ` Mark Harmstone
2024-10-31 18:26 ` Pavel Begunkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241021135005.GC17835@twin.jikos.cz \
--to=dsterba@suse.cz \
--cc=io-uring@vger.kernel.org \
--cc=linux-btrfs@vger.kernel.org \
--cc=maharmstone@fb.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox