All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gabriel Krisman Bertazi <krisman@suse.de>
To: Harshal Chavan <harshal24.chavan@gmail.com>,
	io-uring@vger.kernel.org, axboe@kernel.dk
Cc: gregkh@linuxfoundation.org, kees@kernel.org,
	gustavoars@kernel.org, linux-kernel@vger.kernel.org,
	linux-hardening@vger.kernel.org,
	Harshal Chavan <harshal24.chavan@gmail.com>
Subject: Re: [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode
Date: Mon, 22 Jun 2026 16:04:27 -0400	[thread overview]
Message-ID: <871pdyxrxw.fsf@mailhost.krisman.be> (raw)
In-Reply-To: <20260619093641.25339-1-harshal24.chavan@gmail.com>

Harshal Chavan <harshal24.chavan@gmail.com> writes:

> Currently, if an application wants to duplicate registered file
> descriptors from one io_uring instance to another, it must manually
> unregister and re-register them, incurring unnecessary overhead.
>
> Add IORING_REGISTER_CLONE_FILES to allow direct cloning of the file
> table from a source ring to a destination ring. This implementation
> strictly mirrors the io_clone_buffers UAPI, supporting partial offsets
> and the IORING_REGISTER_DST_REPLACE flag.
>
> To ensure lock synchronization safety, destination nodes are strictly
> allocated as new, private io_rsrc_nodes rather than sharing references
> across rings.
>
> Signed-off-by: Harshal Chavan <harshal24.chavan@gmail.com>

Hello,

Do you have the liburing side and test cases?

A few comments inline.

> ---
>  include/uapi/linux/io_uring.h |  12 +++
>  io_uring/register.c           |   6 ++
>  io_uring/rsrc.c               | 149 ++++++++++++++++++++++++++++++++++
>  io_uring/rsrc.h               |   1 +
>  4 files changed, 168 insertions(+)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 909fb7aea638..0727602ce12f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -723,6 +723,9 @@ enum io_uring_register_op {
>  	/* register bpf filtering programs */
>  	IORING_REGISTER_BPF_FILTER		= 37,
>  
> +	/* clone file descriptors from another ring*/
                                                   ^ spacing

> +	IORING_REGISTER_CLONE_FILES		= 38,
> +
>  	/* this goes last */
>  	IORING_REGISTER_LAST,
>  
> @@ -854,6 +857,15 @@ struct io_uring_clone_buffers {
>  	__u32	pad[3];
>  };
>  
> +struct io_uring_clone_files {
> +	__u32 src_fd;
> +	__u32 flags;
> +	__u32 src_off;
> +	__u32 dst_off;
> +	__u32 nr;
> +	__u32 pad[3];
> +};
> +
>  struct io_uring_buf {
>  	__u64	addr;
>  	__u32	len;
> diff --git a/io_uring/register.c b/io_uring/register.c
> index dce5e2f9cf77..bbc8c506ea2d 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -924,6 +924,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>  			break;
>  		ret = io_register_clone_buffers(ctx, arg);
>  		break;
> +	case IORING_REGISTER_CLONE_FILES:
> +		ret = -EINVAL;
> +		if (!arg || nr_args != 1)
> +			break;
> +		ret = io_register_clone_files(ctx, arg);
> +		break;
>  	case IORING_REGISTER_ZCRX_IFQ:
>  		ret = -EINVAL;
>  		if (!arg || nr_args != 1)
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 650303626be6..a598e5af4c0a 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1303,6 +1303,155 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
>  	return ret;
>  }
>  
> +static int io_clone_file_node(struct io_ring_ctx *ctx,
> +			      struct io_rsrc_node *src_node,
> +			      int dst_index,
> +			      struct io_file_table *new_table)
> +{
> +	struct io_rsrc_node *dst_node;
> +	struct file *file;
> +
> +	dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> +	if (!dst_node)
> +		return -ENOMEM;
> +
> +	file = io_slot_file(src_node);
> +	get_file(file);
> +	io_fixed_file_set(dst_node, file);
> +
> +	new_table->data.nodes[dst_index] = dst_node;
> +	io_file_bitmap_set(new_table, dst_index);
> +
> +	return 0;
> +}
> +
> +static int io_clone_files(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
> +			  struct io_uring_clone_files *arg)
> +{
> +	struct io_file_table new_file_table;
> +	unsigned int dst_nr = ctx->file_table.data.nr;
> +	unsigned int src_nr = src_ctx->file_table.data.nr;
> +	unsigned int new_nr, i;
> +
> +	lockdep_assert_held(&ctx->uring_lock);
> +	lockdep_assert_held(&src_ctx->uring_lock);
> +
> +	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
> +		return -EINVAL;

I don't think it makes sense to check ->user here.  But is mm_account
necessary either?  How could you get the src_ctx from another process?

> +
> +	if (dst_nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
> +		return -EBUSY;
> +
> +	if (!src_nr)
> +		return -ENXIO;
> +
> +	if (!arg->nr)
> +		arg->nr = src_nr;
> +	else if (arg->nr > src_nr)
> +		return -EINVAL;
> +
> +	if (check_add_overflow(arg->src_off, arg->nr, &i) || i > src_nr)
> +		return -EINVAL;
> +	if (check_add_overflow(arg->dst_off, arg->nr, &i))
> +		return -EINVAL;
> +
> +	new_nr = max(dst_nr, arg->dst_off + arg->nr);
> +	if (new_nr > IORING_MAX_FIXED_FILES)
> +		return -EINVAL;
> +
> +	memset(&new_file_table, 0, sizeof(new_file_table));
> +	if (!io_alloc_file_tables(ctx, &new_file_table, new_nr))
> +		return -ENOMEM;
> +
> +	/* Copy original nodes from before the cloned range */
> +	for (i = 0; i < min(arg->dst_off, dst_nr); i++) {
> +		struct io_rsrc_node *src_node = io_rsrc_node_lookup(&ctx->file_table.data, i);
> +
> +		if (!src_node)
> +			continue;
> +		if (io_clone_file_node(ctx, src_node, i, &new_file_table))
> +			goto out;
> +	}
> +
> +	/* Copy the actual cloned range from the source ring */
> +	for (i = 0; i < arg->nr; i++) {
> +		struct io_rsrc_node *src_node = io_rsrc_node_lookup(&src_ctx->file_table.data,
> +				arg->src_off + i);
> +
> +		if (!src_node)
> +			continue;
> +		if (io_clone_file_node(ctx, src_node, arg->dst_off + i, &new_file_table))
> +			goto out;
> +	}
> +
> +	/* Copy original nodes from after the cloned range */
> +	for (i = arg->dst_off + arg->nr; i < dst_nr; i++) {
> +		struct io_rsrc_node *src_node = io_rsrc_node_lookup(&ctx->file_table.data, i);
> +
> +		if (!src_node)
> +			continue;
> +		if (io_clone_file_node(ctx, src_node, i, &new_file_table))
> +			goto out;
> +	}
> +
> +	/* free the old file table if there is any data present */
> +	if (dst_nr)
> +		io_free_file_tables(ctx, &ctx->file_table);
> +
> +	WARN_ON_ONCE(ctx->file_table.data.nr);
> +	ctx->file_table = new_file_table;
> +	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
> +	return 0;
> +
> +out:
> +	/* Error Path: Safely destroy whatever we partially built */
> +	io_free_file_tables(ctx, &new_file_table);
> +	return -ENOMEM;
> +}
> +
> +int io_register_clone_files(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_clone_files clone_arg;
> +	struct io_ring_ctx *src_ctx;
> +	bool registered_src;
> +	struct file *file;
> +	int ret;
> +
> +	if (copy_from_user(&clone_arg, arg, sizeof(clone_arg)))
> +		return -EFAULT;
> +	if (clone_arg.flags &
> +	    ~(IORING_REGISTER_SRC_REGISTERED | IORING_REGISTER_DST_REPLACE))
> +		return -EINVAL;
> +
> +	if (memchr_inv(clone_arg.pad, 0, sizeof(clone_arg.pad)))
> +		return -EINVAL;
> +
> +	registered_src = (clone_arg.flags & IORING_REGISTER_SRC_REGISTERED) != 0;

This is better written as

registered_src = !!(clone_arg.flags & IORING_REGISTER_SRC_REGISTERED);

> +	file = io_uring_ctx_get_file(clone_arg.src_fd, registered_src);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	src_ctx = file->private_data;
> +	/* Same ring clone is not allowed */
> +	if (src_ctx == ctx) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	mutex_unlock(&ctx->uring_lock);
> +	lock_two_rings(ctx, src_ctx);
> +
> +	ret = io_clone_files(ctx, src_ctx, &clone_arg);
> +
> +out:
> +	if (src_ctx != ctx)
> +		mutex_unlock(&src_ctx->uring_lock);

Make the mutex_unlock unconditionally above the out label.  It is never
locked in the error context.

-- 
Gabriel Krisman Bertazi

      reply	other threads:[~2026-06-22 20:04 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-19  9:36 [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode Harshal Chavan
2026-06-22 20:04 ` Gabriel Krisman Bertazi [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=871pdyxrxw.fsf@mailhost.krisman.be \
    --to=krisman@suse.de \
    --cc=axboe@kernel.dk \
    --cc=gregkh@linuxfoundation.org \
    --cc=gustavoars@kernel.org \
    --cc=harshal24.chavan@gmail.com \
    --cc=io-uring@vger.kernel.org \
    --cc=kees@kernel.org \
    --cc=linux-hardening@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.