* [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode
@ 2026-06-19 9:36 Harshal Chavan
2026-06-22 20:04 ` Gabriel Krisman Bertazi
0 siblings, 1 reply; 3+ messages in thread
From: Harshal Chavan @ 2026-06-19 9:36 UTC (permalink / raw)
To: io-uring, axboe
Cc: krisman, gregkh, kees, gustavoars, linux-kernel, linux-hardening,
Harshal Chavan
Currently, if an application wants to duplicate registered file
descriptors from one io_uring instance to another, it must manually
unregister and re-register them, incurring unnecessary overhead.
Add IORING_REGISTER_CLONE_FILES to allow direct cloning of the file
table from a source ring to a destination ring. This implementation
strictly mirrors the io_clone_buffers UAPI, supporting partial offsets
and the IORING_REGISTER_DST_REPLACE flag.
To ensure lock synchronization safety, destination nodes are strictly
allocated as new, private io_rsrc_nodes rather than sharing references
across rings.
Signed-off-by: Harshal Chavan <harshal24.chavan@gmail.com>
---
v4:
- Updated Signed-off-by to use real name and moved above the scissors line (Greg KH).
v3:
- Rewrote the cloning loop to allocate private destination nodes via io_rsrc_node_alloc to fix non-atomic ref lock synchronization (Jens).
- Maintained partial offset/copy support to mirror io_clone_buffers UAPI (Jens).
- Gated the replacement free check on ctx->file_table.data.nr (Gabriel).
- Prevented self-cloning by checking ctx == src_ctx (Gabriel).
- Removed submitter_task check to allow cross-thread pooling setups (Gabriel).
v2:
- Dropped unrelated whitespace formatting changes from v1
---
include/uapi/linux/io_uring.h | 12 +++
io_uring/register.c | 6 ++
io_uring/rsrc.c | 149 ++++++++++++++++++++++++++++++++++
io_uring/rsrc.h | 1 +
4 files changed, 168 insertions(+)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 909fb7aea638..0727602ce12f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -723,6 +723,9 @@ enum io_uring_register_op {
/* register bpf filtering programs */
IORING_REGISTER_BPF_FILTER = 37,
+ /* clone file descriptors from another ring*/
+ IORING_REGISTER_CLONE_FILES = 38,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -854,6 +857,15 @@ struct io_uring_clone_buffers {
__u32 pad[3];
};
+struct io_uring_clone_files {
+ __u32 src_fd;
+ __u32 flags;
+ __u32 src_off;
+ __u32 dst_off;
+ __u32 nr;
+ __u32 pad[3];
+};
+
struct io_uring_buf {
__u64 addr;
__u32 len;
diff --git a/io_uring/register.c b/io_uring/register.c
index dce5e2f9cf77..bbc8c506ea2d 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -924,6 +924,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_clone_buffers(ctx, arg);
break;
+ case IORING_REGISTER_CLONE_FILES:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_clone_files(ctx, arg);
+ break;
case IORING_REGISTER_ZCRX_IFQ:
ret = -EINVAL;
if (!arg || nr_args != 1)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6..a598e5af4c0a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1303,6 +1303,155 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
return ret;
}
+static int io_clone_file_node(struct io_ring_ctx *ctx,
+ struct io_rsrc_node *src_node,
+ int dst_index,
+ struct io_file_table *new_table)
+{
+ struct io_rsrc_node *dst_node;
+ struct file *file;
+
+ dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+ if (!dst_node)
+ return -ENOMEM;
+
+ file = io_slot_file(src_node);
+ get_file(file);
+ io_fixed_file_set(dst_node, file);
+
+ new_table->data.nodes[dst_index] = dst_node;
+ io_file_bitmap_set(new_table, dst_index);
+
+ return 0;
+}
+
+static int io_clone_files(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
+ struct io_uring_clone_files *arg)
+{
+ struct io_file_table new_file_table;
+ unsigned int dst_nr = ctx->file_table.data.nr;
+ unsigned int src_nr = src_ctx->file_table.data.nr;
+ unsigned int new_nr, i;
+
+ lockdep_assert_held(&ctx->uring_lock);
+ lockdep_assert_held(&src_ctx->uring_lock);
+
+ if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
+ return -EINVAL;
+
+ if (dst_nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
+ return -EBUSY;
+
+ if (!src_nr)
+ return -ENXIO;
+
+ if (!arg->nr)
+ arg->nr = src_nr;
+ else if (arg->nr > src_nr)
+ return -EINVAL;
+
+ if (check_add_overflow(arg->src_off, arg->nr, &i) || i > src_nr)
+ return -EINVAL;
+ if (check_add_overflow(arg->dst_off, arg->nr, &i))
+ return -EINVAL;
+
+ new_nr = max(dst_nr, arg->dst_off + arg->nr);
+ if (new_nr > IORING_MAX_FIXED_FILES)
+ return -EINVAL;
+
+ memset(&new_file_table, 0, sizeof(new_file_table));
+ if (!io_alloc_file_tables(ctx, &new_file_table, new_nr))
+ return -ENOMEM;
+
+ /* Copy original nodes from before the cloned range */
+ for (i = 0; i < min(arg->dst_off, dst_nr); i++) {
+ struct io_rsrc_node *src_node = io_rsrc_node_lookup(&ctx->file_table.data, i);
+
+ if (!src_node)
+ continue;
+ if (io_clone_file_node(ctx, src_node, i, &new_file_table))
+ goto out;
+ }
+
+ /* Copy the actual cloned range from the source ring */
+ for (i = 0; i < arg->nr; i++) {
+ struct io_rsrc_node *src_node = io_rsrc_node_lookup(&src_ctx->file_table.data,
+ arg->src_off + i);
+
+ if (!src_node)
+ continue;
+ if (io_clone_file_node(ctx, src_node, arg->dst_off + i, &new_file_table))
+ goto out;
+ }
+
+ /* Copy original nodes from after the cloned range */
+ for (i = arg->dst_off + arg->nr; i < dst_nr; i++) {
+ struct io_rsrc_node *src_node = io_rsrc_node_lookup(&ctx->file_table.data, i);
+
+ if (!src_node)
+ continue;
+ if (io_clone_file_node(ctx, src_node, i, &new_file_table))
+ goto out;
+ }
+
+ /* free the old file table if there is any data present */
+ if (dst_nr)
+ io_free_file_tables(ctx, &ctx->file_table);
+
+ WARN_ON_ONCE(ctx->file_table.data.nr);
+ ctx->file_table = new_file_table;
+ io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
+ return 0;
+
+out:
+ /* Error Path: Safely destroy whatever we partially built */
+ io_free_file_tables(ctx, &new_file_table);
+ return -ENOMEM;
+}
+
+int io_register_clone_files(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_clone_files clone_arg;
+ struct io_ring_ctx *src_ctx;
+ bool registered_src;
+ struct file *file;
+ int ret;
+
+ if (copy_from_user(&clone_arg, arg, sizeof(clone_arg)))
+ return -EFAULT;
+ if (clone_arg.flags &
+ ~(IORING_REGISTER_SRC_REGISTERED | IORING_REGISTER_DST_REPLACE))
+ return -EINVAL;
+
+ if (memchr_inv(clone_arg.pad, 0, sizeof(clone_arg.pad)))
+ return -EINVAL;
+
+ registered_src = (clone_arg.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
+ file = io_uring_ctx_get_file(clone_arg.src_fd, registered_src);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ src_ctx = file->private_data;
+ /* Same ring clone is not allowed */
+ if (src_ctx == ctx) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_unlock(&ctx->uring_lock);
+ lock_two_rings(ctx, src_ctx);
+
+ ret = io_clone_files(ctx, src_ctx, &clone_arg);
+
+out:
+ if (src_ctx != ctx)
+ mutex_unlock(&src_ctx->uring_lock);
+
+ if (!registered_src)
+ fput(file);
+ return ret;
+}
+
void io_vec_free(struct iou_vec *iv)
{
if (!iv->iovec)
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 44e3386f7c1c..32f5c47c46af 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -75,6 +75,7 @@ int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
const struct iovec __user *uvec, size_t uvec_segs);
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_clone_files(struct io_ring_ctx *ctx, void __user *arg);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags);
--
2.54.0
^ permalink raw reply related [flat|nested] 3+ messages in thread* Re: [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode
2026-06-19 9:36 [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode Harshal Chavan
@ 2026-06-22 20:04 ` Gabriel Krisman Bertazi
2026-06-23 16:48 ` Harshal Chavan
0 siblings, 1 reply; 3+ messages in thread
From: Gabriel Krisman Bertazi @ 2026-06-22 20:04 UTC (permalink / raw)
To: Harshal Chavan, io-uring, axboe
Cc: gregkh, kees, gustavoars, linux-kernel, linux-hardening,
Harshal Chavan
Harshal Chavan <harshal24.chavan@gmail.com> writes:
> Currently, if an application wants to duplicate registered file
> descriptors from one io_uring instance to another, it must manually
> unregister and re-register them, incurring unnecessary overhead.
>
> Add IORING_REGISTER_CLONE_FILES to allow direct cloning of the file
> table from a source ring to a destination ring. This implementation
> strictly mirrors the io_clone_buffers UAPI, supporting partial offsets
> and the IORING_REGISTER_DST_REPLACE flag.
>
> To ensure lock synchronization safety, destination nodes are strictly
> allocated as new, private io_rsrc_nodes rather than sharing references
> across rings.
>
> Signed-off-by: Harshal Chavan <harshal24.chavan@gmail.com>
Hello,
Do you have the liburing side and test cases?
A few comments inline.
> ---
> include/uapi/linux/io_uring.h | 12 +++
> io_uring/register.c | 6 ++
> io_uring/rsrc.c | 149 ++++++++++++++++++++++++++++++++++
> io_uring/rsrc.h | 1 +
> 4 files changed, 168 insertions(+)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 909fb7aea638..0727602ce12f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -723,6 +723,9 @@ enum io_uring_register_op {
> /* register bpf filtering programs */
> IORING_REGISTER_BPF_FILTER = 37,
>
> + /* clone file descriptors from another ring*/
^ spacing
> + IORING_REGISTER_CLONE_FILES = 38,
> +
> /* this goes last */
> IORING_REGISTER_LAST,
>
> @@ -854,6 +857,15 @@ struct io_uring_clone_buffers {
> __u32 pad[3];
> };
>
> +struct io_uring_clone_files {
> + __u32 src_fd;
> + __u32 flags;
> + __u32 src_off;
> + __u32 dst_off;
> + __u32 nr;
> + __u32 pad[3];
> +};
> +
> struct io_uring_buf {
> __u64 addr;
> __u32 len;
> diff --git a/io_uring/register.c b/io_uring/register.c
> index dce5e2f9cf77..bbc8c506ea2d 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -924,6 +924,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> break;
> ret = io_register_clone_buffers(ctx, arg);
> break;
> + case IORING_REGISTER_CLONE_FILES:
> + ret = -EINVAL;
> + if (!arg || nr_args != 1)
> + break;
> + ret = io_register_clone_files(ctx, arg);
> + break;
> case IORING_REGISTER_ZCRX_IFQ:
> ret = -EINVAL;
> if (!arg || nr_args != 1)
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 650303626be6..a598e5af4c0a 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1303,6 +1303,155 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
> return ret;
> }
>
> +static int io_clone_file_node(struct io_ring_ctx *ctx,
> + struct io_rsrc_node *src_node,
> + int dst_index,
> + struct io_file_table *new_table)
> +{
> + struct io_rsrc_node *dst_node;
> + struct file *file;
> +
> + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> + if (!dst_node)
> + return -ENOMEM;
> +
> + file = io_slot_file(src_node);
> + get_file(file);
> + io_fixed_file_set(dst_node, file);
> +
> + new_table->data.nodes[dst_index] = dst_node;
> + io_file_bitmap_set(new_table, dst_index);
> +
> + return 0;
> +}
> +
> +static int io_clone_files(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
> + struct io_uring_clone_files *arg)
> +{
> + struct io_file_table new_file_table;
> + unsigned int dst_nr = ctx->file_table.data.nr;
> + unsigned int src_nr = src_ctx->file_table.data.nr;
> + unsigned int new_nr, i;
> +
> + lockdep_assert_held(&ctx->uring_lock);
> + lockdep_assert_held(&src_ctx->uring_lock);
> +
> + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
> + return -EINVAL;
I don't think it makes sense to check ->user here. But is mm_account
necessary either? How could you get the src_ctx from another process?
> +
> + if (dst_nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
> + return -EBUSY;
> +
> + if (!src_nr)
> + return -ENXIO;
> +
> + if (!arg->nr)
> + arg->nr = src_nr;
> + else if (arg->nr > src_nr)
> + return -EINVAL;
> +
> + if (check_add_overflow(arg->src_off, arg->nr, &i) || i > src_nr)
> + return -EINVAL;
> + if (check_add_overflow(arg->dst_off, arg->nr, &i))
> + return -EINVAL;
> +
> + new_nr = max(dst_nr, arg->dst_off + arg->nr);
> + if (new_nr > IORING_MAX_FIXED_FILES)
> + return -EINVAL;
> +
> + memset(&new_file_table, 0, sizeof(new_file_table));
> + if (!io_alloc_file_tables(ctx, &new_file_table, new_nr))
> + return -ENOMEM;
> +
> + /* Copy original nodes from before the cloned range */
> + for (i = 0; i < min(arg->dst_off, dst_nr); i++) {
> + struct io_rsrc_node *src_node = io_rsrc_node_lookup(&ctx->file_table.data, i);
> +
> + if (!src_node)
> + continue;
> + if (io_clone_file_node(ctx, src_node, i, &new_file_table))
> + goto out;
> + }
> +
> + /* Copy the actual cloned range from the source ring */
> + for (i = 0; i < arg->nr; i++) {
> + struct io_rsrc_node *src_node = io_rsrc_node_lookup(&src_ctx->file_table.data,
> + arg->src_off + i);
> +
> + if (!src_node)
> + continue;
> + if (io_clone_file_node(ctx, src_node, arg->dst_off + i, &new_file_table))
> + goto out;
> + }
> +
> + /* Copy original nodes from after the cloned range */
> + for (i = arg->dst_off + arg->nr; i < dst_nr; i++) {
> + struct io_rsrc_node *src_node = io_rsrc_node_lookup(&ctx->file_table.data, i);
> +
> + if (!src_node)
> + continue;
> + if (io_clone_file_node(ctx, src_node, i, &new_file_table))
> + goto out;
> + }
> +
> + /* free the old file table if there is any data present */
> + if (dst_nr)
> + io_free_file_tables(ctx, &ctx->file_table);
> +
> + WARN_ON_ONCE(ctx->file_table.data.nr);
> + ctx->file_table = new_file_table;
> + io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
> + return 0;
> +
> +out:
> + /* Error Path: Safely destroy whatever we partially built */
> + io_free_file_tables(ctx, &new_file_table);
> + return -ENOMEM;
> +}
> +
> +int io_register_clone_files(struct io_ring_ctx *ctx, void __user *arg)
> +{
> + struct io_uring_clone_files clone_arg;
> + struct io_ring_ctx *src_ctx;
> + bool registered_src;
> + struct file *file;
> + int ret;
> +
> + if (copy_from_user(&clone_arg, arg, sizeof(clone_arg)))
> + return -EFAULT;
> + if (clone_arg.flags &
> + ~(IORING_REGISTER_SRC_REGISTERED | IORING_REGISTER_DST_REPLACE))
> + return -EINVAL;
> +
> + if (memchr_inv(clone_arg.pad, 0, sizeof(clone_arg.pad)))
> + return -EINVAL;
> +
> + registered_src = (clone_arg.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
This is better written as
registered_src = !!(clone_arg.flags & IORING_REGISTER_SRC_REGISTERED);
> + file = io_uring_ctx_get_file(clone_arg.src_fd, registered_src);
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> + src_ctx = file->private_data;
> + /* Same ring clone is not allowed */
> + if (src_ctx == ctx) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + mutex_unlock(&ctx->uring_lock);
> + lock_two_rings(ctx, src_ctx);
> +
> + ret = io_clone_files(ctx, src_ctx, &clone_arg);
> +
> +out:
> + if (src_ctx != ctx)
> + mutex_unlock(&src_ctx->uring_lock);
Make the mutex_unlock unconditionally above the out label. It is never
locked in the error context.
--
Gabriel Krisman Bertazi
^ permalink raw reply [flat|nested] 3+ messages in thread* Re: [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode
2026-06-22 20:04 ` Gabriel Krisman Bertazi
@ 2026-06-23 16:48 ` Harshal Chavan
0 siblings, 0 replies; 3+ messages in thread
From: Harshal Chavan @ 2026-06-23 16:48 UTC (permalink / raw)
To: krisman
Cc: axboe, gregkh, gustavoars, harshal24.chavan, io-uring, kees,
linux-hardening, linux-kernel
Gabriel Krisman Bertazi @ 2026-06-22 20:04 UTC writes:
>Hello,
>
>Do you have the liburing side and test cases?
>
>A few comments inline.
Hello,
Yes I will update the liburing side with helper function
and add appropriate test cases.
>> + /* clone file descriptors from another ring*/
> ^ spacing
Fixed in v5
>> + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
>> + return -EINVAL;
>
>I don't think it makes sense to check ->user here. But is mm_account
>necessary either? How could you get the src_ctx from another process?
Yes, Keeping this check would unnecessarily break valid use case like
Root user passing FDs to guest user
Removed it completely in v5, thanks for catching this!.
>> + registered_src = (clone_arg.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
>
>This is better written as
>
>registered_src = !!(clone_arg.flags & IORING_REGISTER_SRC_REGISTERED);
Understood, updated this in v5
>> +out:
>> + if (src_ctx != ctx)
>> + mutex_unlock(&src_ctx->uring_lock);
>
>Make the mutex_unlock unconditionally above the out label. It is never
>locked in the error context.
Yes, moved the unlock statement before out without any conditions.
Thank you for the review.
Regards,
Harshal Chavan
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-06-23 16:48 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-19 9:36 [PATCH v4] io_uring/register: add IORING_REGISTER_CLONE_FILES opcode Harshal Chavan
2026-06-22 20:04 ` Gabriel Krisman Bertazi
2026-06-23 16:48 ` Harshal Chavan
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.