All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@kernel.dk>
To: Christian Brauner <brauner@kernel.org>
Cc: io-uring@vger.kernel.org, jannh@google.com, kees@kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/7] io_uring: add support for BPF filtering for opcode restrictions
Date: Tue, 27 Jan 2026 09:41:00 -0700	[thread overview]
Message-ID: <269f6e6a-362f-4ccf-926f-2e2e0c1ea014@kernel.dk> (raw)
In-Reply-To: <20260127-lobgesang-lautsprecher-3805340711c8@brauner>

On 1/27/26 3:06 AM, Christian Brauner wrote:
>> +int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
>> +{
>> +	struct io_bpf_filter *filter;
>> +	struct io_uring_bpf_ctx bpf_ctx;
>> +	int ret;
>> +
>> +	/* Fast check for existence of filters outside of RCU */
>> +	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
>> +		return 0;
>> +
>> +	/*
>> +	 * req->opcode has already been validated to be within the range
>> +	 * of what we expect, io_init_req() does this.
>> +	 */
>> +	rcu_read_lock();
>> +	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
>> +	if (!filter) {
>> +		ret = 1;
>> +		goto out;
>> +	} else if (filter == &dummy_filter) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	io_uring_populate_bpf_ctx(&bpf_ctx, req);
>> +
>> +	/*
>> +	 * Iterate registered filters. The opcode is allowed IFF all filters
>> +	 * return 1. If any filter returns denied, opcode will be denied.
>> +	 */
>> +	do {
>> +		if (filter == &dummy_filter)
>> +			ret = 0;
>> +		else
>> +			ret = bpf_prog_run(filter->prog, &bpf_ctx);
>> +		if (!ret)
>> +			break;
>> +		filter = filter->next;
>> +	} while (filter);
>> +out:
>> +	rcu_read_unlock();
>> +	return ret ? 0 : -EACCES;
>> +}
> 
> Maybe we can write this a little nicer?:
> 
> int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
> {
> 	struct io_bpf_filter *filter;
> 	struct io_uring_bpf_ctx bpf_ctx;
> 
> 	/* Fast check for existence of filters outside of RCU */
> 	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
> 		return 0;
> 
> 	/*
> 	 * req->opcode has already been validated to be within the range
> 	 * of what we expect, io_init_req() does this.
> 	 */
> 	guard(rcu)();
> 	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
> 	if (!filter)
> 		return 0;
> 
> 	if (filter == &dummy_filter)
> 		return -EACCES;
> 
> 	io_uring_populate_bpf_ctx(&bpf_ctx, req);
> 
> 	/*
> 	 * Iterate registered filters. The opcode is allowed IFF all filters
> 	 * return 1. If any filter returns denied, opcode will be denied.
> 	 */
> 	for (; filter ; filter = filter->next) {
> 		int ret;
> 
> 		if (filter == &dummy_filter)
> 			return -EACCES;
> 
> 		ret = bpf_prog_run(filter->prog, &bpf_ctx);
> 		if (!ret)
> 			return -EACCES;
> 	}
> 
> 	return 0;
> }

Did a variant based on this, I agree it looks nicer with guard for this
one.

>> +static void io_free_bpf_filters(struct rcu_head *head)
>> +{
>> +	struct io_bpf_filter __rcu **filter;
>> +	struct io_bpf_filters *filters;
>> +	int i;
>> +
>> +	filters = container_of(head, struct io_bpf_filters, rcu_head);
>> +	spin_lock(&filters->lock);
>> +	filter = filters->filters;
>> +	if (!filter) {
>> +		spin_unlock(&filters->lock);
>> +		return;
>> +	}
>> +	spin_unlock(&filters->lock);
> 
> This is minor but I prefer:
> 
> 	scoped_guard(spinlock)(&filters->lock) {
> 		filters = container_of(head, struct io_bpf_filters, rcu_head);
> 		filter = filters->filters;
> 		if (!filter)
> 			return;
> 	}

Reason I tend to never do that is that I always have to grep for the
syntax... And case in point, you need to do:

	scoped_guard(spinlock, &filters->lock) {

so I guess I'm not the only one :-). But it does read better that way,
made this change too.

>> +static struct io_bpf_filters *io_new_bpf_filters(void)
>> +{
>> +	struct io_bpf_filters *filters;
>> +
>> +	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
>> +	if (!filters)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	filters->filters = kcalloc(IORING_OP_LAST,
>> +				   sizeof(struct io_bpf_filter *),
>> +				   GFP_KERNEL_ACCOUNT);
>> +	if (!filters->filters) {
>> +		kfree(filters);
>> +		return ERR_PTR(-ENOMEM);
>> +	}
>> +
>> +	refcount_set(&filters->refs, 1);
>> +	spin_lock_init(&filters->lock);
>> +	return filters;
>> +}
> 
> static struct io_bpf_filters *io_new_bpf_filters(void)
> {
> 	struct io_bpf_filters *filters __free(kfree) = NULL;
> 
> 	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
> 	if (!filters)
> 		return ERR_PTR(-ENOMEM);
> 
> 	filters->filters = kcalloc(IORING_OP_LAST,
> 				   sizeof(struct io_bpf_filter *),
> 				   GFP_KERNEL_ACCOUNT);
> 	if (!filters->filters)
> 		return ERR_PTR(-ENOMEM);
> 
> 	refcount_set(&filters->refs, 1);
> 	spin_lock_init(&filters->lock);
> 	return no_free_ptr(filters);
> }

Adopted as well, thanks.

>> +/*
>> + * Validate classic BPF filter instructions. Only allow a safe subset of
>> + * operations - no packet data access, just context field loads and basic
>> + * ALU/jump operations.
>> + */
>> +static int io_uring_check_cbpf_filter(struct sock_filter *filter,
>> +				      unsigned int flen)
>> +{
>> +	int pc;
> 
> Seems fine to me but I can't meaningfully review this.

Yeah seriously... It's just the seccomp filter modified for this use
case, so supposedly should be vetted already.

>> +int io_register_bpf_filter(struct io_restriction *res,
>> +			   struct io_uring_bpf __user *arg)
>> +{
>> +	struct io_bpf_filter *filter, *old_filter;
>> +	struct io_bpf_filters *filters;
>> +	struct io_uring_bpf reg;
>> +	struct bpf_prog *prog;
>> +	struct sock_fprog fprog;
>> +	int ret;
>> +
>> +	if (copy_from_user(&reg, arg, sizeof(reg)))
>> +		return -EFAULT;
>> +	if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
>> +		return -EINVAL;
>> +	if (reg.cmd_flags || reg.resv)
>> +		return -EINVAL;
>> +
>> +	if (reg.filter.opcode >= IORING_OP_LAST)
>> +		return -EINVAL;
> 
> So you only support per-op-code filtering with cbpf. I assume that you
> would argue that people can use the existing io_uring restrictions. But
> that's not inherited, right? So then this forces users to have a bpf
> program for all opcodes that io_uring on their system supports.

The existing restrictions ARE inherited now, and can be set on a
per-task basis as well. That's in the last patch. And since the classic
"deny this opcode" filtering is way cheaper than running a BPF prog, I
do think that's the right approach.

> I think that this is a bit unfortunate and wasteful for both userspace
> and io_uring. Can't we do a combined thing where we also allow filters
> to attach to all op-codes. Then userspace could start with an allow-list
> or deny-list filter and then attach further per-op-code bpf programs to
> the op-codes they want to manage specifically. Then you also get
> inheritance of the restrictions per-task.
> 
> That would be nicer imho.

I'm considering this one moot given the above on using
IORING_REGISTER_RESTRICTIONS with fd == -1 to set per-task restrictions
using the classic non-bpf filtering, which are inherited as well. You
only need the cBPF filters if you want to deny an opcode based on aux
data for that opcode.

-- 
Jens Axboe

  reply	other threads:[~2026-01-27 16:41 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-19 23:54 [PATCHSET v6] Inherited restrictions and BPF filtering for io_uring Jens Axboe
2026-01-19 23:54 ` [PATCH 1/7] io_uring: add support for BPF filtering for opcode restrictions Jens Axboe
2026-01-27 10:06   ` Christian Brauner
2026-01-27 16:41     ` Jens Axboe [this message]
2026-01-19 23:54 ` [PATCH 2/7] io_uring/net: allow filtering on IORING_OP_SOCKET data Jens Axboe
2026-01-19 23:54 ` [PATCH 3/7] io_uring/bpf_filter: allow filtering on contents of struct open_how Jens Axboe
2026-01-27  9:33   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 4/7] io_uring/bpf_filter: cache lookup table in ctx->bpf_filters Jens Axboe
2026-01-27  9:33   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 5/7] io_uring/bpf_filter: add ref counts to struct io_bpf_filter Jens Axboe
2026-01-27  9:34   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 6/7] io_uring: add task fork hook Jens Axboe
2026-01-27 10:07   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 7/7] io_uring: allow registration of per-task restrictions Jens Axboe
2026-01-22  3:37 ` [PATCHSET v6] Inherited restrictions and BPF filtering for io_uring Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2026-01-27 18:29 [PATCHSET v7] " Jens Axboe
2026-01-27 18:29 ` [PATCH 1/7] io_uring: add support for BPF filtering for opcode restrictions Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=269f6e6a-362f-4ccf-926f-2e2e0c1ea014@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=io-uring@vger.kernel.org \
    --cc=jannh@google.com \
    --cc=kees@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.