From: Ming Lei <tom.leiming@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: bpf@vger.kernel.org, Alexei Starovoitov <ast@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Yonghong Song <yonghong.song@linux.dev>,
	Ming Lei <tom.leiming@gmail.com>
Subject: [RFC PATCH 15/22] ublk: bpf: add bpf aio kfunc
Date: Tue,  7 Jan 2025 20:04:06 +0800
Message-ID: <20250107120417.1237392-16-tom.leiming@gmail.com>
In-Reply-To: <20250107120417.1237392-1-tom.leiming@gmail.com>

Define bpf aio kfuncs for bpf progs to submit AIO. For now, only
filesystem IO is covered; in the future, this may be extended to
network IO.

Only bvec buffers are covered for doing FS IO so far, but UBUF support
is easy to add thanks to the iov_iter abstraction.
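
For illustration only, here is a rough sketch of how a bpf prog might
drive these kfuncs (the __ksym declarations and the fd/position wiring
below are assumptions of this sketch; the actual callers and the bvec
buffer setup are added in following patches):

  /* hypothetical ublk bpf prog (sketch) */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern struct bpf_aio *bpf_aio_alloc(unsigned int op,
		enum bpf_aio_flag aio_flags) __ksym;
  extern void bpf_aio_release(struct bpf_aio *aio) __ksym;
  extern int bpf_aio_submit(struct bpf_aio *aio, int fd, loff_t pos,
		unsigned int bytes, unsigned int io_flags) __ksym;

  static int submit_backing_read(int backing_fd, loff_t pos,
				 unsigned int bytes)
  {
	struct bpf_aio *aio = bpf_aio_alloc(BPF_AIO_OP_FS_READ, 0);
	int ret;

	/* bpf_aio_alloc() is KF_RET_NULL, so a NULL check is required */
	if (!aio)
		return -ENOMEM;

	/* aio->buf and aio->ops must be wired up before submission */
	ret = bpf_aio_submit(aio, backing_fd, pos, bytes, 0);
	if (ret)
		bpf_aio_release(aio);
	return ret;
  }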

With bpf aio, both the user-kernel context switch and the user-kernel
buffer copy are avoided. This is very similar to loop's direct IO
implementation.

These kfuncs can be used by other subsystems and really belong in
lib/, but start with ublk first. Once they become mature or gain more
use cases, they can be moved to lib/.

Define the bpf struct_ops bpf_aio_complete_ops, which has to be
implemented by the caller in a bpf prog for completing the bpf aio;
that wiring is done in the following patches.
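
Continuing the sketch above, the completion side could look roughly
like this in a bpf prog (section names and libbpf conventions here are
assumptions of the sketch, not necessarily what the later patches use):

  #include <bpf/bpf_tracing.h>	/* for BPF_PROG() */

  SEC("struct_ops/ublk_bpf_aio_complete_cb")
  void BPF_PROG(ublk_bpf_aio_complete_cb, struct bpf_aio *io, long ret)
  {
	/* propagate `ret` to the io which originated this aio */
  }

  SEC(".struct_ops.link")
  struct bpf_aio_complete_ops ublk_aio_ops = {
	.id			= 1,	/* arbitrary id for this sketch */
	.bpf_aio_complete_cb	= (void *)ublk_bpf_aio_complete_cb,
  };

  char _license[] SEC("license") = "GPL";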

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/block/ublk/Makefile  |   2 +-
 drivers/block/ublk/bpf.c     |  40 +++++-
 drivers/block/ublk/bpf.h     |   1 +
 drivers/block/ublk/bpf_aio.c | 251 +++++++++++++++++++++++++++++++++++
 drivers/block/ublk/bpf_aio.h |  66 +++++++++
 5 files changed, 358 insertions(+), 2 deletions(-)
 create mode 100644 drivers/block/ublk/bpf_aio.c
 create mode 100644 drivers/block/ublk/bpf_aio.h

diff --git a/drivers/block/ublk/Makefile b/drivers/block/ublk/Makefile
index f843a9005cdb..7094607c040d 100644
--- a/drivers/block/ublk/Makefile
+++ b/drivers/block/ublk/Makefile
@@ -5,6 +5,6 @@ ccflags-y			+= -I$(src)
 
 ublk_drv-$(CONFIG_BLK_DEV_UBLK)	:= main.o
 ifeq ($(CONFIG_UBLK_BPF), y)
-ublk_drv-$(CONFIG_BLK_DEV_UBLK)	+= bpf_ops.o bpf.o
+ublk_drv-$(CONFIG_BLK_DEV_UBLK)	+= bpf_ops.o bpf.o bpf_aio.o
 endif
 obj-$(CONFIG_BLK_DEV_UBLK)	+= ublk_drv.o
diff --git a/drivers/block/ublk/bpf.c b/drivers/block/ublk/bpf.c
index ef1546a7ccda..d5880d61abe5 100644
--- a/drivers/block/ublk/bpf.c
+++ b/drivers/block/ublk/bpf.c
@@ -155,8 +155,23 @@ BTF_ID_FLAGS(func, ublk_bpf_get_iod, KF_TRUSTED_ARGS | KF_RET_NULL)
 BTF_ID_FLAGS(func, ublk_bpf_get_io_tag, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, ublk_bpf_get_queue_id, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, ublk_bpf_get_dev_id, KF_TRUSTED_ARGS)
+
+/* bpf aio kfunc */
+BTF_ID_FLAGS(func, bpf_aio_alloc, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_aio_alloc_sleepable, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_aio_release)
+BTF_ID_FLAGS(func, bpf_aio_submit)
 BTF_KFUNCS_END(ublk_bpf_kfunc_ids)
 
+__bpf_kfunc void bpf_aio_release_dtor(void *aio)
+{
+	bpf_aio_release(aio);
+}
+CFI_NOSEAL(bpf_aio_release_dtor);
+BTF_ID_LIST(bpf_aio_dtor_ids)
+BTF_ID(struct, bpf_aio)
+BTF_ID(func, bpf_aio_release_dtor)
+
 static const struct btf_kfunc_id_set ublk_bpf_kfunc_set = {
 	.owner = THIS_MODULE,
 	.set   = &ublk_bpf_kfunc_ids,
@@ -164,6 +179,12 @@ static const struct btf_kfunc_id_set ublk_bpf_kfunc_set = {
 
 int __init ublk_bpf_init(void)
 {
+	const struct btf_id_dtor_kfunc aio_dtors[] = {
+		{
+			.btf_id	      = bpf_aio_dtor_ids[0],
+			.kfunc_btf_id = bpf_aio_dtor_ids[1]
+		},
+	};
 	int err;
 
 	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
@@ -172,5 +193,22 @@ int __init ublk_bpf_init(void)
 		pr_warn("error while setting UBLK BPF tracing kfuncs: %d", err);
 		return err;
 	}
-	return ublk_bpf_struct_ops_init();
+
+	err = ublk_bpf_struct_ops_init();
+	if (err) {
+		pr_warn("error while initializing ublk bpf struct_ops: %d", err);
+		return err;
+	}
+
+	err = register_btf_id_dtor_kfuncs(aio_dtors, ARRAY_SIZE(aio_dtors),
+			THIS_MODULE);
+	if (err) {
+		pr_warn("error while registering aio destructor: %d", err);
+		return err;
+	}
+
+	err = bpf_aio_init();
+	if (err)
+		pr_warn("error while initializing bpf aio kfunc: %d", err);
+	return err;
 }
diff --git a/drivers/block/ublk/bpf.h b/drivers/block/ublk/bpf.h
index 4e178cbecb74..0ab25743ae7d 100644
--- a/drivers/block/ublk/bpf.h
+++ b/drivers/block/ublk/bpf.h
@@ -3,6 +3,7 @@
 #define UBLK_INT_BPF_HEADER
 
 #include "bpf_reg.h"
+#include "bpf_aio.h"
 
 typedef unsigned long ublk_bpf_return_t;
 typedef ublk_bpf_return_t (*queue_io_cmd_t)(struct ublk_bpf_io *io, unsigned int);
diff --git a/drivers/block/ublk/bpf_aio.c b/drivers/block/ublk/bpf_aio.c
new file mode 100644
index 000000000000..65013fe8054f
--- /dev/null
+++ b/drivers/block/ublk/bpf_aio.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Red Hat */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+
+#include "bpf_aio.h"
+
+static int __bpf_aio_submit(struct bpf_aio *aio);
+
+static struct kmem_cache *bpf_aio_cachep;
+static struct kmem_cache *bpf_aio_work_cachep;
+static struct workqueue_struct *bpf_aio_wq;
+
+static inline bool bpf_aio_is_rw(int op)
+{
+	return op == BPF_AIO_OP_FS_READ || op == BPF_AIO_OP_FS_WRITE;
+}
+
+/* check if it is a short read */
+static bool bpf_aio_is_short_read(const struct bpf_aio *aio, long ret)
+{
+	return ret >= 0 && ret < aio->bytes &&
+		bpf_aio_get_op(aio) == BPF_AIO_OP_FS_READ;
+}
+
+/* zero the remaining bytes from `off` to the end */
+static void bpf_aio_zero_remained(const struct bpf_aio *aio, long off)
+{
+	struct iov_iter iter;
+
+	iov_iter_bvec(&iter, ITER_DEST, aio->buf.bvec, aio->buf.nr_bvec, aio->bytes);
+	iter.iov_offset = aio->buf.bvec_off;
+
+	iov_iter_advance(&iter, off);
+	iov_iter_zero(aio->bytes - off, &iter);
+}
+
+static void bpf_aio_do_completion(struct bpf_aio *aio)
+{
+	if (aio->iocb.ki_filp)
+		fput(aio->iocb.ki_filp);
+	if (aio->work)
+		kmem_cache_free(bpf_aio_work_cachep, aio->work);
+}
+
+/* ->ki_complete callback */
+static void bpf_aio_complete(struct kiocb *iocb, long ret)
+{
+	struct bpf_aio *aio = container_of(iocb, struct bpf_aio, iocb);
+
+	if (unlikely(ret == -EAGAIN)) {
+		aio->opf |= BPF_AIO_FORCE_WQ;
+		ret = __bpf_aio_submit(aio);
+		if (!ret)
+			return;
+	}
+
+	/* zero the remaining bytes in case of a short read */
+	if (bpf_aio_is_short_read(aio, ret))
+		bpf_aio_zero_remained(aio, ret);
+
+	bpf_aio_do_completion(aio);
+	aio->ops->bpf_aio_complete_cb(aio, ret);
+}
+
+static void bpf_aio_prep_rw(struct bpf_aio *aio, unsigned int rw,
+		struct iov_iter *iter)
+{
+	iov_iter_bvec(iter, rw, aio->buf.bvec, aio->buf.nr_bvec, aio->bytes);
+	iter->iov_offset = aio->buf.bvec_off;
+
+	if (unlikely(aio->opf & BPF_AIO_FORCE_WQ)) {
+		aio->iocb.ki_flags &= ~IOCB_NOWAIT;
+		aio->iocb.ki_complete = NULL;
+	} else {
+		aio->iocb.ki_flags |= IOCB_NOWAIT;
+		aio->iocb.ki_complete = bpf_aio_complete;
+	}
+}
+
+static int bpf_aio_do_submit(struct bpf_aio *aio)
+{
+	int op = bpf_aio_get_op(aio);
+	struct iov_iter iter;
+	struct file *file = aio->iocb.ki_filp;
+	int ret;
+
+	switch (op) {
+	case BPF_AIO_OP_FS_READ:
+		bpf_aio_prep_rw(aio, ITER_DEST, &iter);
+		if (file->f_op->read_iter)
+			ret = file->f_op->read_iter(&aio->iocb, &iter);
+		else
+			ret = -EOPNOTSUPP;
+		break;
+	case BPF_AIO_OP_FS_WRITE:
+		bpf_aio_prep_rw(aio, ITER_SOURCE, &iter);
+		if (file->f_op->write_iter)
+			ret = file->f_op->write_iter(&aio->iocb, &iter);
+		else
+			ret = -EOPNOTSUPP;
+		break;
+	case BPF_AIO_OP_FS_FSYNC:
+		ret = vfs_fsync_range(aio->iocb.ki_filp, aio->iocb.ki_pos,
+				aio->iocb.ki_pos + aio->bytes - 1, 0);
+		if (unlikely(ret && ret != -EINVAL))
+			ret = -EIO;
+		break;
+	case BPF_AIO_OP_FS_FALLOCATE:
+		ret = vfs_fallocate(aio->iocb.ki_filp, aio->iocb.ki_flags,
+				aio->iocb.ki_pos, aio->bytes);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret == -EIOCBQUEUED) {
+		ret = 0;
+	} else if (ret != -EAGAIN) {
+		bpf_aio_complete(&aio->iocb, ret);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static void bpf_aio_submit_work(struct work_struct *work)
+{
+	struct bpf_aio_work *aio_work = container_of(work, struct bpf_aio_work, work);
+
+	bpf_aio_do_submit(aio_work->aio);
+}
+
+static int __bpf_aio_submit(struct bpf_aio *aio)
+{
+	struct work_struct *work;
+
+do_submit:
+	if (likely(!(aio->opf & BPF_AIO_FORCE_WQ))) {
+		int ret = bpf_aio_do_submit(aio);
+
+		/* retry via workqueue in case of -EAGAIN */
+		if (ret != -EAGAIN)
+			return ret;
+		aio->opf |= BPF_AIO_FORCE_WQ;
+	}
+
+	if (!aio->work) {
+		bool in_irq = in_interrupt();
+		gfp_t gfpflags = in_irq ? GFP_ATOMIC : GFP_NOIO;
+
+		aio->work = kmem_cache_alloc(bpf_aio_work_cachep, gfpflags);
+		if (unlikely(!aio->work)) {
+			if (in_irq)
+				return -ENOMEM;
+			aio->opf &= ~BPF_AIO_FORCE_WQ;
+			goto do_submit;
+		}
+	}
+
+	aio->work->aio = aio;
+	work = &aio->work->work;
+	INIT_WORK(work, bpf_aio_submit_work);
+	queue_work(bpf_aio_wq, work);
+
+	return 0;
+}
+
+static struct bpf_aio *__bpf_aio_alloc(gfp_t gfpflags, unsigned int op,
+				       enum bpf_aio_flag aio_flags)
+{
+	struct bpf_aio *aio;
+
+	if (op >= BPF_AIO_OP_LAST)
+		return NULL;
+
+	if (aio_flags & BPF_AIO_OP_MASK)
+		return NULL;
+
+	aio = kmem_cache_alloc(bpf_aio_cachep, gfpflags);
+	if (!aio)
+		return NULL;
+
+	memset(aio, 0, sizeof(*aio));
+	aio->opf = op | (unsigned int)aio_flags;
+	return aio;
+}
+
+__bpf_kfunc struct bpf_aio *bpf_aio_alloc(unsigned int op, enum bpf_aio_flag aio_flags)
+{
+	return __bpf_aio_alloc(GFP_ATOMIC, op, aio_flags);
+}
+
+__bpf_kfunc struct bpf_aio *bpf_aio_alloc_sleepable(unsigned int op, enum bpf_aio_flag aio_flags)
+{
+	return __bpf_aio_alloc(GFP_NOIO, op, aio_flags);
+}
+
+__bpf_kfunc void bpf_aio_release(struct bpf_aio *aio)
+{
+	kmem_cache_free(bpf_aio_cachep, aio);
+}
+
+/* Submit AIO from bpf prog */
+__bpf_kfunc int bpf_aio_submit(struct bpf_aio *aio, int fd, loff_t pos,
+		unsigned int bytes, unsigned int io_flags)
+{
+	struct file *file;
+
+	if (!aio->ops)
+		return -EINVAL;
+
+	file = fget(fd);
+	if (!file)
+		return -EINVAL;
+
+	/* we may be called from an io completion handler */
+	if (in_interrupt())
+		aio->opf |= BPF_AIO_FORCE_WQ;
+
+	aio->iocb.ki_pos = pos;
+	aio->iocb.ki_filp = file;
+	aio->iocb.ki_flags = io_flags;
+	aio->bytes = bytes;
+	if (bpf_aio_is_rw(bpf_aio_get_op(aio))) {
+		if (file->f_flags & O_DIRECT)
+			aio->iocb.ki_flags |= IOCB_DIRECT;
+		else
+			aio->opf |= BPF_AIO_FORCE_WQ;
+		aio->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+	} else {
+		aio->opf |= BPF_AIO_FORCE_WQ;
+	}
+
+	return __bpf_aio_submit(aio);
+}
+
+int __init bpf_aio_init(void)
+{
+	bpf_aio_cachep = KMEM_CACHE(bpf_aio, SLAB_PANIC);
+	bpf_aio_work_cachep = KMEM_CACHE(bpf_aio_work, SLAB_PANIC);
+	bpf_aio_wq = alloc_workqueue("bpf_aio", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+
+	return 0;
+}
diff --git a/drivers/block/ublk/bpf_aio.h b/drivers/block/ublk/bpf_aio.h
new file mode 100644
index 000000000000..625737965c90
--- /dev/null
+++ b/drivers/block/ublk/bpf_aio.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) 2024 Red Hat */
+#ifndef UBLK_BPF_AIO_HEADER
+#define UBLK_BPF_AIO_HEADER
+
+#define	BPF_AIO_OP_BITS		8
+#define	BPF_AIO_OP_MASK		((1 << BPF_AIO_OP_BITS) - 1)
+
+enum bpf_aio_op {
+	BPF_AIO_OP_FS_READ	= 0,
+	BPF_AIO_OP_FS_WRITE,
+	BPF_AIO_OP_FS_FSYNC,
+	BPF_AIO_OP_FS_FALLOCATE,
+	BPF_AIO_OP_LAST,
+};
+
+enum bpf_aio_flag_bits {
+	/* force to submit io from wq */
+	__BPF_AIO_FORCE_WQ	= BPF_AIO_OP_BITS,
+	__BPF_AIO_NR_BITS,	/* stops here */
+};
+
+enum bpf_aio_flag {
+	BPF_AIO_FORCE_WQ	= (1 << __BPF_AIO_FORCE_WQ),
+};
+
+struct bpf_aio_work {
+	struct bpf_aio		*aio;
+	struct work_struct	work;
+};
+
+/* todo: support ubuf & iovec in the future */
+struct bpf_aio_buf {
+	unsigned int		bvec_off;
+	int			nr_bvec;
+	const struct bio_vec	*bvec;
+};
+
+struct bpf_aio {
+	unsigned int opf;
+	unsigned int bytes;
+	struct bpf_aio_buf	buf;
+	struct bpf_aio_work	*work;
+	const struct bpf_aio_complete_ops *ops;
+	struct kiocb iocb;
+};
+
+typedef void (*bpf_aio_complete_t)(struct bpf_aio *io, long ret);
+
+struct bpf_aio_complete_ops {
+	unsigned int		id;
+	bpf_aio_complete_t	bpf_aio_complete_cb;
+};
+
+static inline unsigned int bpf_aio_get_op(const struct bpf_aio *aio)
+{
+	return aio->opf & BPF_AIO_OP_MASK;
+}
+
+int bpf_aio_init(void);
+struct bpf_aio *bpf_aio_alloc(unsigned int op, enum bpf_aio_flag aio_flags);
+struct bpf_aio *bpf_aio_alloc_sleepable(unsigned int op, enum bpf_aio_flag aio_flags);
+void bpf_aio_release(struct bpf_aio *aio);
+int bpf_aio_submit(struct bpf_aio *aio, int fd, loff_t pos, unsigned int bytes,
+		unsigned int io_flags);
+#endif
-- 
2.47.0

