From: Anuj Gupta <anuj20.g@samsung.com>
To: Christoph Hellwig <hch@lst.de>
Cc: Anuj gupta <anuj1072538@gmail.com>,
axboe@kernel.dk, kbusch@kernel.org, martin.petersen@oracle.com,
asml.silence@gmail.com, brauner@kernel.org, jack@suse.cz,
viro@zeniv.linux.org.uk, io-uring@vger.kernel.org,
linux-nvme@lists.infradead.org, linux-block@vger.kernel.org,
gost.dev@samsung.com, linux-scsi@vger.kernel.org,
vishak.g@samsung.com, linux-fsdevel@vger.kernel.org,
Kanchan Joshi <joshi.k@samsung.com>
Subject: Re: [PATCH v8 06/10] io_uring/rw: add support to send metadata along with read/write
Date: Thu, 7 Nov 2024 16:10:00 +0530 [thread overview]
Message-ID: <20241107104000.GB9730@green245> (raw)
In-Reply-To: <20241107073852.GA5195@lst.de>
[-- Attachment #1: Type: text/plain, Size: 9216 bytes --]
I addressed your feedback in the patch below, does this look fine?
From e03fe5fe8ea057d01f5986b8add3769d1095da07 Mon Sep 17 00:00:00 2001
From: Anuj Gupta <anuj20.g@samsung.com>
Date: Wed, 6 Nov 2024 17:48:38 +0530
Subject: [PATCH] io_uring/rw: add support to send metadata along with
read/write
This patch adds the capability of passing integrity metadata along with
read/write. A new ext_cap (extended_capability) field is introduced in SQE
which indicates the type of extra information being sent. A new
'struct io_uring_sqe_ext' represents the secondary SQE space for
read/write. In future if another extension needs to be added, then one
needs to:
1. Add extra fields in the sqe/secondary-sqe
2. Introduce a ext_cap flag indicating additional values that have been
passed
The last 32 bytes of secondary SQE is used to pass following PI related
information:
- flags: integrity check flags namely
IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- pi_len: length of the pi/metadata buffer
- pi_addr: address of the metadata buffer
- pi_seed: seed value for reftag remapping
- pi_app_tag: application defined 16b value
Application sets up a SQE128 ring, prepares PI information within the
second SQE and sets the ext_cap field to EXT_CAP_PI. The patch processes
this information to prepare uio_meta descriptor and passes it down using
kiocb->private.
Meta exchange is supported only for direct IO.
Also vectored read/write operations with meta are not supported
currently.
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
include/uapi/linux/io_uring.h | 28 +++++++++++
io_uring/io_uring.c | 5 ++
io_uring/rw.c | 88 ++++++++++++++++++++++++++++++++++-
io_uring/rw.h | 14 +++++-
4 files changed, 132 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 56cf30b49ef5..29f0b742004b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,11 @@ struct io_uring_sqe {
__u16 addr_len;
__u16 __pad3[1];
};
+ struct {
+ /* flags indicating additional information being passed */
+ __u16 ext_cap;
+ __u16 __pad4[1];
+ };
};
union {
struct {
@@ -107,6 +112,29 @@ struct io_uring_sqe {
};
};
+/*
+ * If sqe->ext_cap is set to this for IORING_OP_READ/WRITE, then the SQE
+ * contains protection information, and ring needs to be setup with SQE128
+ */
+#define EXT_CAP_PI (1U << 0)
+
+/* Second half of SQE128 for IORING_OP_READ/WRITE */
+struct io_uring_sqe_ext {
+ /*
+ * Reserved space for extended capabilities that are added down the
+ * line. Kept in beginning to maintain contiguity with the free space
+ * in first SQE
+ */
+ __u64 rsvd0[4];
+ /* only valid when EXT_CAP_PI is set */
+ __u16 flags;
+ __u16 pi_app_tag;
+ __u32 pi_len;
+ __u64 pi_addr;
+ __u64 pi_seed;
+ __u64 rsvd1;
+};
+
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 076171977d5e..5aa16bb60313 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4166,7 +4166,9 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+ BUILD_BUG_SQE_ELEM(44, __u16, ext_cap);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
+ BUILD_BUG_SQE_ELEM(46, __u16, __pad4[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
@@ -4193,6 +4195,9 @@ static int __init io_uring_init(void)
/* top 8bits are for internal use */
BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+ BUILD_BUG_ON(sizeof(struct io_uring_sqe_ext) !=
+ sizeof(struct io_uring_sqe));
+
io_uring_optable_init();
/*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 768a908ca2a8..4f8b7952d9be 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,64 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
return 0;
}
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+ io->meta_state.seed = io->meta.seed;
+ iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+ io->meta.seed = io->meta_state.seed;
+ iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline const void *io_uring_sqe_ext(const struct io_uring_sqe *sqe)
+{
+ return (sqe + 1);
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_rw *rw, int ddir)
+{
+ const struct io_uring_sqe_ext *sqe_ext;
+ const struct io_issue_def *def;
+ struct io_async_rw *io;
+ int ret;
+
+ if (!(req->ctx->flags & IORING_SETUP_SQE128))
+ return -EINVAL;
+
+ sqe_ext = io_uring_sqe_ext(sqe);
+ if (READ_ONCE(sqe_ext->rsvd0[0]) || READ_ONCE(sqe_ext->rsvd0[1])
+ || READ_ONCE(sqe_ext->rsvd0[2]) || READ_ONCE(sqe_ext->rsvd0[3]))
+ return -EINVAL;
+ if (READ_ONCE(sqe_ext->rsvd1))
+ return -EINVAL;
+
+ def = &io_issue_defs[req->opcode];
+ if (def->vectored)
+ return -EOPNOTSUPP;
+
+ io = req->async_data;
+ io->meta.flags = READ_ONCE(sqe_ext->flags);
+ io->meta.app_tag = READ_ONCE(sqe_ext->pi_app_tag);
+ io->meta.seed = READ_ONCE(sqe_ext->pi_seed);
+ ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe_ext->pi_addr)),
+ READ_ONCE(sqe_ext->pi_len), &io->meta.iter);
+ if (unlikely(ret < 0))
+ return ret;
+ rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+ io_meta_save_state(io);
+ return ret;
+}
+
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir, bool do_import)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned ioprio;
+ u16 ext_cap;
int ret;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +332,23 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
+ rw->kiocb.ki_flags = 0;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
- return io_prep_rw_setup(req, ddir, do_import);
+ ret = io_prep_rw_setup(req, ddir, do_import);
+
+ if (unlikely(ret))
+ return ret;
+
+ ext_cap = READ_ONCE(sqe->ext_cap);
+ if (ext_cap) {
+ if (READ_ONCE(sqe->__pad4[0]) || !(ext_cap & EXT_CAP_PI))
+ return -EINVAL;
+ ret = io_prep_rw_pi(req, sqe, rw, ddir);
+ }
+ return ret;
}
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -410,7 +475,10 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
static void io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+ io_meta_restore(io);
iov_iter_restore(&io->iter, &io->iter_state);
}
@@ -795,7 +863,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (!(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(file);
- kiocb->ki_flags = file->f_iocb_flags;
+ kiocb->ki_flags |= file->f_iocb_flags;
ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
if (unlikely(ret))
return ret;
@@ -829,6 +897,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
kiocb->ki_complete = io_complete_rw;
}
+ if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+ struct io_async_rw *io = req->async_data;
+
+ /*
+ * We have a union of meta fields with wpq used for buffered-io
+ * in io_async_rw, so fail it here.
+ */
+ if (!(req->file->f_flags & O_DIRECT))
+ return -EOPNOTSUPP;
+ kiocb->private = &io->meta;
+ }
+
return 0;
}
@@ -903,6 +983,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* manually if we need to.
*/
iov_iter_restore(&io->iter, &io->iter_state);
+ if (kiocb->ki_flags & IOCB_HAS_METADATA)
+ io_meta_restore(io);
do {
/*
@@ -1126,6 +1208,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
ret_eagain:
iov_iter_restore(&io->iter, &io->iter_state);
+ if (kiocb->ki_flags & IOCB_HAS_METADATA)
+ io_meta_restore(io);
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
#include <linux/pagemap.h>
+struct io_meta_state {
+ u32 seed;
+ struct iov_iter_state iter_meta;
+};
+
struct io_async_rw {
size_t bytes_done;
struct iov_iter iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
struct iovec fast_iov;
struct iovec *free_iovec;
int free_iov_nr;
- struct wait_page_queue wpq;
+ /* wpq is for buffered io, while meta fields are used with direct io */
+ union {
+ struct wait_page_queue wpq;
+ struct {
+ struct uio_meta meta;
+ struct io_meta_state meta_state;
+ };
+ };
};
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
--
2.25.1
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
next prev parent reply other threads:[~2024-11-07 11:39 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20241106122631epcas5p2575c59b7634e0077f8e5c654b5fd5dbb@epcas5p2.samsung.com>
2024-11-06 12:18 ` [PATCH v8 00/10] Read/Write with meta/integrity Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 01/10] block: define set of integrity flags to be inherited by cloned bip Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 02/10] block: copy back bounce buffer to user-space correctly in case of split Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 03/10] block: modify bio_integrity_map_user to accept iov_iter as argument Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 04/10] fs, iov_iter: define meta io descriptor Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 05/10] fs: introduce IOCB_HAS_METADATA for metadata Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 06/10] io_uring/rw: add support to send metadata along with read/write Anuj Gupta
2024-11-07 5:55 ` Christoph Hellwig
2024-11-07 7:26 ` Anuj gupta
2024-11-07 7:38 ` Christoph Hellwig
2024-11-07 10:40 ` Anuj Gupta [this message]
2024-11-07 11:44 ` Christoph Hellwig
2024-11-12 0:54 ` Pavel Begunkov
2024-11-12 6:51 ` Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 07/10] block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 08/10] nvme: add support for passing on the application tag Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 09/10] scsi: add support for user-meta interface Anuj Gupta
2024-11-06 12:18 ` [PATCH v8 10/10] block: add support to pass user meta buffer Anuj Gupta
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241107104000.GB9730@green245 \
--to=anuj20.g@samsung.com \
--cc=anuj1072538@gmail.com \
--cc=asml.silence@gmail.com \
--cc=axboe@kernel.dk \
--cc=brauner@kernel.org \
--cc=gost.dev@samsung.com \
--cc=hch@lst.de \
--cc=io-uring@vger.kernel.org \
--cc=jack@suse.cz \
--cc=joshi.k@samsung.com \
--cc=kbusch@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=viro@zeniv.linux.org.uk \
--cc=vishak.g@samsung.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.