From: Kanchan Joshi <joshi.k@samsung.com>
To: axboe@kernel.dk, kbusch@kernel.org, hch@lst.de, hare@suse.de,
sagi@grimberg.me, martin.petersen@oracle.com, brauner@kernel.org,
viro@zeniv.linux.org.uk, jack@suse.cz, jaegeuk@kernel.org,
bcrl@kvack.org, dhowells@redhat.com, bvanassche@acm.org,
asml.silence@gmail.com
Cc: linux-nvme@lists.infradead.org, linux-fsdevel@vger.kernel.org,
io-uring@vger.kernel.org, linux-block@vger.kernel.org,
linux-aio@kvack.org, gost.dev@samsung.com, vishak.g@samsung.com,
javier.gonz@samsung.com, Kanchan Joshi <joshi.k@samsung.com>,
Nitesh Shetty <nj.shetty@samsung.com>
Subject: [PATCH v7 3/3] io_uring: enable per-io hinting capability
Date: Mon, 30 Sep 2024 23:43:05 +0530 [thread overview]
Message-ID: <20240930181305.17286-4-joshi.k@samsung.com> (raw)
In-Reply-To: <20240930181305.17286-1-joshi.k@samsung.com>
With F_SET_RW_HINT fcntl, user can set a hint on the file inode, and
all the subsequent writes on the file pass that hint value down.
This can be limiting for large files (and for block devices) as all the
writes can be tagged with only one lifetime hint value.
Concurrent writes (with different hint values) are hard to manage.
Per-IO hinting solves that problem.
Allow userspace to pass additional metadata in the SQE.
The type of passed metadata is expressed by a new field
__u16 meta_type;
At this point only one type, META_TYPE_LIFETIME_HINT, is supported.
With this type, user can pass lifetime hint values in the new field
__u64 lifetime_val;
This accepts all lifetime hint values that are possible with
F_SET_RW_HINT fcntl.
The write handlers (io_prep_rw, io_write) send the hint value to the
lower layer using kiocb. This is good for supporting direct IO,
but not when kiocb is not available (e.g., buffered IO).
When per-io hints are not passed, the per-inode hint values are set in
the kiocb (as before). Otherwise, the per-io hints take precedence over
the per-inode hints.
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
fs/fcntl.c | 22 ----------------------
include/linux/rw_hint.h | 24 ++++++++++++++++++++++++
include/uapi/linux/io_uring.h | 19 +++++++++++++++++++
io_uring/rw.c | 25 ++++++++++++++++++++++++-
4 files changed, 67 insertions(+), 23 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22dd9dcce7ec..a390a05f4ef8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -334,28 +334,6 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
-static bool rw_hint_valid(u64 hint)
-{
- BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
- BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
- BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
- BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
- BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
- BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
-
- switch (hint) {
- case RWH_WRITE_LIFE_NOT_SET:
- case RWH_WRITE_LIFE_NONE:
- case RWH_WRITE_LIFE_SHORT:
- case RWH_WRITE_LIFE_MEDIUM:
- case RWH_WRITE_LIFE_LONG:
- case RWH_WRITE_LIFE_EXTREME:
- return true;
- default:
- return false;
- }
-}
-
static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
{
diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h
index 309ca72f2dfb..f4373a71ffed 100644
--- a/include/linux/rw_hint.h
+++ b/include/linux/rw_hint.h
@@ -21,4 +21,28 @@ enum rw_hint {
static_assert(sizeof(enum rw_hint) == 1);
#endif
+#define WRITE_LIFE_INVALID (RWH_WRITE_LIFE_EXTREME + 1)
+
+static inline bool rw_hint_valid(u64 hint)
+{
+ BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
+ BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
+ BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
+ BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
+ BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
+ BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
+
+ switch (hint) {
+ case RWH_WRITE_LIFE_NOT_SET:
+ case RWH_WRITE_LIFE_NONE:
+ case RWH_WRITE_LIFE_SHORT:
+ case RWH_WRITE_LIFE_MEDIUM:
+ case RWH_WRITE_LIFE_LONG:
+ case RWH_WRITE_LIFE_EXTREME:
+ return true;
+ default:
+ return false;
+ }
+}
+
#endif /* _LINUX_RW_HINT_H */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 86cb385fe0b5..951e35226229 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,12 +92,23 @@ struct io_uring_sqe {
__u16 addr_len;
__u16 __pad3[1];
};
+ struct {
+ /* Bit field to express 16 meta types */
+ __u16 meta_type;
+ __u16 __pad4[1];
+ };
};
union {
struct {
__u64 addr3;
__u64 __pad2[1];
};
+ struct {
+ /* First meta type specific fields */
+ __u64 lifetime_val;
+ /* For future use */
+ __u64 __pad5[1];
+ };
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +118,14 @@ struct io_uring_sqe {
};
};
+enum io_uring_sqe_meta_type_bits {
+ META_TYPE_LIFETIME_HINT_BIT
+};
+
+/* this meta type covers write hint values supported by F_SET_RW_HINT fcntl */
+#define META_TYPE_LIFETIME_HINT (1U << META_TYPE_LIFETIME_HINT_BIT)
+
+
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 510123d3d837..bf45ee8904a4 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -269,6 +269,24 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
+ if (ddir == ITER_SOURCE) {
+ u16 mtype = READ_ONCE(sqe->meta_type);
+
+ rw->kiocb.ki_write_hint = WRITE_LIFE_INVALID;
+ if (mtype) {
+ u64 lhint = READ_ONCE(sqe->lifetime_val);
+
+ if (READ_ONCE(sqe->__pad4[0]) ||
+ READ_ONCE(sqe->__pad5[0]))
+ return -EINVAL;
+
+ if (mtype != META_TYPE_LIFETIME_HINT ||
+ !rw_hint_valid(lhint))
+ return -EINVAL;
+
+ rw->kiocb.ki_write_hint = lhint;
+ }
+ }
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
@@ -1023,7 +1041,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(ret))
return ret;
req->cqe.res = iov_iter_count(&io->iter);
- rw->kiocb.ki_write_hint = file_write_hint(rw->kiocb.ki_filp);
+ /*
+ * Use per-file hint only if per-io hint is not set.
+ * We need per-io hint to get precedence.
+ */
+ if (rw->kiocb.ki_write_hint == WRITE_LIFE_INVALID)
+ rw->kiocb.ki_write_hint = file_write_hint(rw->kiocb.ki_filp);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
--
2.25.1
next prev parent reply other threads:[~2024-09-30 18:21 UTC|newest]
Thread overview: 86+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20240930182052epcas5p37edefa7556b87c3fbb543275756ac736@epcas5p3.samsung.com>
2024-09-30 18:13 ` [PATCH v7 0/3] FDP and per-io hints Kanchan Joshi
2024-09-30 18:13 ` [PATCH v7 1/3] nvme: enable FDP support Kanchan Joshi
2024-10-02 18:37 ` Bart Van Assche
2024-10-03 12:55 ` Christoph Hellwig
2024-09-30 18:13 ` [PATCH v7 2/3] block, fs: restore kiocb based write hint processing Kanchan Joshi
2024-09-30 18:13 ` Kanchan Joshi [this message]
2024-10-02 14:26 ` [PATCH v7 3/3] io_uring: enable per-io hinting capability Pavel Begunkov
2024-10-17 14:58 ` Kanchan Joshi
2024-10-02 18:29 ` Bart Van Assche
2024-10-01 9:20 ` [PATCH v7 0/3] FDP and per-io hints Christoph Hellwig
2024-10-01 15:58 ` James R. Bergsten
2024-10-01 16:18 ` Jens Axboe
2024-10-02 7:51 ` Christoph Hellwig
2024-10-02 15:03 ` Jens Axboe
2024-10-02 15:13 ` Christoph Hellwig
2024-10-02 15:17 ` Keith Busch
2024-10-02 15:19 ` Christoph Hellwig
2024-10-02 15:33 ` Keith Busch
2024-10-03 12:51 ` Christoph Hellwig
2024-10-02 15:47 ` Martin K. Petersen
2024-10-02 18:34 ` Bart Van Assche
2024-10-03 12:55 ` Christoph Hellwig
2024-10-03 21:48 ` Keith Busch
2024-10-03 22:00 ` Bart Van Assche
2024-10-03 22:12 ` Jens Axboe
2024-10-03 22:17 ` Keith Busch
2024-10-04 6:21 ` Javier González
2024-10-04 6:24 ` Christoph Hellwig
2024-10-04 6:59 ` Javier González
2024-10-04 12:32 ` Christoph Hellwig
2024-10-07 11:29 ` Javier González
2024-10-08 12:27 ` Christoph Hellwig
2024-10-03 12:54 ` Christoph Hellwig
2024-10-03 22:14 ` Jens Axboe
2024-10-04 5:31 ` Christoph Hellwig
2024-10-04 6:18 ` Javier González
2024-10-04 6:27 ` Christoph Hellwig
2024-10-04 6:52 ` Javier González
2024-10-04 12:30 ` Christoph Hellwig
2024-10-07 10:10 ` Javier González
2024-10-08 10:06 ` Hans Holmberg
2024-10-09 14:36 ` Javier Gonzalez
2024-10-10 6:40 ` Hans Holmberg
2024-10-10 7:13 ` Javier Gonzalez
2024-10-10 9:20 ` Christoph Hellwig
2024-10-10 12:22 ` Javier Gonzalez
2024-10-11 8:56 ` Christoph Hellwig
2024-10-11 12:21 ` Javier Gonzalez
2024-10-11 16:59 ` Keith Busch
2024-10-10 10:46 ` Hans Holmberg
2024-10-10 12:27 ` Javier Gonzalez
2024-10-11 8:59 ` Christoph Hellwig
2024-10-08 12:25 ` Christoph Hellwig
2024-10-08 14:44 ` Keith Busch
2024-10-09 9:28 ` Christoph Hellwig
2024-10-09 15:06 ` Keith Busch
2024-10-10 7:07 ` Javier González
2024-10-10 9:13 ` Christoph Hellwig
2024-10-10 11:59 ` Javier González
2024-10-11 9:02 ` Christoph Hellwig
2024-10-11 17:08 ` Jens Axboe
2024-10-14 6:21 ` Christoph Hellwig
2024-10-14 7:02 ` Javier Gonzalez
2024-10-14 7:47 ` Christoph Hellwig
2024-10-14 9:08 ` Javier Gonzalez
2024-10-14 11:50 ` Christoph Hellwig
2024-10-15 3:07 ` Javier Gonzalez
2024-10-15 5:30 ` Christoph Hellwig
2024-10-10 9:10 ` Christoph Hellwig
2024-10-09 16:28 ` Nitesh Shetty
2024-10-02 15:22 ` Jens Axboe
2024-10-01 16:23 ` Keith Busch
2024-10-02 7:49 ` Christoph Hellwig
2024-10-02 14:56 ` Keith Busch
2024-10-02 15:00 ` Jens Axboe
2024-10-03 0:20 ` Bart Van Assche
2024-10-15 5:50 ` Christoph Hellwig
2024-10-15 15:09 ` Keith Busch
2024-10-15 15:22 ` Christoph Hellwig
2024-10-17 14:35 ` Kanchan Joshi
2024-10-17 15:23 ` Christoph Hellwig
2024-10-17 15:44 ` Keith Busch
2024-10-17 15:46 ` Christoph Hellwig
2024-10-17 16:06 ` Keith Busch
2024-10-17 16:15 ` Bart Van Assche
2024-10-17 16:23 ` Keith Busch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240930181305.17286-4-joshi.k@samsung.com \
--to=joshi.k@samsung.com \
--cc=asml.silence@gmail.com \
--cc=axboe@kernel.dk \
--cc=bcrl@kvack.org \
--cc=brauner@kernel.org \
--cc=bvanassche@acm.org \
--cc=dhowells@redhat.com \
--cc=gost.dev@samsung.com \
--cc=hare@suse.de \
--cc=hch@lst.de \
--cc=io-uring@vger.kernel.org \
--cc=jack@suse.cz \
--cc=jaegeuk@kernel.org \
--cc=javier.gonz@samsung.com \
--cc=kbusch@kernel.org \
--cc=linux-aio@kvack.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=martin.petersen@oracle.com \
--cc=nj.shetty@samsung.com \
--cc=sagi@grimberg.me \
--cc=viro@zeniv.linux.org.uk \
--cc=vishak.g@samsung.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox