From: Keith Busch <kbusch@meta.com>
To: <ming.lei@redhat.com>, <asml.silence@gmail.com>,
<axboe@kernel.dk>, <linux-block@vger.kernel.org>,
<io-uring@vger.kernel.org>
Cc: <bernd@bsbernd.com>, Keith Busch <kbusch@kernel.org>
Subject: [PATCHv2 6/6] io_uring: cache nodes and mapped buffers
Date: Mon, 10 Feb 2025 16:56:46 -0800 [thread overview]
Message-ID: <20250211005646.222452-7-kbusch@meta.com> (raw)
In-Reply-To: <20250211005646.222452-1-kbusch@meta.com>
From: Keith Busch <kbusch@kernel.org>
Frequent alloc/free cycles on resource nodes and mapped buffers are pretty
costly. Use an io cache to reuse them more efficiently.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/linux/io_uring_types.h | 18 +++---
io_uring/filetable.c | 2 +-
io_uring/rsrc.c | 115 +++++++++++++++++++++++++--------
io_uring/rsrc.h | 2 +-
4 files changed, 101 insertions(+), 36 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4f4b7ad21500d..a6e525b756d10 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,8 +67,18 @@ struct io_file_table {
unsigned int alloc_hint;
};
+struct io_alloc_cache {
+ void **entries;
+ unsigned int nr_cached;
+ unsigned int max_cached;
+ size_t elem_size;
+ unsigned int init_clear;
+};
+
struct io_buf_table {
struct io_rsrc_data data;
+ struct io_alloc_cache node_cache;
+ struct io_alloc_cache imu_cache;
};
struct io_hash_bucket {
@@ -222,14 +232,6 @@ struct io_submit_state {
struct blk_plug plug;
};
-struct io_alloc_cache {
- void **entries;
- unsigned int nr_cached;
- unsigned int max_cached;
- unsigned int elem_size;
- unsigned int init_clear;
-};
-
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index dd8eeec97acf6..a21660e3145ab 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
if (slot_index >= ctx->file_table.data.nr)
return -EINVAL;
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node)
return -ENOMEM;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index b3f36f1b2a668..88a67590c67d4 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
+#define IO_CACHED_BVECS_SEGS 30
+
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -119,19 +121,35 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
}
}
-struct io_rsrc_node *io_rsrc_node_alloc(int type)
+
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
struct io_rsrc_node *node;
- node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (type == IORING_RSRC_FILE)
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ else
+ node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
if (node) {
node->type = type;
node->refs = 1;
+ node->tag = 0;
+ node->file_ptr = 0;
+ node->release = NULL;
+ node->priv = NULL;
}
return node;
}
-__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
+static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
+{
+ kvfree(data->nodes);
+ data->nodes = NULL;
+ data->nr = 0;
+}
+
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data)
{
if (!data->nr)
return;
@@ -139,9 +157,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
if (data->nodes[data->nr])
io_put_rsrc_node(ctx, data->nodes[data->nr]);
}
- kvfree(data->nodes);
- data->nodes = NULL;
- data->nr = 0;
+ __io_rsrc_data_free(data);
}
__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
@@ -155,6 +171,34 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
return -ENOMEM;
}
+static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
+{
+ const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
+ IO_CACHED_BVECS_SEGS);
+ int ret;
+
+ BUILD_BUG_ON(imu_cache_size != 512);
+ ret = io_rsrc_data_alloc(&table->data, nr);
+ if (ret)
+ return ret;
+
+ ret = io_alloc_cache_init(&table->node_cache, nr,
+ sizeof(struct io_rsrc_node), 0);
+ if (ret)
+ goto out_1;
+
+ ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
+ if (ret)
+ goto out_2;
+
+ return 0;
+out_2:
+ io_alloc_cache_free(&table->node_cache, kfree);
+out_1:
+ __io_rsrc_data_free(&table->data);
+ return ret;
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
@@ -204,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
err = -ENOMEM;
fput(file);
@@ -465,6 +509,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
io_buffer_unmap(ctx, node);
if (node->release)
node->release(node->priv);
+ if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
+ return;
break;
default:
WARN_ON_ONCE(1);
@@ -533,7 +579,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto fail;
}
ret = -ENOMEM;
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
fput(file);
goto fail;
@@ -553,11 +599,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
+ struct io_buf_table *table)
+{
+ io_rsrc_data_free(ctx, &table->data);
+ io_alloc_cache_free(&table->node_cache, kfree);
+ io_alloc_cache_free(&table->imu_cache, kfree);
+}
+
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
if (!ctx->buf_table.data.nr)
return -ENXIO;
- io_rsrc_data_free(ctx, &ctx->buf_table.data);
+ io_rsrc_buffer_free(ctx, &ctx->buf_table);
return 0;
}
@@ -722,6 +776,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
return true;
}
+static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
+ int nr_bvecs)
+{
+ if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
+ return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
+ return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
+ GFP_KERNEL);
+}
+
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
struct iovec *iov,
struct page **last_hpage)
@@ -738,7 +801,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
if (!iov->iov_base)
return NULL;
- node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!node)
return ERR_PTR(-ENOMEM);
node->buf = NULL;
@@ -758,7 +821,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
}
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ imu = io_alloc_imu(ctx, nr_pages);
if (!imu)
goto done;
@@ -804,9 +867,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
struct page *last_hpage = NULL;
- struct io_rsrc_data data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec;
+ struct io_buf_table table;
int i, ret;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -815,13 +878,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
- ret = io_rsrc_data_alloc(&data, nr_args);
+ ret = io_rsrc_buffer_alloc(&table, nr_args);
if (ret)
return ret;
if (!arg)
memset(iov, 0, sizeof(*iov));
+ ctx->buf_table = table;
for (i = 0; i < nr_args; i++) {
struct io_rsrc_node *node;
u64 tag = 0;
@@ -861,10 +925,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
node->tag = tag;
}
- data.nodes[i] = node;
+ table.data.nodes[i] = node;
}
-
- ctx->buf_table.data = data;
if (ret)
io_sqe_buffers_unregister(ctx);
return ret;
@@ -892,7 +954,7 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
if (node)
return -EBUSY;
- node = io_rsrc_node_alloc(IORING_RSRC_KBUFFER);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_KBUFFER);
if (!node)
return -ENOMEM;
@@ -900,7 +962,8 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
node->priv = rq;
nr_bvecs = blk_rq_nr_phys_segments(rq);
- imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+
+ imu = io_alloc_imu(ctx, nr_bvecs);
if (!imu) {
kfree(node);
return -ENOMEM;
@@ -1022,7 +1085,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
struct io_uring_clone_buffers *arg)
{
- struct io_rsrc_data data;
+ struct io_buf_table table;
int i, ret, off, nr;
unsigned int nbufs;
@@ -1053,7 +1116,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
- ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
+ ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
if (ret)
return ret;
@@ -1062,7 +1125,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
if (src_node) {
- data.nodes[i] = src_node;
+ table.data.nodes[i] = src_node;
src_node->refs++;
}
}
@@ -1092,7 +1155,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (!src_node) {
dst_node = NULL;
} else {
- dst_node = io_rsrc_node_alloc(src_node->type);
+ dst_node = io_rsrc_node_alloc(ctx, src_node->type);
if (!dst_node) {
ret = -ENOMEM;
goto out_free;
@@ -1101,12 +1164,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
refcount_inc(&src_node->buf->refs);
dst_node->buf = src_node->buf;
}
- data.nodes[off++] = dst_node;
+ table.data.nodes[off++] = dst_node;
i++;
}
/*
- * If asked for replace, put the old table. data->nodes[] holds both
+ * If asked for replace, put the old table. table.data.nodes[] holds both
* old and new nodes at this point.
*/
if (arg->flags & IORING_REGISTER_DST_REPLACE)
@@ -1119,10 +1182,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* entry).
*/
WARN_ON_ONCE(ctx->buf_table.data.nr);
- ctx->buf_table.data = data;
+ ctx->buf_table = table;
return 0;
out_free:
- io_rsrc_data_free(ctx, &data);
+ io_rsrc_buffer_free(ctx, &table);
return ret;
}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8147dfc26f737..751db2ce9affb 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -49,7 +49,7 @@ struct io_imu_folio_data {
unsigned int nr_folios;
};
-struct io_rsrc_node *io_rsrc_node_alloc(int type);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
--
2.43.5
next prev parent reply other threads:[~2025-02-11 0:57 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-11 0:56 [PATCHv2 0/6] ublk zero-copy support Keith Busch
2025-02-11 0:56 ` [PATCHv2 1/6] io_uring: use node for import Keith Busch
2025-02-11 0:56 ` [PATCHv2 2/6] io_uring: create resource release callback Keith Busch
2025-02-13 1:31 ` Pavel Begunkov
2025-02-13 1:58 ` Keith Busch
2025-02-13 13:06 ` Pavel Begunkov
2025-02-11 0:56 ` [PATCHv2 3/6] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-13 1:33 ` Pavel Begunkov
2025-02-14 3:30 ` Ming Lei
2025-02-14 15:26 ` Keith Busch
2025-02-15 1:34 ` Ming Lei
2025-02-18 20:34 ` Keith Busch
2025-02-11 0:56 ` [PATCHv2 4/6] ublk: zc register/unregister bvec Keith Busch
2025-02-12 2:49 ` Ming Lei
2025-02-12 4:11 ` Keith Busch
2025-02-12 9:24 ` Ming Lei
2025-02-12 14:59 ` Keith Busch
2025-02-13 2:12 ` Pavel Begunkov
2025-02-11 0:56 ` [PATCHv2 5/6] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-11 0:56 ` Keith Busch [this message]
2025-02-11 15:17 ` [PATCHv2 6/6] io_uring: cache nodes and mapped buffers kernel test robot
2025-02-11 16:47 ` Keith Busch
2025-02-12 1:42 ` kernel test robot
2025-02-12 2:29 ` [PATCHv2 0/6] ublk zero-copy support Ming Lei
2025-02-12 15:28 ` Keith Busch
2025-02-12 16:06 ` Pavel Begunkov
2025-02-13 1:52 ` Ming Lei
2025-02-13 15:12 ` lizetao
2025-02-13 16:06 ` Keith Busch
2025-02-14 3:39 ` lizetao
2025-02-14 2:41 ` Ming Lei
2025-02-14 4:21 ` lizetao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250211005646.222452-7-kbusch@meta.com \
--to=kbusch@meta.com \
--cc=asml.silence@gmail.com \
--cc=axboe@kernel.dk \
--cc=bernd@bsbernd.com \
--cc=io-uring@vger.kernel.org \
--cc=kbusch@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=ming.lei@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.