From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: Uday Shankar <ushankar@purestorage.com>,
Caleb Sander Mateos <csander@purestorage.com>,
Ming Lei <ming.lei@redhat.com>
Subject: [PATCH V2 2/5] ublk: implement NUMA-aware memory allocation
Date: Tue, 28 Oct 2025 16:56:31 +0800
Message-ID: <20251028085636.185714-3-ming.lei@redhat.com>
In-Reply-To: <20251028085636.185714-1-ming.lei@redhat.com>
Implement NUMA-aware memory allocation for the ublk driver to improve
performance on multi-socket systems.
This commit includes the following changes:
1. Convert struct ublk_device to use a flexible array member for the
   queues field (using DECLARE_FLEX_ARRAY), replacing the separately
   allocated buffer that was reached through the __queues pointer.
   The array of queue pointers is now allocated as part of struct
   ublk_device using struct_size(), which drops one allocation and
   one level of indirection and simplifies memory management (see
   the sketch after this list).
2. Rename __queues to queues, dropping the __ prefix since the field is
now accessed directly throughout the codebase rather than only through
the ublk_get_queue() helper.
3. Remove the queue_size field from struct ublk_device as it is no longer
needed.
4. Move queue allocation and deallocation into ublk_init_queue() and
ublk_deinit_queue() respectively, improving encapsulation. This
simplifies ublk_init_queues() and ublk_deinit_queues() to just
iterate and call the per-queue functions.
5. Add a ublk_get_queue_numa_node() helper that determines the NUMA
   node for a queue by finding the first CPU mapped to that queue via
   tag_set.map[HCTX_TYPE_DEFAULT].mq_map[] and converting that CPU
   with cpu_to_node(). It returns NUMA_NO_NODE if no CPU is mapped to
   the queue, and is called by ublk_init_queue() to determine the
   allocation node.
6. Allocate each queue structure on its local NUMA node using
kvzalloc_node() in ublk_init_queue().
7. Allocate the I/O command buffer on the same NUMA node using
alloc_pages_node().
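To illustrate change (1), here is a minimal, self-contained sketch of
the flexible-array pattern. struct foo, struct bar, and foo_alloc()
are hypothetical names used only for illustration; they are not part
of this patch:

  #include <linux/overflow.h>     /* struct_size() */
  #include <linux/slab.h>         /* kzalloc() */
  #include <linux/stddef.h>       /* DECLARE_FLEX_ARRAY() */

  struct bar;

  struct foo {
          unsigned int nr;
          /* flexible array member; must be last in the struct */
          DECLARE_FLEX_ARRAY(struct bar *, items);
  };

  static struct foo *foo_alloc(unsigned int nr)
  {
          /*
           * struct_size() computes sizeof(struct foo) +
           * nr * sizeof(struct bar *) with overflow checking (it
           * saturates to SIZE_MAX on overflow), so the header and
           * the pointer array come from a single allocation.
           */
          struct foo *f = kzalloc(struct_size(f, items, nr), GFP_KERNEL);

          if (f)
                  f->nr = nr;
          return f;
  }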
Together, these changes reduce memory access latency on multi-socket
NUMA systems by ensuring each queue's data structures are local to
the CPUs that access them.
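The combined lookup-and-allocate path of changes (5)-(7) boils down
to the following pattern; this is a condensed sketch of the code in
the diff below, with error handling elided:

  int node = NUMA_NO_NODE;
  unsigned int cpu;

  /* the first CPU that blk-mq maps to this hw queue picks the node */
  for_each_possible_cpu(cpu) {
          if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id) {
                  node = cpu_to_node(cpu);
                  break;
          }
  }

  /* both allocators treat NUMA_NO_NODE as "no preference" */
  ubq = kvzalloc_node(ubq_size, GFP_KERNEL, node);
  page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size));
  ubq->io_cmd_buf = page_address(page);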
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/block/ublk_drv.c | 84 +++++++++++++++++++++++++---------------
1 file changed, 53 insertions(+), 31 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 2569566bf5e6..394e9b5f512f 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -209,9 +209,6 @@ struct ublk_queue {
struct ublk_device {
struct gendisk *ub_disk;
- char *__queues;
-
- unsigned int queue_size;
struct ublksrv_ctrl_dev_info dev_info;
struct blk_mq_tag_set tag_set;
@@ -239,6 +236,8 @@ struct ublk_device {
bool canceling;
pid_t ublksrv_tgid;
struct delayed_work exit_work;
+
+ DECLARE_FLEX_ARRAY(struct ublk_queue *, queues);
};
/* header of ublk_params */
@@ -781,7 +780,7 @@ static noinline void ublk_put_device(struct ublk_device *ub)
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
int qid)
{
- return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+ return dev->queues[qid];
}
static inline bool ublk_rq_has_data(const struct request *rq)
@@ -2662,9 +2661,13 @@ static const struct file_operations ublk_ch_fops = {
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
- int size = ublk_queue_cmd_buf_size(ub);
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- int i;
+ struct ublk_queue *ubq = ub->queues[q_id];
+ int size, i;
+
+ if (!ubq)
+ return;
+
+ size = ublk_queue_cmd_buf_size(ub);
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -2676,57 +2679,76 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
if (ubq->io_cmd_buf)
free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
+
+ kvfree(ubq);
+ ub->queues[q_id] = NULL;
+}
+
+static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
+{
+ unsigned int cpu;
+
+ /* Find first CPU mapped to this queue */
+ for_each_possible_cpu(cpu) {
+ if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
+ return cpu_to_node(cpu);
+ }
+
+ return NUMA_NO_NODE;
}
static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+ int depth = ub->dev_info.queue_depth;
+ int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
- void *ptr;
+ struct ublk_queue *ubq;
+ struct page *page;
+ int numa_node;
int size;
+ /* Determine NUMA node based on queue's CPU affinity */
+ numa_node = ublk_get_queue_numa_node(ub, q_id);
+
+ /* Allocate queue structure on local NUMA node */
+ ubq = kvzalloc_node(ubq_size, GFP_KERNEL, numa_node);
+ if (!ubq)
+ return -ENOMEM;
+
spin_lock_init(&ubq->cancel_lock);
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
- ubq->q_depth = ub->dev_info.queue_depth;
+ ubq->q_depth = depth;
size = ublk_queue_cmd_buf_size(ub);
- ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
- if (!ptr)
+ /* Allocate I/O command buffer on local NUMA node */
+ page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
+ if (!page) {
+ kvfree(ubq);
return -ENOMEM;
+ }
+ ubq->io_cmd_buf = page_address(page);
- ubq->io_cmd_buf = ptr;
+ ub->queues[q_id] = ubq;
ubq->dev = ub;
return 0;
}
static void ublk_deinit_queues(struct ublk_device *ub)
{
- int nr_queues = ub->dev_info.nr_hw_queues;
int i;
- if (!ub->__queues)
- return;
-
- for (i = 0; i < nr_queues; i++)
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_deinit_queue(ub, i);
- kvfree(ub->__queues);
}
static int ublk_init_queues(struct ublk_device *ub)
{
- int nr_queues = ub->dev_info.nr_hw_queues;
- int depth = ub->dev_info.queue_depth;
- int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
- int i, ret = -ENOMEM;
+ int i, ret;
- ub->queue_size = ubq_size;
- ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
- if (!ub->__queues)
- return ret;
-
- for (i = 0; i < nr_queues; i++) {
- if (ublk_init_queue(ub, i))
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ ret = ublk_init_queue(ub, i);
+ if (ret)
goto fail;
}
@@ -3128,7 +3150,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
goto out_unlock;
ret = -ENOMEM;
- ub = kzalloc(sizeof(*ub), GFP_KERNEL);
+ ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
if (!ub)
goto out_unlock;
mutex_init(&ub->mutex);
--
2.47.0