From: Bernd Schubert <bschubert@ddn.com>
To: Joanne Koong <joannelkoong@gmail.com>, bernd@bsbernd.com
Cc: Miklos Szeredi <miklos@szeredi.hu>,
linux-fsdevel@vger.kernel.org, Luis Henriques <luis@igalia.com>,
Gang He <dchg2000@gmail.com>
Subject: Re: [PATCH v4 8/8] fuse: {io-uring} Prefer the current core over mapping
Date: Wed, 29 Apr 2026 18:11:46 +0200 [thread overview]
Message-ID: <840a41a9-cbb8-4855-985d-d74ffbf317fd@ddn.com> (raw)
In-Reply-To: <CAJnrk1bE7ALGtWdV2Jm8_36gZv1o3AhF9Pdehwr+tTij_b1JYw@mail.gmail.com>
On 4/29/26 17:40, Joanne Koong wrote:
> On Mon, Apr 13, 2026 at 10:41 AM Bernd Schubert via B4 Relay
> <devnull+bernd.bsbernd.com@kernel.org> wrote:
>>
>> From: Bernd Schubert <bschubert@ddn.com>
>>
>> Mapping might point to a totally different core due to
>> random assignment. For performance using the current
>> core might be beneficial
>>
>> Example (with core binding)
>>
>> unpatched WRITE: bw=841MiB/s
>> patched WRITE: bw=1363MiB/s
>>
>> With
>> fio --name=test --ioengine=psync --direct=1 \
>> --rw=write --bs=1M --iodepth=1 --numjobs=1 \
>> --filename_format=/redfs/testfile.\$jobnum --size=100G \
>> --thread --create_on_open=1 --runtime=30s --cpus_allowed=1
>>
>> In order to get the good number `--cpus_allowed=1` is needed.
>> This could be improved by a future change that avoids
>> cpu migration in fuse_request_end() on wake_up() call.
>>
>> Signed-off-by: Bernd Schubert <bernd@bsbernd.com>
>> ---
>> fs/fuse/dev_uring.c | 41 ++++++++++++++++++++++++++++-------------
>> 1 file changed, 28 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>> index e06d45b161d5000e24431314b2222b66bdea58aa..5c00fd047c8bd359ec34fb6a41abba44f6794517 100644
>> --- a/fs/fuse/dev_uring.c
>> +++ b/fs/fuse/dev_uring.c
>> @@ -19,8 +19,12 @@ MODULE_PARM_DESC(enable_uring,
>>
>> #define FUSE_URING_IOV_SEGS 2 /* header and payload */
>>
>> +/* Threshold that determines if a better queue should be searched for */
>> #define FUSE_URING_Q_THRESHOLD 2
>>
>> +/* Number of (re)tries to find a better queue */
>> +#define FUSE_URING_Q_TRIES 3
>> +
>> bool fuse_uring_enabled(void)
>> {
>> return enable_uring;
>> @@ -1311,7 +1315,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring,
>> bool background)
>> {
>> unsigned int qid;
>> - int node, retries = 0;
>> + int node, tries = 0;
>> unsigned int nr_queues;
>> unsigned int cpu = task_cpu(current);
>> struct fuse_ring_queue *queue, *primary_queue = NULL;
>> @@ -1336,26 +1340,36 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring,
>>
>> nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues);
>> if (nr_queues) {
>> + /* prefer the queue that corresponds to the current cpu */
>> + queue = READ_ONCE(ring->queues[cpu]);
>> + if (queue) {
>> + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD)
>> + return queue;
>> + primary_queue = queue;
>> + }
>> +
>> qid = ring->numa_q_map[node].cpu_to_qid[cpu];
>> if (WARN_ON_ONCE(qid >= ring->max_nr_queues))
>> return NULL;
>> - queue = READ_ONCE(ring->queues[qid]);
>> + if (qid != cpu) {
>> + queue = READ_ONCE(ring->queues[qid]);
>>
>> - /* Might happen on teardown */
>> - if (unlikely(!queue))
>> - return NULL;
>> + /* Might happen on teardown */
>> + if (unlikely(!queue))
>> + return NULL;
>>
>> - if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD)
>> - return queue;
>> + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD)
>> + return queue;
>> + }
>>
>> /* Retries help for load balancing */
>> - if (retries < FUSE_URING_Q_THRESHOLD) {
>> - if (!retries)
>> + if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) {
>> + if (!primary_queue)
>> primary_queue = queue;
>>
>> - /* Increase cpu, assuming it will map to a differet qid*/
>> + /* Increase cpu, assuming it will map to a different qid*/
>> cpu++;
>> - retries++;
>> + tries++;
>> goto retry;
>> }
>> }
>> @@ -1366,9 +1380,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring,
>>
>> /* global registered queue bitmap */
>> qid = ring->q_map.cpu_to_qid[cpu];
>> - if (WARN_ON_ONCE(qid >= ring->max_nr_queues))
>> - /* Might happen on teardown */
>> + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) {
>> + /* Might happen on teardown */
>> return NULL;
>> + }
>> return READ_ONCE(ring->queues[qid]);
>> }
>>
>
> This kind of seems like a runtime workaround for the mapping algorithm
> we used in patch 5 for the round-robin distribution.
>
> In patch 5, the mapping logic is:
>
>> +static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid,
>> + struct fuse_queue_map *q_map,
>> + int node)
>> +{
>> + int cpu, qid_idx, mapping_count = 0;
>> + size_t nr_queues;
>> +
>> + cpumask_set_cpu(qid, q_map->registered_q_mask);
>> + nr_queues = cpumask_weight(q_map->registered_q_mask);
>> + for (cpu = 0; cpu < ring->max_nr_queues; cpu++) {
>> + if (node != -1 && cpu_to_node(cpu) != node)
>> + continue;
>> +
>> + qid_idx = mapping_count % nr_queues;
>> + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx,
>> + q_map->registered_q_mask);
>> + mapping_count++;
>> + pr_debug("%s node=%d qid=%d qid_idx=%d nr_queues=%zu %d->%d\n",
>> + __func__, node, qid, qid_idx, nr_queues, cpu,
>> + q_map->cpu_to_qid[cpu]);
>> + }
>> +}
>
> I think if we tweaked the mapping initialization to
>
> if (cpumask_test_cpu(cpu, q_map->registered_q_mask)) {
> q_map->cpu_to_qid[cpu] = cpu;
> } else {
> qid_idx = mapping_count % nr_queues;
> q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx,
> q_map->registered_q_mask);
> mapping_count++;
> }
> ...
>
> then it's guaranteed that a CPU with its own registered queue would
> always map to its own queue, and this patch could be dropped.
I had actually considered that. My idea was that libfuse could override
fuse_uring_select_queue() with eBPF and then enable/disable this
self-queue preference through a userspace config, which is not so easy
if it is encoded into the mapping.
This override is something I wanted to try out, but never got to it so
far. I'm perfectly fine with the simple way for now; we can go the more
complicated route once libfuse has the eBPF override.
Thanks,
Bernd
Thread overview: 38+ messages
2026-04-13 9:41 [PATCH v4 0/8] fuse: {io-uring} Allow to reduce the number of queues and request distribution Bernd Schubert via B4 Relay
2026-04-13 9:41 ` [PATCH v4 1/8] fuse: {io-uring} Add queue length counters Bernd Schubert via B4 Relay
2026-04-13 9:41 ` [PATCH v4 2/8] fuse: {io-uring} Rename ring->nr_queues to max_nr_queues Bernd Schubert via B4 Relay
2026-04-27 15:35 ` Joanne Koong
2026-04-13 9:41 ` [PATCH v4 3/8] fuse: {io-uring} Use bitmaps to track registered queues Bernd Schubert via B4 Relay
2026-04-24 15:04 ` Luis Henriques
2026-04-24 15:33 ` Bernd Schubert
2026-04-27 8:02 ` Luis Henriques
2026-04-27 10:39 ` Bernd Schubert
2026-04-13 9:41 ` [PATCH v4 4/8] fuse: Fetch a queued fuse request on command registration Bernd Schubert via B4 Relay
2026-04-13 9:41 ` [PATCH v4 5/8] fuse: {io-uring} Allow reduced number of ring queues Bernd Schubert via B4 Relay
2026-04-24 15:15 ` Luis Henriques
2026-04-24 18:28 ` Joanne Koong
2026-04-24 22:00 ` Bernd Schubert
2026-04-27 13:10 ` Joanne Koong
2026-04-27 13:49 ` Bernd Schubert
2026-04-27 14:10 ` Joanne Koong
2026-04-27 14:42 ` Bernd Schubert
2026-04-27 15:10 ` Joanne Koong
2026-04-29 16:10 ` Joanne Koong
2026-04-29 16:24 ` Bernd Schubert
2026-04-29 16:32 ` Joanne Koong
2026-04-30 4:16 ` Darrick J. Wong
2026-04-13 9:41 ` [PATCH v4 6/8] fuse: {io-uring} Queue background requests on a different core Bernd Schubert via B4 Relay
2026-04-24 15:26 ` Luis Henriques
2026-04-27 12:08 ` Bernd Schubert
2026-04-29 14:43 ` Joanne Koong
2026-04-29 16:01 ` Bernd Schubert
2026-04-29 16:56 ` Joanne Koong
2026-04-29 20:19 ` Bernd Schubert
2026-04-13 9:41 ` [PATCH v4 7/8] fuse: Add retry attempts for numa local queues for load distribution Bernd Schubert via B4 Relay
2026-04-24 15:28 ` Luis Henriques
2026-04-29 15:03 ` Joanne Koong
2026-04-29 16:07 ` Bernd Schubert
2026-04-13 9:41 ` [PATCH v4 8/8] fuse: {io-uring} Prefer the current core over mapping Bernd Schubert via B4 Relay
2026-04-29 15:40 ` Joanne Koong
2026-04-29 16:11 ` Bernd Schubert [this message]
2026-04-29 16:15 ` [PATCH v4 0/8] fuse: {io-uring} Allow to reduce the number of queues and request distribution Joanne Koong