* [PATCH v1] fuse: support configurable number of uring queues
@ 2025-03-14 20:44 Joanne Koong
2025-03-14 23:11 ` Bernd Schubert
From: Joanne Koong @ 2025-03-14 20:44 UTC
To: miklos, linux-fsdevel; +Cc: bernd.schubert, kernel-team
In the current uring design, the number of queues is equal to the
number of cores on the system. However, on high-scale machines with
hundreds of cores, such a high number of queues is often overkill
and resource-intensive. Moreover, because the queue for a request is
the one for the CPU the task is currently executing on (see
fuse_uring_task_to_queue()), there is no guarantee that requests for
the same file will be sent to the same queue (e.g. if a task is
preempted and moved to a different CPU), which may be problematic
for some servers (e.g. if the server is append-only and does not
support unordered writes).
In this commit, the server can configure the number of uring queues
(passed to the kernel through the init reply). The number of queues
must be a power of two, in order to make queue assignment for a
request efficient. If the server specifies a value that is not a
power of two, it is automatically rounded down to the nearest power
of two. If the server does not specify the number of queues, the
current behavior is kept: the number of queues equals the number of
cores, with core and numa affinity. The queue id is computed as a
hash of the nodeid, which ensures that requests for the same file
will be forwarded to the same queue.
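
For illustration, the mapping can be reproduced in a stand-alone
userspace program; the helpers below are open-coded stand-ins for
the kernel's rounddown_pow_of_two(), ilog2() and hash_long() (on a
64-bit machine). E.g. a server asking for 6 queues ends up with 4
queues and qids in 0..3:

#include <stdint.h>
#include <stdio.h>

static unsigned int rounddown_pow2(unsigned int n)
{
	while (n & (n - 1))
		n &= n - 1;	/* clear the lowest set bit */
	return n;
}

/* multiplicative hash with the same constant as the kernel's hash_64() */
static unsigned int hash_qid(uint64_t nodeid, unsigned int bits)
{
	return (unsigned int)((nodeid * 0x61C8864680B583EBull) >> (64 - bits));
}

int main(void)
{
	unsigned int nr_queues = rounddown_pow2(6);		/* -> 4 */
	unsigned int qid_hash_bits = __builtin_ctz(nr_queues);	/* -> 2 */
	uint64_t nodeid;

	for (nodeid = 1; nodeid <= 4; nodeid++)
		printf("nodeid %llu -> qid %u\n",
		       (unsigned long long)nodeid,
		       hash_qid(nodeid, qid_hash_bits));
	return 0;
}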
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev_uring.c | 48 +++++++++++++++++++++++++++++++++++----
fs/fuse/dev_uring_i.h | 11 +++++++++
fs/fuse/fuse_i.h | 1 +
fs/fuse/inode.c | 4 +++-
include/uapi/linux/fuse.h | 6 ++++-
5 files changed, 63 insertions(+), 7 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 64f1ae308dc4..f173f9e451ac 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -209,9 +209,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
{
struct fuse_ring *ring;
- size_t nr_queues = num_possible_cpus();
+ size_t nr_queues = fc->uring_nr_queues;
struct fuse_ring *res = NULL;
size_t max_payload_size;
+ unsigned int nr_cpus = num_possible_cpus();
ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
if (!ring)
@@ -237,6 +238,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
fc->ring = ring;
ring->nr_queues = nr_queues;
+ if (nr_queues == nr_cpus) {
+ ring->core_affinity = 1;
+ } else {
+ WARN_ON(!nr_queues || nr_queues > nr_cpus ||
+ !is_power_of_2(nr_queues));
+ ring->qid_hash_bits = ilog2(nr_queues);
+ }
ring->fc = fc;
ring->max_payload_sz = max_payload_size;
atomic_set(&ring->queue_refs, 0);
@@ -1217,12 +1225,24 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
fuse_uring_send(ent, cmd, err, issue_flags);
}
-static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
+static unsigned int hash_qid(struct fuse_ring *ring, u64 nodeid)
+{
+ if (ring->nr_queues == 1)
+ return 0;
+
+ return hash_long(nodeid, ring->qid_hash_bits);
+}
+
+static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring,
+ struct fuse_req *req)
{
unsigned int qid;
struct fuse_ring_queue *queue;
- qid = task_cpu(current);
+ if (ring->core_affinity)
+ qid = task_cpu(current);
+ else
+ qid = hash_qid(ring, req->in.h.nodeid);
if (WARN_ONCE(qid >= ring->nr_queues,
"Core number (%u) exceeds nr queues (%zu)\n", qid,
@@ -1253,7 +1273,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
int err;
err = -EINVAL;
- queue = fuse_uring_task_to_queue(ring);
+ queue = fuse_uring_task_to_queue(ring, req);
if (!queue)
goto err;
@@ -1293,7 +1313,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
struct fuse_ring_queue *queue;
struct fuse_ring_ent *ent = NULL;
- queue = fuse_uring_task_to_queue(ring);
+ queue = fuse_uring_task_to_queue(ring, req);
if (!queue)
return false;
@@ -1344,3 +1364,21 @@ static const struct fuse_iqueue_ops fuse_io_uring_ops = {
.send_interrupt = fuse_dev_queue_interrupt,
.send_req = fuse_uring_queue_fuse_req,
};
+
+void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
+{
+ if (!nr_queues) {
+ fc->uring_nr_queues = num_possible_cpus();
+ return;
+ }
+
+ if (!is_power_of_2(nr_queues)) {
+ unsigned int old_nr_queues = nr_queues;
+
+ nr_queues = rounddown_pow_of_two(nr_queues);
+ pr_debug("init: uring_nr_queues=%u is not a power of 2. "
+ "Rounding down uring_nr_queues to %u\n",
+ old_nr_queues, nr_queues);
+ }
+ fc->uring_nr_queues = min(nr_queues, num_possible_cpus());
+}
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index ce823c6b1806..81398b5b8bf2 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -122,6 +122,12 @@ struct fuse_ring {
*/
unsigned int stop_debug_log : 1;
+ /* Each core has its own queue */
+ unsigned int core_affinity : 1;
+
+ /* Only used if core affinity is not set */
+ unsigned int qid_hash_bits;
+
wait_queue_head_t stop_waitq;
/* async tear down */
@@ -143,6 +149,7 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
bool fuse_uring_queue_bq_req(struct fuse_req *req);
bool fuse_uring_request_expired(struct fuse_conn *fc);
+void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues);
static inline void fuse_uring_abort(struct fuse_conn *fc)
{
@@ -200,6 +207,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
return false;
}
+static inline void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
+{
+}
+
#endif /* CONFIG_FUSE_IO_URING */
#endif /* _FS_FUSE_DEV_URING_I_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 38a782673bfd..7c3010bda02d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -962,6 +962,7 @@ struct fuse_conn {
#ifdef CONFIG_FUSE_IO_URING
/** uring connection information*/
struct fuse_ring *ring;
+ uint8_t uring_nr_queues;
#endif
/** Only used if the connection opts into request timeouts */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fd48e8d37f2e..c168247d87f2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1433,8 +1433,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
else
ok = false;
}
- if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
+ if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) {
fc->io_uring = 1;
+ fuse_uring_set_nr_queues(fc, arg->uring_nr_queues);
+ }
if (flags & FUSE_REQUEST_TIMEOUT)
timeout = arg->request_timeout;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 5ec43ecbceb7..0d73b8fcd2be 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -232,6 +232,9 @@
*
* 7.43
* - add FUSE_REQUEST_TIMEOUT
+ *
+ * 7.44
+ * - add uring_nr_queues to fuse_init_out
*/
#ifndef _LINUX_FUSE_H
@@ -915,7 +918,8 @@ struct fuse_init_out {
uint32_t flags2;
uint32_t max_stack_depth;
uint16_t request_timeout;
- uint16_t unused[11];
+ uint8_t uring_nr_queues;
+ uint8_t unused[21];
};
#define CUSE_INIT_INFO_MAX 4096
--
2.47.1
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-03-14 20:44 [PATCH v1] fuse: support configurable number of uring queues Joanne Koong
@ 2025-03-14 23:11 ` Bernd Schubert
2025-03-18 0:55 ` Joanne Koong
From: Bernd Schubert @ 2025-03-14 23:11 UTC
To: Joanne Koong, miklos, linux-fsdevel; +Cc: kernel-team
Thanks Joanne! That is rather close to what I wanted to add,
just a few comments.
On 3/14/25 21:44, Joanne Koong wrote:
> In the current uring design, the number of queues is equal to the
> number of cores on the system. However, on high-scale machines with
> hundreds of cores, such a high number of queues is often overkill
> and resource-intensive. Moreover, because the queue for a request is
> the one for the CPU the task is currently executing on (see
> fuse_uring_task_to_queue()), there is no guarantee that requests for
> the same file will be sent to the same queue (e.g. if a task is
> preempted and moved to a different CPU), which may be problematic
> for some servers (e.g. if the server is append-only and does not
> support unordered writes).
>
> In this commit, the server can configure the number of uring queues
> (passed to the kernel through the init reply). The number of queues
> must be a power of two, in order to make queue assignment for a
> request efficient. If the server specifies a value that is not a
> power of two, it is automatically rounded down to the nearest power
> of two. If the server does not specify the number of queues, the
> current behavior is kept: the number of queues equals the number of
> cores, with core and numa affinity. The queue id is computed as a
> hash of the nodeid, which ensures that requests for the same file
> will be forwarded to the same queue.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> fs/fuse/dev_uring.c | 48 +++++++++++++++++++++++++++++++++++----
> fs/fuse/dev_uring_i.h | 11 +++++++++
> fs/fuse/fuse_i.h | 1 +
> fs/fuse/inode.c | 4 +++-
> include/uapi/linux/fuse.h | 6 ++++-
> 5 files changed, 63 insertions(+), 7 deletions(-)
>
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 64f1ae308dc4..f173f9e451ac 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -209,9 +209,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
> static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> {
> struct fuse_ring *ring;
> - size_t nr_queues = num_possible_cpus();
> + size_t nr_queues = fc->uring_nr_queues;
> struct fuse_ring *res = NULL;
> size_t max_payload_size;
> + unsigned int nr_cpus = num_possible_cpus();
>
> ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
> if (!ring)
> @@ -237,6 +238,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
>
> fc->ring = ring;
> ring->nr_queues = nr_queues;
> + if (nr_queues == nr_cpus) {
> + ring->core_affinity = 1;
> + } else {
> + WARN_ON(!nr_queues || nr_queues > nr_cpus ||
> + !is_power_of_2(nr_queues));
> + ring->qid_hash_bits = ilog2(nr_queues);
> + }
> ring->fc = fc;
> ring->max_payload_sz = max_payload_size;
> atomic_set(&ring->queue_refs, 0);
> @@ -1217,12 +1225,24 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
> fuse_uring_send(ent, cmd, err, issue_flags);
> }
>
> -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
> +static unsigned int hash_qid(struct fuse_ring *ring, u64 nodeid)
> +{
> + if (ring->nr_queues == 1)
> + return 0;
> +
> + return hash_long(nodeid, ring->qid_hash_bits);
> +}
> +
> +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring,
> + struct fuse_req *req)
> {
> unsigned int qid;
> struct fuse_ring_queue *queue;
>
> - qid = task_cpu(current);
> + if (ring->core_affinity)
> + qid = task_cpu(current);
> + else
> + qid = hash_qid(ring, req->in.h.nodeid);
I think we need to handle numa affinity.
>
> if (WARN_ONCE(qid >= ring->nr_queues,
> "Core number (%u) exceeds nr queues (%zu)\n", qid,
> @@ -1253,7 +1273,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
> int err;
>
> err = -EINVAL;
> - queue = fuse_uring_task_to_queue(ring);
> + queue = fuse_uring_task_to_queue(ring, req);
> if (!queue)
> goto err;
>
> @@ -1293,7 +1313,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
> struct fuse_ring_queue *queue;
> struct fuse_ring_ent *ent = NULL;
>
> - queue = fuse_uring_task_to_queue(ring);
> + queue = fuse_uring_task_to_queue(ring, req);
> if (!queue)
> return false;
>
> @@ -1344,3 +1364,21 @@ static const struct fuse_iqueue_ops fuse_io_uring_ops = {
> .send_interrupt = fuse_dev_queue_interrupt,
> .send_req = fuse_uring_queue_fuse_req,
> };
> +
> +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
> +{
> + if (!nr_queues) {
> + fc->uring_nr_queues = num_possible_cpus();
> + return;
> + }
> +
> + if (!is_power_of_2(nr_queues)) {
> + unsigned int old_nr_queues = nr_queues;
> +
> + nr_queues = rounddown_pow_of_two(nr_queues);
> + pr_debug("init: uring_nr_queues=%u is not a power of 2. "
> + "Rounding down uring_nr_queues to %u\n",
> + old_nr_queues, nr_queues);
> + }
> + fc->uring_nr_queues = min(nr_queues, num_possible_cpus());
> +}
> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> index ce823c6b1806..81398b5b8bf2 100644
> --- a/fs/fuse/dev_uring_i.h
> +++ b/fs/fuse/dev_uring_i.h
> @@ -122,6 +122,12 @@ struct fuse_ring {
> */
> unsigned int stop_debug_log : 1;
>
> + /* Each core has its own queue */
> + unsigned int core_affinity : 1;
> +
> + /* Only used if core affinity is not set */
> + unsigned int qid_hash_bits;
> +
> wait_queue_head_t stop_waitq;
>
> /* async tear down */
> @@ -143,6 +149,7 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
> void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
> bool fuse_uring_queue_bq_req(struct fuse_req *req);
> bool fuse_uring_request_expired(struct fuse_conn *fc);
> +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues);
>
> static inline void fuse_uring_abort(struct fuse_conn *fc)
> {
> @@ -200,6 +207,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
> return false;
> }
>
> +static inline void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
> +{
> +}
> +
> #endif /* CONFIG_FUSE_IO_URING */
>
> #endif /* _FS_FUSE_DEV_URING_I_H */
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 38a782673bfd..7c3010bda02d 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -962,6 +962,7 @@ struct fuse_conn {
> #ifdef CONFIG_FUSE_IO_URING
> /** uring connection information*/
> struct fuse_ring *ring;
> + uint8_t uring_nr_queues;
> #endif
>
> /** Only used if the connection opts into request timeouts */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index fd48e8d37f2e..c168247d87f2 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1433,8 +1433,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> else
> ok = false;
> }
> - if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) {
> fc->io_uring = 1;
> + fuse_uring_set_nr_queues(fc, arg->uring_nr_queues);
> + }
>
> if (flags & FUSE_REQUEST_TIMEOUT)
> timeout = arg->request_timeout;
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 5ec43ecbceb7..0d73b8fcd2be 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -232,6 +232,9 @@
> *
> * 7.43
> * - add FUSE_REQUEST_TIMEOUT
> + *
> + * 7.44
> + * - add uring_nr_queues to fuse_init_out
> */
>
> #ifndef _LINUX_FUSE_H
> @@ -915,7 +918,8 @@ struct fuse_init_out {
> uint32_t flags2;
> uint32_t max_stack_depth;
> uint16_t request_timeout;
> - uint16_t unused[11];
> + uint8_t uring_nr_queues;
> + uint8_t unused[21];
I'm a bit scared that uint8_t might not be sufficient at some
point. The largest system we have in the lab has 244 cores. So far
I'm still not sure if we are going to do queue-per-core or
are going to reduce it. That even might become a generic tunable
for us. If we add this value it probably would need to be
uint16_t. Though I wonder if we can do without this variable
and just set initialization to completed once the first
queue has an entry.
Thanks,
Bernd
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-03-14 23:11 ` Bernd Schubert
@ 2025-03-18 0:55 ` Joanne Koong
2025-03-18 10:33 ` Bernd Schubert
From: Joanne Koong @ 2025-03-18 0:55 UTC
To: Bernd Schubert; +Cc: miklos, linux-fsdevel, kernel-team
Hi Bernd,
Thanks for the quick turnaround on the review!
On Fri, Mar 14, 2025 at 4:11 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
> Thanks Joanne! That is rather close to what I wanted to add,
> just a few comments.
>
> On 3/14/25 21:44, Joanne Koong wrote:
> > In the current uring design, the number of queues is equal to the
> > number of cores on the system. However, on high-scale machines with
> > hundreds of cores, such a high number of queues is often overkill
> > and resource-intensive. Moreover, because the queue for a request is
> > the one for the CPU the task is currently executing on (see
> > fuse_uring_task_to_queue()), there is no guarantee that requests for
> > the same file will be sent to the same queue (e.g. if a task is
> > preempted and moved to a different CPU), which may be problematic
> > for some servers (e.g. if the server is append-only and does not
> > support unordered writes).
> >
> > In this commit, the server can configure the number of uring queues
> > (passed to the kernel through the init reply). The number of queues
> > must be a power of two, in order to make queue assignment for a
> > request efficient. If the server specifies a value that is not a
> > power of two, it is automatically rounded down to the nearest power
> > of two. If the server does not specify the number of queues, the
> > current behavior is kept: the number of queues equals the number of
> > cores, with core and numa affinity. The queue id is computed as a
> > hash of the nodeid, which ensures that requests for the same file
> > will be forwarded to the same queue.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> > fs/fuse/dev_uring.c | 48 +++++++++++++++++++++++++++++++++++----
> > fs/fuse/dev_uring_i.h | 11 +++++++++
> > fs/fuse/fuse_i.h | 1 +
> > fs/fuse/inode.c | 4 +++-
> > include/uapi/linux/fuse.h | 6 ++++-
> > 5 files changed, 63 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> > index 64f1ae308dc4..f173f9e451ac 100644
> > --- a/fs/fuse/dev_uring.c
> > +++ b/fs/fuse/dev_uring.c
> > @@ -209,9 +209,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
> > static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> > {
> > struct fuse_ring *ring;
> > - size_t nr_queues = num_possible_cpus();
> > + size_t nr_queues = fc->uring_nr_queues;
> > struct fuse_ring *res = NULL;
> > size_t max_payload_size;
> > + unsigned int nr_cpus = num_possible_cpus();
> >
> > ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
> > if (!ring)
> > @@ -237,6 +238,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> >
> > fc->ring = ring;
> > ring->nr_queues = nr_queues;
> > + if (nr_queues == nr_cpus) {
> > + ring->core_affinity = 1;
> > + } else {
> > + WARN_ON(!nr_queues || nr_queues > nr_cpus ||
> > + !is_power_of_2(nr_queues));
> > + ring->qid_hash_bits = ilog2(nr_queues);
> > + }
> > ring->fc = fc;
> > ring->max_payload_sz = max_payload_size;
> > atomic_set(&ring->queue_refs, 0);
> > @@ -1217,12 +1225,24 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
> > fuse_uring_send(ent, cmd, err, issue_flags);
> > }
> >
> > -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
> > +static unsigned int hash_qid(struct fuse_ring *ring, u64 nodeid)
> > +{
> > + if (ring->nr_queues == 1)
> > + return 0;
> > +
> > + return hash_long(nodeid, ring->qid_hash_bits);
> > +}
> > +
> > +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring,
> > + struct fuse_req *req)
> > {
> > unsigned int qid;
> > struct fuse_ring_queue *queue;
> >
> > - qid = task_cpu(current);
> > + if (ring->core_affinity)
> > + qid = task_cpu(current);
> > + else
> > + qid = hash_qid(ring, req->in.h.nodeid);
>
> I think we need to handle numa affinity.
>
Could you elaborate more on this? I'm not too familiar with how to
enforce this in practice. As I understand it, the main goal of numa
affinity is to make sure a process accesses memory that's physically
close to the CPU it's executing on. How does this usually get
enforced at the kernel level?
> >
> > if (WARN_ONCE(qid >= ring->nr_queues,
> > "Core number (%u) exceeds nr queues (%zu)\n", qid,
> > @@ -1253,7 +1273,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
> > int err;
> >
> > err = -EINVAL;
> > - queue = fuse_uring_task_to_queue(ring);
> > + queue = fuse_uring_task_to_queue(ring, req);
> > if (!queue)
> > goto err;
> >
> > @@ -1293,7 +1313,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
> > struct fuse_ring_queue *queue;
> > struct fuse_ring_ent *ent = NULL;
> >
> > - queue = fuse_uring_task_to_queue(ring);
> > + queue = fuse_uring_task_to_queue(ring, req);
> > if (!queue)
> > return false;
> >
> > @@ -1344,3 +1364,21 @@ static const struct fuse_iqueue_ops fuse_io_uring_ops = {
> > .send_interrupt = fuse_dev_queue_interrupt,
> > .send_req = fuse_uring_queue_fuse_req,
> > };
> > +
> > +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
> > +{
> > + if (!nr_queues) {
> > + fc->uring_nr_queues = num_possible_cpus();
> > + return;
> > + }
> > +
> > + if (!is_power_of_2(nr_queues)) {
> > + unsigned int old_nr_queues = nr_queues;
> > +
> > + nr_queues = rounddown_pow_of_two(nr_queues);
> > + pr_debug("init: uring_nr_queues=%u is not a power of 2. "
> > + "Rounding down uring_nr_queues to %u\n",
> > + old_nr_queues, nr_queues);
> > + }
> > + fc->uring_nr_queues = min(nr_queues, num_possible_cpus());
> > +}
> > diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> > index ce823c6b1806..81398b5b8bf2 100644
> > --- a/fs/fuse/dev_uring_i.h
> > +++ b/fs/fuse/dev_uring_i.h
> > @@ -122,6 +122,12 @@ struct fuse_ring {
> > */
> > unsigned int stop_debug_log : 1;
> >
> > + /* Each core has its own queue */
> > + unsigned int core_affinity : 1;
> > +
> > + /* Only used if core affinity is not set */
> > + unsigned int qid_hash_bits;
> > +
> > wait_queue_head_t stop_waitq;
> >
> > /* async tear down */
> > @@ -143,6 +149,7 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
> > void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
> > bool fuse_uring_queue_bq_req(struct fuse_req *req);
> > bool fuse_uring_request_expired(struct fuse_conn *fc);
> > +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues);
> >
> > static inline void fuse_uring_abort(struct fuse_conn *fc)
> > {
> > @@ -200,6 +207,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
> > return false;
> > }
> >
> > +static inline void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
> > +{
> > +}
> > +
> > #endif /* CONFIG_FUSE_IO_URING */
> >
> > #endif /* _FS_FUSE_DEV_URING_I_H */
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 38a782673bfd..7c3010bda02d 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -962,6 +962,7 @@ struct fuse_conn {
> > #ifdef CONFIG_FUSE_IO_URING
> > /** uring connection information*/
> > struct fuse_ring *ring;
> > + uint8_t uring_nr_queues;
> > #endif
> >
> > /** Only used if the connection opts into request timeouts */
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index fd48e8d37f2e..c168247d87f2 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -1433,8 +1433,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > else
> > ok = false;
> > }
> > - if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) {
> > fc->io_uring = 1;
> > + fuse_uring_set_nr_queues(fc, arg->uring_nr_queues);
> > + }
> >
> > if (flags & FUSE_REQUEST_TIMEOUT)
> > timeout = arg->request_timeout;
> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index 5ec43ecbceb7..0d73b8fcd2be 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -232,6 +232,9 @@
> > *
> > * 7.43
> > * - add FUSE_REQUEST_TIMEOUT
> > + *
> > + * 7.44
> > + * - add uring_nr_queues to fuse_init_out
> > */
> >
> > #ifndef _LINUX_FUSE_H
> > @@ -915,7 +918,8 @@ struct fuse_init_out {
> > uint32_t flags2;
> > uint32_t max_stack_depth;
> > uint16_t request_timeout;
> > - uint16_t unused[11];
> > + uint8_t uring_nr_queues;
> > + uint8_t unused[21];
>
>
> I'm a bit scared that uint8_t might not be sufficient at some
> point. The largest system we have in the lab has 244 cores. So far
> I'm still not sure if we are going to do queue-per-core or
> are going to reduce it. That even might become a generic tunable
> for us. If we add this value it probably would need to be
> uint16_t. Though I wonder if we can do without this variable
> and just set initialization to completed once the first
> queue has an entry.
The only options I could think of for not having it be part of the init were:
a) adding another ioctl call, something like FUSE_IO_URING_CMD_INIT,
where we pass that as an init param before FUSE_IO_URING_CMD_REGISTERs
get called
or
b) adding the nr_queues to fuse_uring_cmd_req (in the padding bits)
for FUSE_IO_URING_CMD_REGISTER calls (rough sketch below)
but I don't think either of these is backwards-compatible, unfortunately.
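
For (b), the rough idea would be the following (hypothetical sketch
only - the exact placement inside the existing padding is
illustrative, it just steals two of the padding bytes):

struct fuse_uring_cmd_req {
	uint64_t flags;

	/* entry identifier for commits */
	uint64_t commit_id;

	/* queue the command is for (queue index) */
	uint16_t qid;
	uint16_t nr_queues;	/* new: total queues the server registers */
	uint8_t padding[4];
};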
I think the issue with setting initialization to completed once the
first queue has an entry is that we need to allocate the queues at
ring creation time, so we need to know nr_queues from the beginning.
If we do go with the init approach, having uring_nr_queues be a
uint16_t sounds reasonable to me.
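
i.e. the tail of fuse_init_out would then become something like this
(sketch only, keeping the struct size unchanged):

	uint32_t	flags2;
	uint32_t	max_stack_depth;
	uint16_t	request_timeout;
	uint16_t	uring_nr_queues;	/* 0 == one queue per core */
	uint16_t	unused[10];
};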
Thanks,
Joanne
>
>
> Thanks,
> Bernd
>
>
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-03-18 0:55 ` Joanne Koong
@ 2025-03-18 10:33 ` Bernd Schubert
2025-03-18 23:16 ` Joanne Koong
From: Bernd Schubert @ 2025-03-18 10:33 UTC
To: Joanne Koong; +Cc: miklos, linux-fsdevel, kernel-team
On 3/18/25 01:55, Joanne Koong wrote:
> Hi Bernd,
> Thanks for the quick turnaround on the review!
>
> On Fri, Mar 14, 2025 at 4:11 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>> Thanks Joanne! That is rather close to what I wanted to add,
>> just a few comments.
>>
>> On 3/14/25 21:44, Joanne Koong wrote:
>>> In the current uring design, the number of queues is equal to the
>>> number of cores on the system. However, on high-scale machines with
>>> hundreds of cores, such a high number of queues is often overkill
>>> and resource-intensive. Moreover, because the queue for a request is
>>> the one for the CPU the task is currently executing on (see
>>> fuse_uring_task_to_queue()), there is no guarantee that requests for
>>> the same file will be sent to the same queue (e.g. if a task is
>>> preempted and moved to a different CPU), which may be problematic
>>> for some servers (e.g. if the server is append-only and does not
>>> support unordered writes).
>>>
>>> In this commit, the server can configure the number of uring queues
>>> (passed to the kernel through the init reply). The number of queues
>>> must be a power of two, in order to make queue assignment for a
>>> request efficient. If the server specifies a value that is not a
>>> power of two, it is automatically rounded down to the nearest power
>>> of two. If the server does not specify the number of queues, the
>>> current behavior is kept: the number of queues equals the number of
>>> cores, with core and numa affinity. The queue id is computed as a
>>> hash of the nodeid, which ensures that requests for the same file
>>> will be forwarded to the same queue.
>>>
>>> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
>>> ---
>>> fs/fuse/dev_uring.c | 48 +++++++++++++++++++++++++++++++++++----
>>> fs/fuse/dev_uring_i.h | 11 +++++++++
>>> fs/fuse/fuse_i.h | 1 +
>>> fs/fuse/inode.c | 4 +++-
>>> include/uapi/linux/fuse.h | 6 ++++-
>>> 5 files changed, 63 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>>> index 64f1ae308dc4..f173f9e451ac 100644
>>> --- a/fs/fuse/dev_uring.c
>>> +++ b/fs/fuse/dev_uring.c
>>> @@ -209,9 +209,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
>>> static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
>>> {
>>> struct fuse_ring *ring;
>>> - size_t nr_queues = num_possible_cpus();
>>> + size_t nr_queues = fc->uring_nr_queues;
>>> struct fuse_ring *res = NULL;
>>> size_t max_payload_size;
>>> + unsigned int nr_cpus = num_possible_cpus();
>>>
>>> ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
>>> if (!ring)
>>> @@ -237,6 +238,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
>>>
>>> fc->ring = ring;
>>> ring->nr_queues = nr_queues;
>>> + if (nr_queues == nr_cpus) {
>>> + ring->core_affinity = 1;
>>> + } else {
>>> + WARN_ON(!nr_queues || nr_queues > nr_cpus ||
>>> + !is_power_of_2(nr_queues));
>>> + ring->qid_hash_bits = ilog2(nr_queues);
>>> + }
>>> ring->fc = fc;
>>> ring->max_payload_sz = max_payload_size;
>>> atomic_set(&ring->queue_refs, 0);
>>> @@ -1217,12 +1225,24 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
>>> fuse_uring_send(ent, cmd, err, issue_flags);
>>> }
>>>
>>> -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
>>> +static unsigned int hash_qid(struct fuse_ring *ring, u64 nodeid)
>>> +{
>>> + if (ring->nr_queues == 1)
>>> + return 0;
>>> +
>>> + return hash_long(nodeid, ring->qid_hash_bits);
>>> +}
>>> +
>>> +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring,
>>> + struct fuse_req *req)
>>> {
>>> unsigned int qid;
>>> struct fuse_ring_queue *queue;
>>>
>>> - qid = task_cpu(current);
>>> + if (ring->core_affinity)
>>> + qid = task_cpu(current);
>>> + else
>>> + qid = hash_qid(ring, req->in.h.nodeid);
>>
>> I think we need to handle numa affinity.
>>
>
> Could you elaborate more on this? I'm not too familiar with how to
> enforce this in practice. As I understand it, the main goal of numa
> affinity is to make sure a process accesses memory that's physically
> close to the CPU it's executing on. How does this usually get
> enforced at the kernel level?
The request comes in on a specific core, and that core is on a numa
node - we should try to avoid switching nodes. If there is no queue
for the current core, we should try to stay on the same numa node.
We should probably also consider the number of waiting requests per
queue and distribute based on that, although that is a bit
independent.
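
Roughly something like the following, as a sketch only (the helper
name is made up and it assumes the default layout where qid matches
the cpu number for the first nr_queues cpus):

static unsigned int fuse_uring_numa_qid(struct fuse_ring *ring)
{
	unsigned int cpu = task_cpu(current);
	unsigned int qid;

	if (cpu < ring->nr_queues)
		return cpu;	/* queue on the submitting core */

	/* no queue for this core - try a queue on the same numa node */
	for (qid = 0; qid < ring->nr_queues; qid++)
		if (cpu_to_node(qid) == cpu_to_node(cpu))
			return qid;

	return cpu % ring->nr_queues;	/* last resort */
}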
>
>>>
>>> if (WARN_ONCE(qid >= ring->nr_queues,
>>> "Core number (%u) exceeds nr queues (%zu)\n", qid,
>>> @@ -1253,7 +1273,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
>>> int err;
>>>
>>> err = -EINVAL;
>>> - queue = fuse_uring_task_to_queue(ring);
>>> + queue = fuse_uring_task_to_queue(ring, req);
>>> if (!queue)
>>> goto err;
>>>
>>> @@ -1293,7 +1313,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
>>> struct fuse_ring_queue *queue;
>>> struct fuse_ring_ent *ent = NULL;
>>>
>>> - queue = fuse_uring_task_to_queue(ring);
>>> + queue = fuse_uring_task_to_queue(ring, req);
>>> if (!queue)
>>> return false;
>>>
>>> @@ -1344,3 +1364,21 @@ static const struct fuse_iqueue_ops fuse_io_uring_ops = {
>>> .send_interrupt = fuse_dev_queue_interrupt,
>>> .send_req = fuse_uring_queue_fuse_req,
>>> };
>>> +
>>> +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
>>> +{
>>> + if (!nr_queues) {
>>> + fc->uring_nr_queues = num_possible_cpus();
>>> + return;
>>> + }
>>> +
>>> + if (!is_power_of_2(nr_queues)) {
>>> + unsigned int old_nr_queues = nr_queues;
>>> +
>>> + nr_queues = rounddown_pow_of_two(nr_queues);
>>> + pr_debug("init: uring_nr_queues=%u is not a power of 2. "
>>> + "Rounding down uring_nr_queues to %u\n",
>>> + old_nr_queues, nr_queues);
>>> + }
>>> + fc->uring_nr_queues = min(nr_queues, num_possible_cpus());
>>> +}
>>> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
>>> index ce823c6b1806..81398b5b8bf2 100644
>>> --- a/fs/fuse/dev_uring_i.h
>>> +++ b/fs/fuse/dev_uring_i.h
>>> @@ -122,6 +122,12 @@ struct fuse_ring {
>>> */
>>> unsigned int stop_debug_log : 1;
>>>
>>> + /* Each core has its own queue */
>>> + unsigned int core_affinity : 1;
>>> +
>>> + /* Only used if core affinity is not set */
>>> + unsigned int qid_hash_bits;
>>> +
>>> wait_queue_head_t stop_waitq;
>>>
>>> /* async tear down */
>>> @@ -143,6 +149,7 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
>>> void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
>>> bool fuse_uring_queue_bq_req(struct fuse_req *req);
>>> bool fuse_uring_request_expired(struct fuse_conn *fc);
>>> +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues);
>>>
>>> static inline void fuse_uring_abort(struct fuse_conn *fc)
>>> {
>>> @@ -200,6 +207,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
>>> return false;
>>> }
>>>
>>> +static inline void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
>>> +{
>>> +}
>>> +
>>> #endif /* CONFIG_FUSE_IO_URING */
>>>
>>> #endif /* _FS_FUSE_DEV_URING_I_H */
>>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>>> index 38a782673bfd..7c3010bda02d 100644
>>> --- a/fs/fuse/fuse_i.h
>>> +++ b/fs/fuse/fuse_i.h
>>> @@ -962,6 +962,7 @@ struct fuse_conn {
>>> #ifdef CONFIG_FUSE_IO_URING
>>> /** uring connection information*/
>>> struct fuse_ring *ring;
>>> + uint8_t uring_nr_queues;
>>> #endif
>>>
>>> /** Only used if the connection opts into request timeouts */
>>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
>>> index fd48e8d37f2e..c168247d87f2 100644
>>> --- a/fs/fuse/inode.c
>>> +++ b/fs/fuse/inode.c
>>> @@ -1433,8 +1433,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
>>> else
>>> ok = false;
>>> }
>>> - if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
>>> + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) {
>>> fc->io_uring = 1;
>>> + fuse_uring_set_nr_queues(fc, arg->uring_nr_queues);
>>> + }
>>>
>>> if (flags & FUSE_REQUEST_TIMEOUT)
>>> timeout = arg->request_timeout;
>>> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
>>> index 5ec43ecbceb7..0d73b8fcd2be 100644
>>> --- a/include/uapi/linux/fuse.h
>>> +++ b/include/uapi/linux/fuse.h
>>> @@ -232,6 +232,9 @@
>>> *
>>> * 7.43
>>> * - add FUSE_REQUEST_TIMEOUT
>>> + *
>>> + * 7.44
>>> + * - add uring_nr_queues to fuse_init_out
>>> */
>>>
>>> #ifndef _LINUX_FUSE_H
>>> @@ -915,7 +918,8 @@ struct fuse_init_out {
>>> uint32_t flags2;
>>> uint32_t max_stack_depth;
>>> uint16_t request_timeout;
>>> - uint16_t unused[11];
>>> + uint8_t uring_nr_queues;
>>> + uint8_t unused[21];
>>
>>
>> I'm a bit scared that uint8_t might not be sufficient at some
>> point. The largest system we have in the lab has 244 cores. So far
>> I'm still not sure if we are going to do queue-per-core or
>> are going to reduce it. That even might become a generic tunable
>> for us. If we add this value it probably would need to be
>> uint16_t. Though I wonder if we can do without this variable
>> and just set initialization to completed once the first
>> queue has an entry.
>
> The only options I could think of for not having it be part of the init were:
> a) adding another ioctl call, something like FUSE_IO_URING_CMD_INIT,
> where we pass that as an init param before FUSE_IO_URING_CMD_REGISTERs
> get called
> or
> b) adding the nr_queues to fuse_uring_cmd_req (in the padding bits)
> for FUSE_IO_URING_CMD_REGISTER calls
>
> but I don't think either of these is backwards-compatible, unfortunately.
>
> I think the issue with setting initialization to completed once the
> first queue has an entry is that we need to allocate the queues at
> ring creation time, so we need to know nr_queues from the beginning.
>
> If we do go with the init approach, having uring_nr_queues be a
> uint16_t sounds reasonable to me.
I will take an hour a bit later today and try to update the patch.
Thanks,
Bernd
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-03-18 10:33 ` Bernd Schubert
@ 2025-03-18 23:16 ` Joanne Koong
2025-04-01 0:22 ` Joanne Koong
From: Joanne Koong @ 2025-03-18 23:16 UTC
To: Bernd Schubert; +Cc: miklos, linux-fsdevel, kernel-team
On Tue, Mar 18, 2025 at 3:33 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 3/18/25 01:55, Joanne Koong wrote:
> > Hi Bernd,
> > Thanks for the quick turnaround on the review!
> >
> > On Fri, Mar 14, 2025 at 4:11 PM Bernd Schubert
> > <bernd.schubert@fastmail.fm> wrote:
> >>
> >> Thanks Joanne! That is rather close to what I wanted to add,
> >> just a few comments.
> >>
> >> On 3/14/25 21:44, Joanne Koong wrote:
> >>> In the current uring design, the number of queues is equal to the
> >>> number of cores on the system. However, on high-scale machines with
> >>> hundreds of cores, such a high number of queues is often overkill
> >>> and resource-intensive. Moreover, because the queue for a request is
> >>> the one for the CPU the task is currently executing on (see
> >>> fuse_uring_task_to_queue()), there is no guarantee that requests for
> >>> the same file will be sent to the same queue (e.g. if a task is
> >>> preempted and moved to a different CPU), which may be problematic
> >>> for some servers (e.g. if the server is append-only and does not
> >>> support unordered writes).
> >>>
> >>> In this commit, the server can configure the number of uring queues
> >>> (passed to the kernel through the init reply). The number of queues
> >>> must be a power of two, in order to make queue assignment for a
> >>> request efficient. If the server specifies a value that is not a
> >>> power of two, it is automatically rounded down to the nearest power
> >>> of two. If the server does not specify the number of queues, the
> >>> current behavior is kept: the number of queues equals the number of
> >>> cores, with core and numa affinity. The queue id is computed as a
> >>> hash of the nodeid, which ensures that requests for the same file
> >>> will be forwarded to the same queue.
> >>>
> >>> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> >>> ---
> >>> fs/fuse/dev_uring.c | 48 +++++++++++++++++++++++++++++++++++----
> >>> fs/fuse/dev_uring_i.h | 11 +++++++++
> >>> fs/fuse/fuse_i.h | 1 +
> >>> fs/fuse/inode.c | 4 +++-
> >>> include/uapi/linux/fuse.h | 6 ++++-
> >>> 5 files changed, 63 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> >>> index 64f1ae308dc4..f173f9e451ac 100644
> >>> --- a/fs/fuse/dev_uring.c
> >>> +++ b/fs/fuse/dev_uring.c
> >>> @@ -209,9 +209,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
> >>> static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> >>> {
> >>> struct fuse_ring *ring;
> >>> - size_t nr_queues = num_possible_cpus();
> >>> + size_t nr_queues = fc->uring_nr_queues;
> >>> struct fuse_ring *res = NULL;
> >>> size_t max_payload_size;
> >>> + unsigned int nr_cpus = num_possible_cpus();
> >>>
> >>> ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
> >>> if (!ring)
> >>> @@ -237,6 +238,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> >>>
> >>> fc->ring = ring;
> >>> ring->nr_queues = nr_queues;
> >>> + if (nr_queues == nr_cpus) {
> >>> + ring->core_affinity = 1;
> >>> + } else {
> >>> + WARN_ON(!nr_queues || nr_queues > nr_cpus ||
> >>> + !is_power_of_2(nr_queues));
> >>> + ring->qid_hash_bits = ilog2(nr_queues);
> >>> + }
> >>> ring->fc = fc;
> >>> ring->max_payload_sz = max_payload_size;
> >>> atomic_set(&ring->queue_refs, 0);
> >>> @@ -1217,12 +1225,24 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
> >>> fuse_uring_send(ent, cmd, err, issue_flags);
> >>> }
> >>>
> >>> -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
> >>> +static unsigned int hash_qid(struct fuse_ring *ring, u64 nodeid)
> >>> +{
> >>> + if (ring->nr_queues == 1)
> >>> + return 0;
> >>> +
> >>> + return hash_long(nodeid, ring->qid_hash_bits);
> >>> +}
> >>> +
> >>> +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring,
> >>> + struct fuse_req *req)
> >>> {
> >>> unsigned int qid;
> >>> struct fuse_ring_queue *queue;
> >>>
> >>> - qid = task_cpu(current);
> >>> + if (ring->core_affinity)
> >>> + qid = task_cpu(current);
> >>> + else
> >>> + qid = hash_qid(ring, req->in.h.nodeid);
> >>
> >> I think we need to handle numa affinity.
> >>
> >
> > Could you elaborate more on this? I'm not too familiar with how to
> > enforce this in practice. As I understand it, the main goal of numa
> > affinity is to make sure a process accesses memory that's physically
> > close to the CPU it's executing on. How does this usually get
> > enforced at the kernel level?
>
> The request comes in on a specific core, and that core is on a numa
> node - we should try to avoid switching nodes. If there is no queue
> for the current core, we should try to stay on the same numa node.
> We should probably also consider the number of waiting requests per
> queue and distribute based on that, although that is a bit
> independent.
>
In that case, there's no guarantee that requests for the same file
will get sent to the same queue. But thinking more about this, maybe
it doesn't matter after all if they're sent to different queues. I
need to think some more about this. But I agree, if we don't care
about requests for the same inode getting routed to the same queue,
then we should aim for numa affinity. I'll look more into this.
Thanks,
Joanne
> >
> >>>
> >>> if (WARN_ONCE(qid >= ring->nr_queues,
> >>> "Core number (%u) exceeds nr queues (%zu)\n", qid,
> >>> @@ -1253,7 +1273,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
> >>> int err;
> >>>
> >>> err = -EINVAL;
> >>> - queue = fuse_uring_task_to_queue(ring);
> >>> + queue = fuse_uring_task_to_queue(ring, req);
> >>> if (!queue)
> >>> goto err;
> >>>
> >>> @@ -1293,7 +1313,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
> >>> struct fuse_ring_queue *queue;
> >>> struct fuse_ring_ent *ent = NULL;
> >>>
> >>> - queue = fuse_uring_task_to_queue(ring);
> >>> + queue = fuse_uring_task_to_queue(ring, req);
> >>> if (!queue)
> >>> return false;
> >>>
> >>> @@ -1344,3 +1364,21 @@ static const struct fuse_iqueue_ops fuse_io_uring_ops = {
> >>> .send_interrupt = fuse_dev_queue_interrupt,
> >>> .send_req = fuse_uring_queue_fuse_req,
> >>> };
> >>> +
> >>> +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
> >>> +{
> >>> + if (!nr_queues) {
> >>> + fc->uring_nr_queues = num_possible_cpus();
> >>> + return;
> >>> + }
> >>> +
> >>> + if (!is_power_of_2(nr_queues)) {
> >>> + unsigned int old_nr_queues = nr_queues;
> >>> +
> >>> + nr_queues = rounddown_pow_of_two(nr_queues);
> >>> + pr_debug("init: uring_nr_queues=%u is not a power of 2. "
> >>> + "Rounding down uring_nr_queues to %u\n",
> >>> + old_nr_queues, nr_queues);
> >>> + }
> >>> + fc->uring_nr_queues = min(nr_queues, num_possible_cpus());
> >>> +}
> >>> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> >>> index ce823c6b1806..81398b5b8bf2 100644
> >>> --- a/fs/fuse/dev_uring_i.h
> >>> +++ b/fs/fuse/dev_uring_i.h
> >>> @@ -122,6 +122,12 @@ struct fuse_ring {
> >>> */
> >>> unsigned int stop_debug_log : 1;
> >>>
> >>> + /* Each core has its own queue */
> >>> + unsigned int core_affinity : 1;
> >>> +
> >>> + /* Only used if core affinity is not set */
> >>> + unsigned int qid_hash_bits;
> >>> +
> >>> wait_queue_head_t stop_waitq;
> >>>
> >>> /* async tear down */
> >>> @@ -143,6 +149,7 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
> >>> void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
> >>> bool fuse_uring_queue_bq_req(struct fuse_req *req);
> >>> bool fuse_uring_request_expired(struct fuse_conn *fc);
> >>> +void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues);
> >>>
> >>> static inline void fuse_uring_abort(struct fuse_conn *fc)
> >>> {
> >>> @@ -200,6 +207,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
> >>> return false;
> >>> }
> >>>
> >>> +static inline void fuse_uring_set_nr_queues(struct fuse_conn *fc, unsigned int nr_queues)
> >>> +{
> >>> +}
> >>> +
> >>> #endif /* CONFIG_FUSE_IO_URING */
> >>>
> >>> #endif /* _FS_FUSE_DEV_URING_I_H */
> >>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> >>> index 38a782673bfd..7c3010bda02d 100644
> >>> --- a/fs/fuse/fuse_i.h
> >>> +++ b/fs/fuse/fuse_i.h
> >>> @@ -962,6 +962,7 @@ struct fuse_conn {
> >>> #ifdef CONFIG_FUSE_IO_URING
> >>> /** uring connection information*/
> >>> struct fuse_ring *ring;
> >>> + uint8_t uring_nr_queues;
> >>> #endif
> >>>
> >>> /** Only used if the connection opts into request timeouts */
> >>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> >>> index fd48e8d37f2e..c168247d87f2 100644
> >>> --- a/fs/fuse/inode.c
> >>> +++ b/fs/fuse/inode.c
> >>> @@ -1433,8 +1433,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> >>> else
> >>> ok = false;
> >>> }
> >>> - if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> >>> + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) {
> >>> fc->io_uring = 1;
> >>> + fuse_uring_set_nr_queues(fc, arg->uring_nr_queues);
> >>> + }
> >>>
> >>> if (flags & FUSE_REQUEST_TIMEOUT)
> >>> timeout = arg->request_timeout;
> >>> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> >>> index 5ec43ecbceb7..0d73b8fcd2be 100644
> >>> --- a/include/uapi/linux/fuse.h
> >>> +++ b/include/uapi/linux/fuse.h
> >>> @@ -232,6 +232,9 @@
> >>> *
> >>> * 7.43
> >>> * - add FUSE_REQUEST_TIMEOUT
> >>> + *
> >>> + * 7.44
> >>> + * - add uring_nr_queues to fuse_init_out
> >>> */
> >>>
> >>> #ifndef _LINUX_FUSE_H
> >>> @@ -915,7 +918,8 @@ struct fuse_init_out {
> >>> uint32_t flags2;
> >>> uint32_t max_stack_depth;
> >>> uint16_t request_timeout;
> >>> - uint16_t unused[11];
> >>> + uint8_t uring_nr_queues;
> >>> + uint8_t unused[21];
> >>
> >>
> >> I'm a bit scared that uint8_t might not be sufficient at some
> >> point. The largest system we have in the lab has 244 cores. So far
> >> I'm still not sure if we are going to do queue-per-core or
> >> are going to reduce it. That even might become a generic tunable
> >> for us. If we add this value it probably would need to be
> >> uint16_t. Though I wonder if we can do without this variable
> >> and just set initialization to completed once the first
> >> queue has an entry.
> >
> > The only options I could think of for not having it be part of the init were:
> > a) adding another ioctl call, something like FUSE_IO_URING_CMD_INIT,
> > where we pass that as an init param before FUSE_IO_URING_CMD_REGISTERs
> > get called
> > or
> > b) adding the nr_queues to fuse_uring_cmd_req (in the padding bits)
> > for FUSE_IO_URING_CMD_REGISTER calls
> >
> > but I don't think either of these is backwards-compatible, unfortunately.
> >
> > I think the issue with setting initialization to completed once the
> > first queue has an entry is that we need to allocate the queues at
> > ring creation time, so we need to know nr_queues from the beginning.
> >
> > If we do go with the init approach, having uring_nr_queues be a
> > uint16_t sounds reasonable to me.
>
> I will take an hour a bit later today and try to update the patch.
>
>
> Thanks,
> Bernd
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-03-18 23:16 ` Joanne Koong
@ 2025-04-01 0:22 ` Joanne Koong
2025-04-01 17:06 ` Bernd Schubert
From: Joanne Koong @ 2025-04-01 0:22 UTC
To: Bernd Schubert; +Cc: miklos, linux-fsdevel, kernel-team
On Tue, Mar 18, 2025 at 4:16 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Tue, Mar 18, 2025 at 3:33 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
> >
> > On 3/18/25 01:55, Joanne Koong wrote:
> > > Hi Bernd,
> > > Thanks for the quick turnaround on the review!
> > >
> > > On Fri, Mar 14, 2025 at 4:11 PM Bernd Schubert
> > > <bernd.schubert@fastmail.fm> wrote:
> > >>
> > >> Thanks Joanne! That is rather close to what I wanted to add,
> > >> just a few comments.
> > >>
> > >> On 3/14/25 21:44, Joanne Koong wrote:
> > >>> In the current uring design, the number of queues is equal to the
> > >>> number of cores on the system. However, on high-scale machines with
> > >>> hundreds of cores, such a high number of queues is often overkill
> > >>> and resource-intensive. Moreover, because the queue for a request is
> > >>> the one for the CPU the task is currently executing on (see
> > >>> fuse_uring_task_to_queue()), there is no guarantee that requests for
> > >>> the same file will be sent to the same queue (e.g. if a task is
> > >>> preempted and moved to a different CPU), which may be problematic
> > >>> for some servers (e.g. if the server is append-only and does not
> > >>> support unordered writes).
> > >>>
> > >>> In this commit, the server can configure the number of uring queues
> > >>> (passed to the kernel through the init reply). The number of queues
> > >>> must be a power of two, in order to make queue assignment for a
> > >>> request efficient. If the server specifies a value that is not a
> > >>> power of two, it is automatically rounded down to the nearest power
> > >>> of two. If the server does not specify the number of queues, the
> > >>> current behavior is kept: the number of queues equals the number of
> > >>> cores, with core and numa affinity. The queue id is computed as a
> > >>> hash of the nodeid, which ensures that requests for the same file
> > >>> will be forwarded to the same queue.
> > >>>
> > >>> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > >>> ---
> > >>> fs/fuse/dev_uring.c | 48 +++++++++++++++++++++++++++++++++++----
> > >>> fs/fuse/dev_uring_i.h | 11 +++++++++
> > >>> fs/fuse/fuse_i.h | 1 +
> > >>> fs/fuse/inode.c | 4 +++-
> > >>> include/uapi/linux/fuse.h | 6 ++++-
> > >>> 5 files changed, 63 insertions(+), 7 deletions(-)
> > >>>
> > >>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> > >>> index 64f1ae308dc4..f173f9e451ac 100644
> > >>> --- a/fs/fuse/dev_uring.c
> > >>> +++ b/fs/fuse/dev_uring.c
> > >>> @@ -209,9 +209,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
> > >>> static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> > >>> {
> > >>> struct fuse_ring *ring;
> > >>> - size_t nr_queues = num_possible_cpus();
> > >>> + size_t nr_queues = fc->uring_nr_queues;
> > >>> struct fuse_ring *res = NULL;
> > >>> size_t max_payload_size;
> > >>> + unsigned int nr_cpus = num_possible_cpus();
> > >>>
> > >>> ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
> > >>> if (!ring)
> > >>> @@ -237,6 +238,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
> > >>>
> > >>> fc->ring = ring;
> > >>> ring->nr_queues = nr_queues;
> > >>> + if (nr_queues == nr_cpus) {
> > >>> + ring->core_affinity = 1;
> > >>> + } else {
> > >>> + WARN_ON(!nr_queues || nr_queues > nr_cpus ||
> > >>> + !is_power_of_2(nr_queues));
> > >>> + ring->qid_hash_bits = ilog2(nr_queues);
> > >>> + }
> > >>> ring->fc = fc;
> > >>> ring->max_payload_sz = max_payload_size;
> > >>> atomic_set(&ring->queue_refs, 0);
> > >>> @@ -1217,12 +1225,24 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
> > >>> fuse_uring_send(ent, cmd, err, issue_flags);
> > >>> }
> > >>>
> > >>> -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
> > >>> +static unsigned int hash_qid(struct fuse_ring *ring, u64 nodeid)
> > >>> +{
> > >>> + if (ring->nr_queues == 1)
> > >>> + return 0;
> > >>> +
> > >>> + return hash_long(nodeid, ring->qid_hash_bits);
> > >>> +}
> > >>> +
> > >>> +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring,
> > >>> + struct fuse_req *req)
> > >>> {
> > >>> unsigned int qid;
> > >>> struct fuse_ring_queue *queue;
> > >>>
> > >>> - qid = task_cpu(current);
> > >>> + if (ring->core_affinity)
> > >>> + qid = task_cpu(current);
> > >>> + else
> > >>> + qid = hash_qid(ring, req->in.h.nodeid);
> > >>
> > >> I think we need to handle numa affinity.
> > >>
> > >
> > > Could you elaborate more on this? I'm not too familiar with how to
> > > enforce this in practice. As I understand it, the main goal of numa
> > > affinity is to make sure a process accesses memory that's physically
> > > close to the CPU it's executing on. How does this usually get
> > > enforced at the kernel level?
> >
> > The request comes in on a specific core, and that core is on a numa
> > node - we should try to avoid switching nodes. If there is no queue
> > for the current core, we should try to stay on the same numa node.
> > We should probably also consider the number of waiting requests per
> > queue and distribute based on that, although that is a bit
> > independent.
> >
>
> In that case, there's no guarantee that requests for the same file
> will get sent to the same queue. But thinking more about this, maybe
> it doesn't matter after all if they're sent to different queues. I
> need to think some more about this. But I agree, if we don't care
> about requests for the same inode getting routed to the same queue,
> then we should aim for numa affinity. I'll look more into this.
>
Thought about this some more... is this even worth doing? AFAICT,
there's no guarantee that CPUs are distributed evenly across numa
nodes. For example, one numa node may have CPUs 0 to 5 on it, while
another numa node might have CPUs 6 and 7. If there are two queues,
each associated with a numa node, then requests will be
disproportionately / unevenly allocated; e.g. most of the workload
will be queued on the numa node with CPUs 0 to 5. Moreover, I don't
think there's a good way to enforce this in the cases where the
number of queues < the number of numa nodes. For example, if there
are 3 numa nodes with, say, 3 CPUs each and there are 2 queues, the
logic for which CPU gets sent to which queue gets a little messy.
imo, this is an optimization that could be added in the future if
the need for it comes up. WDYT?
Thanks,
Joanne
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-04-01 0:22 ` Joanne Koong
@ 2025-04-01 17:06 ` Bernd Schubert
2025-06-04 17:57 ` Bernd Schubert
0 siblings, 1 reply; 8+ messages in thread
From: Bernd Schubert @ 2025-04-01 17:06 UTC (permalink / raw)
To: Joanne Koong; +Cc: miklos, linux-fsdevel, kernel-team
On 4/1/25 02:22, Joanne Koong wrote:
> Thought about this some more... is this even worth doing? AFAICT,
> there's no guarantee that CPUs are distributed evenly across numa
> nodes. For example, one numa node may have CPUs 0 to 5 on it, while
> another numa node might have only CPUs 6 and 7. If there are two
> queues, each associated with a numa node, then requests will be
> allocated disproportionately: most of the workload will be queued on
> the numa node with CPUs 0 to 5. Moreover, I don't think there's a good
> way to enforce numa affinity when the number of queues is less than
> the number of numa nodes. For example, if there are 3 numa nodes with
> 3 CPUs each and only 2 queues, the logic for which cpu gets sent to
> which queue gets messy.
>
> imo, this is an optimization that could be added in the future if the
> need for this comes up. WDYT?
I will get to this sometime this week. My plan is to use queue
lengths for distribution. We should do that for background requests
anyway. I.e., first try the local core; if its queue length is too
large or it has no queue, try queues within the same numa domain; and
if all of those are busy, check whether foreign queues are more
suitable.
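As a completely untested sketch of that tiered selection (queue_len()
and QUEUE_LEN_BUSY are invented names, and I'm assuming qid == cpu for
the per-core case and the ring->queues array from the patch):

/* Untested sketch: local core first, then same numa node, then any. */
static struct fuse_ring_queue *pick_queue(struct fuse_ring *ring)
{
	unsigned int cpu = task_cpu(current);
	int node = cpu_to_node(cpu);
	struct fuse_ring_queue *q, *best = NULL;
	unsigned int qid;

	/* 1) the local core's queue, if it exists and is not overloaded */
	if (cpu < ring->nr_queues) {
		q = ring->queues[cpu];
		if (q && queue_len(q) < QUEUE_LEN_BUSY)
			return q;
	}

	/* 2) the least loaded queue on the same numa node */
	for (qid = 0; qid < ring->nr_queues; qid++) {
		q = ring->queues[qid];
		if (!q || cpu_to_node(qid) != node)
			continue;
		if (!best || queue_len(q) < queue_len(best))
			best = q;
	}
	if (best && queue_len(best) < QUEUE_LEN_BUSY)
		return best;

	/* 3) otherwise the least loaded queue anywhere */
	for (qid = 0; qid < ring->nr_queues; qid++) {
		q = ring->queues[qid];
		if (q && (!best || queue_len(q) < queue_len(best)))
			best = q;
	}
	return best;
}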
Thanks,
Bernd
* Re: [PATCH v1] fuse: support configurable number of uring queues
2025-04-01 17:06 ` Bernd Schubert
@ 2025-06-04 17:57 ` Bernd Schubert
0 siblings, 0 replies; 8+ messages in thread
From: Bernd Schubert @ 2025-06-04 17:57 UTC (permalink / raw)
To: Joanne Koong; +Cc: miklos, linux-fsdevel, kernel-team
On 4/1/25 19:06, Bernd Schubert wrote:
>
>
> On 4/1/25 02:22, Joanne Koong wrote:
>> Thought about this some more... is this even worth doing? AFAICT,
>> there's no guarantee that CPUs are distributed evenly across numa
>> nodes. For example, one numa node may have CPUs 0 to 5 on it, while
>> another numa node might have only CPUs 6 and 7. If there are two
>> queues, each associated with a numa node, then requests will be
>> allocated disproportionately: most of the workload will be queued on
>> the numa node with CPUs 0 to 5. Moreover, I don't think there's a good
>> way to enforce numa affinity when the number of queues is less than
>> the number of numa nodes. For example, if there are 3 numa nodes with
>> 3 CPUs each and only 2 queues, the logic for which cpu gets sent to
>> which queue gets messy.
>>
>> imo, this is an optimization that could be added in the future if the
>> need for this comes up. WDYT?
>
>
> I will get to this sometime this week. My plan is to use queue
> lengths for distribution. We should do that for background requests
> anyway. I.e., first try the local core; if its queue length is too
> large or it has no queue, try queues within the same numa domain; and
> if all of those are busy, check whether foreign queues are more
> suitable.
Sorry for the delay, and just a heads up: I had to find the time to
get to this.

https://github.com/bsbernd/linux/commits/reduced-nr-ring-queues/

It's totally untested, and I don't like the loops for queue selection.
I will try to find a faster way (currently thinking about queue
bitmaps).
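A minimal sketch of the bitmap idea (the fuse_ring_node struct and
avail_qids field are invented for illustration; find_first_bit() is
the real helper):

/* Sketch: per numa node, a bitmap of queues that still have capacity,
 * so selection becomes one find_first_bit() instead of looping over
 * all queues.
 */
struct fuse_ring_node {
	unsigned long *avail_qids;	/* bit set => queue has capacity */
};

static int pick_qid_from_bitmap(struct fuse_ring *ring,
				struct fuse_ring_node *rn)
{
	unsigned long qid = find_first_bit(rn->avail_qids, ring->nr_queues);

	return qid < ring->nr_queues ? (int)qid : -1;
}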
Thanks,
Bernd
Thread overview: 8+ messages
2025-03-14 20:44 [PATCH v1] fuse: support configurable number of uring queues Joanne Koong
2025-03-14 23:11 ` Bernd Schubert
2025-03-18 0:55 ` Joanne Koong
2025-03-18 10:33 ` Bernd Schubert
2025-03-18 23:16 ` Joanne Koong
2025-04-01 0:22 ` Joanne Koong
2025-04-01 17:06 ` Bernd Schubert
2025-06-04 17:57 ` Bernd Schubert