* [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
@ 2025-11-11  2:06 Wangyang Guo
  2025-11-11  3:25 ` Ming Lei
  0 siblings, 1 reply; 11+ messages in thread
From: Wangyang Guo @ 2025-11-11  2:06 UTC (permalink / raw)
  To: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block
  Cc: Wangyang Guo, Tianyou Li, Tim Chen, Dan Liang

As CPU core counts increase, the number of NVMe IRQs may be smaller than
the total number of CPUs. This forces multiple CPUs to share the same
IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
performance penalty can be observed on some platforms.

This patch improves IRQ affinity by grouping CPUs by cluster within each
NUMA domain, ensuring better locality between CPUs and their assigned
NVMe IRQs.

Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Tested-by: Dan Liang <dan.liang@intel.com>
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
 lib/group_cpus.c | 269 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 204 insertions(+), 65 deletions(-)

diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 6d08ac05f371..56ca6193736d 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -114,48 +114,15 @@ static int ncpus_cmp_func(const void *l, const void *r)
 	return ln->ncpus - rn->ncpus;
 }
 
-/*
- * Allocate group number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated number is <= active CPU number of this node
- *
- * The actual allocated total groups may be less than @numgrps when
- * active total CPU number is less than @numgrps.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_groups(unsigned int numgrps,
-			       cpumask_var_t *node_to_cpumask,
-			       const struct cpumask *cpu_mask,
-			       const nodemask_t nodemsk,
-			       struct cpumask *nmsk,
-			       struct node_groups *node_groups)
+static void alloc_groups_to_nodes(unsigned int numgrps,
+				  unsigned int numcpus,
+				  struct node_groups *node_groups,
+				  unsigned int num_nodes)
 {
-	unsigned n, remaining_ncpus = 0;
-
-	for (n = 0; n < nr_node_ids; n++) {
-		node_groups[n].id = n;
-		node_groups[n].ncpus = UINT_MAX;
-	}
-
-	for_each_node_mask(n, nodemsk) {
-		unsigned ncpus;
-
-		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
-		ncpus = cpumask_weight(nmsk);
+	unsigned int n, remaining_ncpus = numcpus;
+	unsigned int  ngroups, ncpus;
 
-		if (!ncpus)
-			continue;
-		remaining_ncpus += ncpus;
-		node_groups[n].ncpus = ncpus;
-	}
-
-	numgrps = min_t(unsigned, remaining_ncpus, numgrps);
-
-	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
+	sort(node_groups, num_nodes, sizeof(node_groups[0]),
 	     ncpus_cmp_func, NULL);
 
 	/*
@@ -226,9 +193,8 @@ static void alloc_nodes_groups(unsigned int numgrps,
 	 * finally for each node X: grps(X) <= ncpu(X).
 	 *
 	 */
-	for (n = 0; n < nr_node_ids; n++) {
-		unsigned ngroups, ncpus;
 
+	for (n = 0; n < num_nodes; n++) {
 		if (node_groups[n].ncpus == UINT_MAX)
 			continue;
 
@@ -246,12 +212,199 @@ static void alloc_nodes_groups(unsigned int numgrps,
 	}
 }
 
+/*
+ * Allocate group number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= active CPU number of this node
+ *
+ * The actual allocated total groups may be less than @numgrps when
+ * active total CPU number is less than @numgrps.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_groups(unsigned int numgrps,
+			       cpumask_var_t *node_to_cpumask,
+			       const struct cpumask *cpu_mask,
+			       const nodemask_t nodemsk,
+			       struct cpumask *nmsk,
+			       struct node_groups *node_groups)
+{
+	unsigned int n, numcpus = 0;
+
+	for (n = 0; n < nr_node_ids; n++) {
+		node_groups[n].id = n;
+		node_groups[n].ncpus = UINT_MAX;
+	}
+
+	for_each_node_mask(n, nodemsk) {
+		unsigned int ncpus;
+
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		ncpus = cpumask_weight(nmsk);
+
+		if (!ncpus)
+			continue;
+		numcpus += ncpus;
+		node_groups[n].ncpus = ncpus;
+	}
+
+	numgrps = min_t(unsigned int, numcpus, numgrps);
+	alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
+}
+
+static void assign_cpus_to_groups(unsigned int ncpus,
+				  struct cpumask *nmsk,
+				  struct node_groups *nv,
+				  struct cpumask *masks,
+				  unsigned int *curgrp,
+				  unsigned int last_grp)
+{
+	unsigned int v, cpus_per_grp, extra_grps;
+	/* Account for rounding errors */
+	extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
+
+	/* Spread allocated groups on CPUs of the current node */
+	for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
+		cpus_per_grp = ncpus / nv->ngroups;
+
+		/* Account for extra groups to compensate rounding errors */
+		if (extra_grps) {
+			cpus_per_grp++;
+			--extra_grps;
+		}
+
+		/*
+		 * wrapping has to be considered given 'startgrp'
+		 * may start anywhere
+		 */
+		if (*curgrp >= last_grp)
+			*curgrp = 0;
+		grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp);
+	}
+}
+
+static int alloc_cluster_groups(unsigned int ncpus,
+				unsigned int ngroups,
+				struct cpumask *node_cpumask,
+				cpumask_var_t msk,
+				const struct cpumask ***clusters_ptr,
+				struct node_groups **cluster_groups_ptr)
+{
+	unsigned int ncluster = 0;
+	unsigned int cpu, nc, n;
+	const struct cpumask *cluster_mask;
+	const struct cpumask **clusters;
+	struct node_groups *cluster_groups;
+
+	cpumask_copy(msk, node_cpumask);
+
+	/* Probe how many clusters in this node. */
+	while (1) {
+		cpu = cpumask_first(msk);
+		if (cpu >= nr_cpu_ids)
+			break;
+
+		cluster_mask = topology_cluster_cpumask(cpu);
+		/* Clean out CPUs on the same cluster. */
+		cpumask_andnot(msk, msk, cluster_mask);
+		ncluster++;
+	}
+
+	/* If ngroups < ncluster, cross cluster is inevitable, skip. */
+	if (ncluster == 0 || ncluster > ngroups)
+		goto no_cluster;
+
+	/* Allocate memory based on cluster number. */
+	clusters = kcalloc(ncluster, sizeof(struct cpumask *), GFP_KERNEL);
+	if (!clusters)
+		goto no_cluster;
+	cluster_groups = kcalloc(ncluster, sizeof(struct node_groups), GFP_KERNEL);
+	if (!cluster_groups)
+		goto fail_cluster_groups;
+
+	/* Filling cluster info for later process. */
+	cpumask_copy(msk, node_cpumask);
+	for (n = 0; n < ncluster; n++) {
+		cpu = cpumask_first(msk);
+		cluster_mask = topology_cluster_cpumask(cpu);
+		nc = cpumask_weight_and(cluster_mask, node_cpumask);
+		clusters[n] = cluster_mask;
+		cluster_groups[n].id = n;
+		cluster_groups[n].ncpus = nc;
+		cpumask_andnot(msk, msk, cluster_mask);
+	}
+
+	alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster);
+
+	*clusters_ptr = clusters;
+	*cluster_groups_ptr = cluster_groups;
+	return ncluster;
+
+ fail_cluster_groups:
+	kfree(clusters);
+ no_cluster:
+	return 0;
+}
+
+/*
+ * Try group CPUs evenly for cluster locality within a NUMA node.
+ *
+ * Return: true if success, false otherwise.
+ */
+static bool __try_group_cluster_cpus(unsigned int ncpus,
+				     unsigned int ngroups,
+				     struct cpumask *node_cpumask,
+				     struct cpumask *masks,
+				     unsigned int *curgrp,
+				     unsigned int last_grp)
+{
+	struct node_groups *cluster_groups;
+	const struct cpumask **clusters;
+	unsigned int ncluster;
+	bool ret = false;
+	cpumask_var_t nmsk;
+	unsigned int i, nc;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		goto fail_nmsk_alloc;
+
+	ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
+					&clusters, &cluster_groups);
+
+	if (ncluster == 0)
+		goto fail_no_clusters;
+
+	for (i = 0; i < ncluster; i++) {
+		struct node_groups *nv = &cluster_groups[i];
+
+		/* Get the cpus on this cluster. */
+		cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
+		nc = cpumask_weight(nmsk);
+		if (!nc)
+			continue;
+		WARN_ON_ONCE(nv->ngroups > nc);
+
+		assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
+	}
+
+	ret = true;
+	kfree(cluster_groups);
+	kfree(clusters);
+ fail_no_clusters:
+	free_cpumask_var(nmsk);
+ fail_nmsk_alloc:
+	return ret;
+}
+
 static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 			       cpumask_var_t *node_to_cpumask,
 			       const struct cpumask *cpu_mask,
 			       struct cpumask *nmsk, struct cpumask *masks)
 {
-	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+	unsigned int i, n, nodes, done = 0;
 	unsigned int last_grp = numgrps;
 	unsigned int curgrp = startgrp;
 	nodemask_t nodemsk = NODE_MASK_NONE;
@@ -287,7 +440,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
 			   nodemsk, nmsk, node_groups);
 	for (i = 0; i < nr_node_ids; i++) {
-		unsigned int ncpus, v;
+		unsigned int ncpus;
 		struct node_groups *nv = &node_groups[i];
 
 		if (nv->ngroups == UINT_MAX)
@@ -301,28 +454,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 
 		WARN_ON_ONCE(nv->ngroups > ncpus);
 
-		/* Account for rounding errors */
-		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
-
-		/* Spread allocated groups on CPUs of the current node */
-		for (v = 0; v < nv->ngroups; v++, curgrp++) {
-			cpus_per_grp = ncpus / nv->ngroups;
-
-			/* Account for extra groups to compensate rounding errors */
-			if (extra_grps) {
-				cpus_per_grp++;
-				--extra_grps;
-			}
-
-			/*
-			 * wrapping has to be considered given 'startgrp'
-			 * may start anywhere
-			 */
-			if (curgrp >= last_grp)
-				curgrp = 0;
-			grp_spread_init_one(&masks[curgrp], nmsk,
-						cpus_per_grp);
+		if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
+					     masks, &curgrp, last_grp)) {
+			done += nv->ngroups;
+			continue;
 		}
+
+		assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
+				      last_grp);
 		done += nv->ngroups;
 	}
 	kfree(node_groups);
-- 
2.47.3



* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-11  2:06 [PATCH RESEND] lib/group_cpus: make group CPU cluster aware Wangyang Guo
@ 2025-11-11  3:25 ` Ming Lei
  2025-11-11  5:31   ` Guo, Wangyang
  0 siblings, 1 reply; 11+ messages in thread
From: Ming Lei @ 2025-11-11  3:25 UTC (permalink / raw)
  To: Wangyang Guo
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On Tue, Nov 11, 2025 at 10:06:08AM +0800, Wangyang Guo wrote:
> As CPU core counts increase, the number of NVMe IRQs may be smaller than
> the total number of CPUs. This forces multiple CPUs to share the same
> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
> performance penalty can be observed on some platforms.

Can you add details on why/how the CPU cluster isn't aligned with the IRQ
affinity? And how is the performance penalty caused?

Is it caused by remote IO completion in blk_mq_complete_need_ipi()?

	/* same CPU or cache domain and capacity?  Complete locally */
	if (cpu == rq->mq_ctx->cpu ||
	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
	     cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
	     cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
	        return false;

If yes, which case are you addressing: the cache domain or the capacity?

AMD's CCX shares the L3 cache inside a NUMA node, which has a similar issue;
I guess this patchset may cover it?

> This patch improves IRQ affinity by grouping CPUs by cluster within each
> NUMA domain, ensuring better locality between CPUs and their assigned
> NVMe IRQs.

I will look into this patch, but I feel an easier way is to build a
sub-node (cluster) cpumask array and just spread over the sub-nodes (clusters).


Thanks, 
Ming



* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-11  3:25 ` Ming Lei
@ 2025-11-11  5:31   ` Guo, Wangyang
  2025-11-11 12:08     ` Ming Lei
  0 siblings, 1 reply; 11+ messages in thread
From: Guo, Wangyang @ 2025-11-11  5:31 UTC (permalink / raw)
  To: Ming Lei
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On 11/11/2025 11:25 AM, Ming Lei wrote:
> On Tue, Nov 11, 2025 at 10:06:08AM +0800, Wangyang Guo wrote:
>> As CPU core counts increase, the number of NVMe IRQs may be smaller than
>> the total number of CPUs. This forces multiple CPUs to share the same
>> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
>> performance penalty can be observed on some platforms.
> 
> Can you add details why/how CPU cluster isn't aligned with IRQ
> affinity? And how performance penalty is caused?

The Intel Xeon E platform packs 4 CPU cores into 1 module (cluster) sharing
the L2 cache. Say there are 40 CPUs in 1 NUMA domain and 11 IRQs to
dispatch. The existing algorithm maps the first 7 IRQs with 4 CPUs each and
the remaining 4 IRQs with 3 CPUs each. The last 4 IRQs may have a
cross-cluster issue. For example, if the 9th IRQ is pinned to CPU32, then
CPU31 will have cross-L2 memory access.

CPU |28 29 30 31|32 33 34 35|36 ...
      -------- -------- --------
IRQ      8        9       10

With this patch applied, the first 2 IRQs are each mapped to 2 CPUs and the
remaining 9 IRQs are each mapped to 4 CPUs, which avoids the cross-cluster
memory access.

CPU |00 01 02 03|04 05 06 07|08 09 10 11| ...
      ----- ----- ----------- -----------
IRQ  1      2        3           4
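
For illustration, a minimal userspace sketch (not part of the patch) of the
same remainder arithmetic as assign_cpus_to_groups(), assuming one 40-CPU
node, 4-CPU clusters and 11 groups:

#include <stdio.h>

/*
 * Same remainder logic as assign_cpus_to_groups(): the first
 * (ncpus % ngroups) groups absorb one extra CPU each.
 */
static void split(unsigned int ncpus, unsigned int ngroups, const char *label)
{
	unsigned int v;

	printf("%-20s:", label);
	for (v = 0; v < ngroups; v++)
		printf(" %u", ncpus / ngroups + (v < ncpus % ngroups ? 1 : 0));
	printf("\n");
}

int main(void)
{
	/* Existing behaviour: 11 groups over the whole 40-CPU node. */
	split(40, 11, "node-only spread");

	/*
	 * Cluster-aware behaviour: 11 groups over 10 clusters of 4 CPUs, so
	 * one cluster takes 2 groups of 2 CPUs and the other 9 clusters take
	 * 1 group of 4 CPUs each; no group crosses a cluster.
	 */
	split(4, 2, "2-group cluster");
	split(4, 1, "1-group cluster x9");
	return 0;
}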


BR
Wangyang


* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-11  5:31   ` Guo, Wangyang
@ 2025-11-11 12:08     ` Ming Lei
  2025-11-12  3:02       ` Guo, Wangyang
  0 siblings, 1 reply; 11+ messages in thread
From: Ming Lei @ 2025-11-11 12:08 UTC (permalink / raw)
  To: Guo, Wangyang
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
> On 11/11/2025 11:25 AM, Ming Lei wrote:
> > On Tue, Nov 11, 2025 at 10:06:08AM +0800, Wangyang Guo wrote:
> > > As CPU core counts increase, the number of NVMe IRQs may be smaller than
> > > the total number of CPUs. This forces multiple CPUs to share the same
> > > IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
> > > performance penalty can be observed on some platforms.
> > 
> > Can you add details why/how CPU cluster isn't aligned with IRQ
> > affinity? And how performance penalty is caused?
> 
> Intel Xeon E platform packs 4 CPU cores as 1 module (cluster) and share the
> L2 cache. Let's say, if there are 40 CPUs in 1 NUMA domain and 11 IRQs to
> dispatch. The existing algorithm will map first 7 IRQs each with 4 CPUs and
> remained 4 IRQs each with 3 CPUs each. The last 4 IRQs may have cross
> cluster issue. For example, the 9th IRQ which pinned to CPU32, then for
> CPU31, it will have cross L2 memory access.


The CPUs sharing an L2 are usually few in number, and it is common to see one
queue mapping include CPUs from different L2s.

So how much does crossing L2 hurt IO performance?

They should still share the same L3 cache, and cpus_share_cache() should be
true when the IO completes on a CPU that belongs to a different L2 than the
submission CPU, so remote completion via IPI won't be triggered.

From my observation, remote completion hurts NVMe IO performance very much,
for example with AMD's crossing-L3 mapping.


Thanks,
Ming



* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-11 12:08     ` Ming Lei
@ 2025-11-12  3:02       ` Guo, Wangyang
  2025-11-13  1:38         ` Ming Lei
  0 siblings, 1 reply; 11+ messages in thread
From: Guo, Wangyang @ 2025-11-12  3:02 UTC (permalink / raw)
  To: Ming Lei
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On 11/11/2025 8:08 PM, Ming Lei wrote:
> On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
>> On 11/11/2025 11:25 AM, Ming Lei wrote:
>>> On Tue, Nov 11, 2025 at 10:06:08AM +0800, Wangyang Guo wrote:
>>>> As CPU core counts increase, the number of NVMe IRQs may be smaller than
>>>> the total number of CPUs. This forces multiple CPUs to share the same
>>>> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
>>>> performance penalty can be observed on some platforms.
>>>
>>> Can you add details why/how CPU cluster isn't aligned with IRQ
>>> affinity? And how performance penalty is caused?
>>
>> Intel Xeon E platform packs 4 CPU cores as 1 module (cluster) and share the
>> L2 cache. Let's say, if there are 40 CPUs in 1 NUMA domain and 11 IRQs to
>> dispatch. The existing algorithm will map first 7 IRQs each with 4 CPUs and
>> remained 4 IRQs each with 3 CPUs each. The last 4 IRQs may have cross
>> cluster issue. For example, the 9th IRQ which pinned to CPU32, then for
>> CPU31, it will have cross L2 memory access.
> 
> 
> CPUs sharing L2 usually have small number, and it is common to see one queue
> mapping includes CPUs from different L2.
> 
> So how much does crossing L2 hurt IO perf?
We see a 15%+ performance difference with FIO libaio/randread/bs=8k.

> They still should share same L3 cache, and cpus_share_cache() should be
> true when the IO completes on the CPU which belong to different L2 with the
> submission CPU, and remote completion via IPI won't be triggered.
Yes, remote IPI not triggered.


BR
Wangyang


* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-12  3:02       ` Guo, Wangyang
@ 2025-11-13  1:38         ` Ming Lei
  2025-11-13  3:32           ` Guo, Wangyang
  2025-11-18  6:29           ` Guo, Wangyang
  0 siblings, 2 replies; 11+ messages in thread
From: Ming Lei @ 2025-11-13  1:38 UTC (permalink / raw)
  To: Guo, Wangyang
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On Wed, Nov 12, 2025 at 11:02:47AM +0800, Guo, Wangyang wrote:
> On 11/11/2025 8:08 PM, Ming Lei wrote:
> > On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
> > > On 11/11/2025 11:25 AM, Ming Lei wrote:
> > > > On Tue, Nov 11, 2025 at 10:06:08AM +0800, Wangyang Guo wrote:
> > > > > As CPU core counts increase, the number of NVMe IRQs may be smaller than
> > > > > the total number of CPUs. This forces multiple CPUs to share the same
> > > > > IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
> > > > > performance penalty can be observed on some platforms.
> > > > 
> > > > Can you add details why/how CPU cluster isn't aligned with IRQ
> > > > affinity? And how performance penalty is caused?
> > > 
> > > Intel Xeon E platform packs 4 CPU cores as 1 module (cluster) and share the
> > > L2 cache. Let's say, if there are 40 CPUs in 1 NUMA domain and 11 IRQs to
> > > dispatch. The existing algorithm will map first 7 IRQs each with 4 CPUs and
> > > remained 4 IRQs each with 3 CPUs each. The last 4 IRQs may have cross
> > > cluster issue. For example, the 9th IRQ which pinned to CPU32, then for
> > > CPU31, it will have cross L2 memory access.
> > 
> > 
> > CPUs sharing L2 usually have small number, and it is common to see one queue
> > mapping includes CPUs from different L2.
> > 
> > So how much does crossing L2 hurt IO perf?
> We see 15%+ performance difference in FIO libaio/randread/bs=8k.

As I mentioned, it is common to see CPUs crossing L2 in the same group, so
why does it make a difference here? You mentioned that just some platforms
are affected.

> > They still should share same L3 cache, and cpus_share_cache() should be
> > true when the IO completes on the CPU which belong to different L2 with the
> > submission CPU, and remote completion via IPI won't be triggered.
> Yes, remote IPI not triggered.

OK, in my test on AMD Zen 4, NVMe performance can drop to 1/2 - 1/3 if the
remote IPI is triggered when crossing L3, which is understandable.

I will check whether the topology cluster can cover L3; if yes, the patch can
still be simplified a lot by introducing a sub-node spread, changing
build_node_to_cpumask() and adding nr_sub_nodes.


Thanks,
Ming



* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-13  1:38         ` Ming Lei
@ 2025-11-13  3:32           ` Guo, Wangyang
  2025-11-18  6:29           ` Guo, Wangyang
  1 sibling, 0 replies; 11+ messages in thread
From: Guo, Wangyang @ 2025-11-13  3:32 UTC (permalink / raw)
  To: Ming Lei
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On 11/13/2025 9:38 AM, Ming Lei wrote:
> On Wed, Nov 12, 2025 at 11:02:47AM +0800, Guo, Wangyang wrote:
>> On 11/11/2025 8:08 PM, Ming Lei wrote:
>>> On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
>>>> On 11/11/2025 11:25 AM, Ming Lei wrote:
>>>>> On Tue, Nov 11, 2025 at 10:06:08AM +0800, Wangyang Guo wrote:
>>>>>> As CPU core counts increase, the number of NVMe IRQs may be smaller than
>>>>>> the total number of CPUs. This forces multiple CPUs to share the same
>>>>>> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
>>>>>> performance penalty can be observed on some platforms.
>>>>>
>>>>> Can you add details why/how CPU cluster isn't aligned with IRQ
>>>>> affinity? And how performance penalty is caused?
>>>>
>>>> Intel Xeon E platform packs 4 CPU cores as 1 module (cluster) and share the
>>>> L2 cache. Let's say, if there are 40 CPUs in 1 NUMA domain and 11 IRQs to
>>>> dispatch. The existing algorithm will map first 7 IRQs each with 4 CPUs and
>>>> remained 4 IRQs each with 3 CPUs each. The last 4 IRQs may have cross
>>>> cluster issue. For example, the 9th IRQ which pinned to CPU32, then for
>>>> CPU31, it will have cross L2 memory access.
>>>
>>>
>>> CPUs sharing L2 usually have small number, and it is common to see one queue
>>> mapping includes CPUs from different L2.
>>>
>>> So how much does crossing L2 hurt IO perf?
>> We see 15%+ performance difference in FIO libaio/randread/bs=8k.
> 
> As I mentioned, it is common to see CPUs crossing L2 in same group, but why
> does it make a difference here? You mentioned just some platforms are
> affected.

We observed the performance difference on the Intel Xeon E platform, which
has 4 physical CPU cores per module (cluster) sharing the same L2 cache.
For other platforms like Intel P-core or AMD, I think it's hard to show a
performance benefit from L2 locality, because:
1. The L2 cache is only shared between 2 logical cores when HT is enabled.
2. If the IRQ is pinned to the corresponding HT sibling, L2 cache locality
is good, but other aspects like retiring may be affected since the siblings
share the same physical CPU resources.

BR
Wangyang


* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-13  1:38         ` Ming Lei
  2025-11-13  3:32           ` Guo, Wangyang
@ 2025-11-18  6:29           ` Guo, Wangyang
  2025-11-19  1:52             ` Ming Lei
  1 sibling, 1 reply; 11+ messages in thread
From: Guo, Wangyang @ 2025-11-18  6:29 UTC (permalink / raw)
  To: Ming Lei
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On 11/13/2025 9:38 AM, Ming Lei wrote:
> On Wed, Nov 12, 2025 at 11:02:47AM +0800, Guo, Wangyang wrote:
>> On 11/11/2025 8:08 PM, Ming Lei wrote:
>>> On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
>>> They still should share same L3 cache, and cpus_share_cache() should be
>>> true when the IO completes on the CPU which belong to different L2 with the
>>> submission CPU, and remote completion via IPI won't be triggered.
>> Yes, remote IPI not triggered.
> 
> OK, in my test on AMD zen4, NVMe performance can be dropped to 1/2 - 1/3 if
> remote IPI is triggered in case of crossing L3, which is understandable.
> 
> I will check if topo cluster can cover L3, if yes, the patch still can be
> simplified a lot by introducing sub-node spread by changing build_node_to_cpumask()
> and adding nr_sub_nodes.

Do you mean using clusters as "NUMA" nodes to spread CPUs, instead of
two-level NUMA-cluster spreading?

BR
Wangyang


* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-18  6:29           ` Guo, Wangyang
@ 2025-11-19  1:52             ` Ming Lei
  2025-11-24  7:58               ` Guo, Wangyang
  0 siblings, 1 reply; 11+ messages in thread
From: Ming Lei @ 2025-11-19  1:52 UTC (permalink / raw)
  To: Guo, Wangyang
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On Tue, Nov 18, 2025 at 02:29:20PM +0800, Guo, Wangyang wrote:
> On 11/13/2025 9:38 AM, Ming Lei wrote:
> > On Wed, Nov 12, 2025 at 11:02:47AM +0800, Guo, Wangyang wrote:
> > > On 11/11/2025 8:08 PM, Ming Lei wrote:
> > > > On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
> > > > They still should share same L3 cache, and cpus_share_cache() should be
> > > > true when the IO completes on the CPU which belong to different L2 with the
> > > > submission CPU, and remote completion via IPI won't be triggered.
> > > Yes, remote IPI not triggered.
> > 
> > OK, in my test on AMD zen4, NVMe performance can be dropped to 1/2 - 1/3 if
> > remote IPI is triggered in case of crossing L3, which is understandable.
> > 
> > I will check if topo cluster can cover L3, if yes, the patch still can be
> > simplified a lot by introducing sub-node spread by changing build_node_to_cpumask()
> > and adding nr_sub_nodes.
> 
> Do you mean using cluster as "NUMA" nodes to spread CPU, instead of two
> level NUMA-cluster spreading?

Yes, I think the change may be minimized by introducing a sub-numa-node to
cover it; what do you think of this approach?

However, it is bad to use the cluster as the sub-numa-node by default,
because a cluster is aligned with the CPUs sharing an L2 cache, so there
could be too many clusters: on many systems one cluster includes just two
CPUs, and then the calculated mapping inevitably crosses clusters because
nr_queues is less than nr_clusters.

I'd suggest mapping the CPUs sharing an L3 cache into one sub-numa-node.

For your case, either add one kernel parameter, or add a
group_cpus_cluster() API for the unusual case, sharing a single code path.
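
A rough sketch of the kind of probe this suggests, keyed on cpus_share_cache()
so that the CPUs sharing an LLC land in one sub-numa-node (the helper name is
made up and the caller is assumed to provide enough zeroed masks):

/*
 * Sketch only, not the posted patch: partition one node's CPUs into
 * sub-nodes whose members share the last-level cache.
 */
static unsigned int build_llc_subnodes(const struct cpumask *node_cpumask,
				       cpumask_var_t *subnode_masks)
{
	unsigned int nr_sub_nodes = 0;
	int cpu;

	for_each_cpu(cpu, node_cpumask) {
		unsigned int i;

		/* Join the sub-node whose first CPU shares the LLC. */
		for (i = 0; i < nr_sub_nodes; i++)
			if (cpus_share_cache(cpu, cpumask_first(subnode_masks[i])))
				break;
		if (i == nr_sub_nodes)
			nr_sub_nodes++;
		cpumask_set_cpu(cpu, subnode_masks[i]);
	}

	return nr_sub_nodes;
}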


Thanks,
Ming



* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-19  1:52             ` Ming Lei
@ 2025-11-24  7:58               ` Guo, Wangyang
  2025-12-08  2:47                 ` Guo, Wangyang
  0 siblings, 1 reply; 11+ messages in thread
From: Guo, Wangyang @ 2025-11-24  7:58 UTC (permalink / raw)
  To: Ming Lei
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On 11/19/2025 9:52 AM, Ming Lei wrote:
> On Tue, Nov 18, 2025 at 02:29:20PM +0800, Guo, Wangyang wrote:
>> On 11/13/2025 9:38 AM, Ming Lei wrote:
>>> On Wed, Nov 12, 2025 at 11:02:47AM +0800, Guo, Wangyang wrote:
>>>> On 11/11/2025 8:08 PM, Ming Lei wrote:
>>>>> On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
>>>>> They still should share same L3 cache, and cpus_share_cache() should be
>>>>> true when the IO completes on the CPU which belong to different L2 with the
>>>>> submission CPU, and remote completion via IPI won't be triggered.
>>>> Yes, remote IPI not triggered.
>>>
>>> OK, in my test on AMD zen4, NVMe performance can be dropped to 1/2 - 1/3 if
>>> remote IPI is triggered in case of crossing L3, which is understandable.
>>>
>>> I will check if topo cluster can cover L3, if yes, the patch still can be
>>> simplified a lot by introducing sub-node spread by changing build_node_to_cpumask()
>>> and adding nr_sub_nodes.
>>
>> Do you mean using cluster as "NUMA" nodes to spread CPU, instead of two
>> level NUMA-cluster spreading?
> 
> Yes, I think the change may be minimized by introducing sub-numa-node for
> covering it, what do you think of this approach?
> 
> However, it is bad to use cluster as sub-numa-node at default, because cluster
> is aligned with CPUs sharing L2 cache, so there could be too many clusters
> for many systems in which one cluster just includes two CPUs, then the finally
> calculated mapping crosses clusters inevitably because nr_queues is
> less than nr_clusters.
> 
> I'd suggest to map CPUs sharing L3 cache into one sub-numa-node.
> 
> For your case, either adding one kernel parameter, or adding group_cpus_cluster()
> API for the unusual case by sharing single code path.

Yes, it will make the change simpler, but whether the cluster or the L3
cache is used as the sub-numa-node, as CPU core counts increase there is a
potential risk that nr_queues < nr_sub_numa_node, which would leave the CPU
spreading with no affinity at all.

I think we need to keep two-level NUMA/sub-NUMA spreading to avoid such a
regression.

For most platforms, there is no need for 2nd-level spreading since L2 is not
shared between physical cores and the L3 mapping is the same as NUMA. I
think we can add a platform-specific configuration: by default, only NUMA
spreading is used; if platforms like Intel Xeon E or some AMD platforms need
second-level spreading, we can configure them to use the L3 or the cluster
as the sub-numa-node for the 2nd-level spreading.
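
For example, the fallback could be as simple as the sketch below, where
nr_sub_nodes_in() and spread_over_sub_nodes() are placeholder names and
assign_cpus_to_groups() is the helper from the posted patch:

/*
 * Spread over sub-nodes only when every sub-node can get at least one
 * group; otherwise keep today's per-node spreading.
 */
static void spread_node(unsigned int ncpus, struct cpumask *nmsk,
			struct node_groups *nv, struct cpumask *masks,
			unsigned int *curgrp, unsigned int last_grp)
{
	unsigned int nr_sub = nr_sub_nodes_in(nmsk);

	if (nr_sub && nr_sub <= nv->ngroups)
		spread_over_sub_nodes(ncpus, nmsk, nv, masks, curgrp, last_grp);
	else
		assign_cpus_to_groups(ncpus, nmsk, nv, masks, curgrp, last_grp);
}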

How does that sound to you?

BR
Wangyang


* Re: [PATCH RESEND] lib/group_cpus: make group CPU cluster aware
  2025-11-24  7:58               ` Guo, Wangyang
@ 2025-12-08  2:47                 ` Guo, Wangyang
  0 siblings, 0 replies; 11+ messages in thread
From: Guo, Wangyang @ 2025-12-08  2:47 UTC (permalink / raw)
  To: Ming Lei
  Cc: Andrew Morton, Thomas Gleixner, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, linux-kernel, linux-nvme,
	virtualization, linux-block, Tianyou Li, Tim Chen, Dan Liang

On 11/24/2025 3:58 PM, Guo, Wangyang wrote:
> On 11/19/2025 9:52 AM, Ming Lei wrote:
>> On Tue, Nov 18, 2025 at 02:29:20PM +0800, Guo, Wangyang wrote:
>>> On 11/13/2025 9:38 AM, Ming Lei wrote:
>>>> On Wed, Nov 12, 2025 at 11:02:47AM +0800, Guo, Wangyang wrote:
>>>>> On 11/11/2025 8:08 PM, Ming Lei wrote:
>>>>>> On Tue, Nov 11, 2025 at 01:31:04PM +0800, Guo, Wangyang wrote:
>>>>>> They still should share same L3 cache, and cpus_share_cache() 
>>>>>> should be
>>>>>> true when the IO completes on the CPU which belong to different L2 
>>>>>> with the
>>>>>> submission CPU, and remote completion via IPI won't be triggered.
>>>>> Yes, remote IPI not triggered.
>>>>
>>>> OK, in my test on AMD zen4, NVMe performance can be dropped to 1/2 - 
>>>> 1/3 if
>>>> remote IPI is triggered in case of crossing L3, which is 
>>>> understandable.
>>>>
>>>> I will check if topo cluster can cover L3, if yes, the patch still 
>>>> can be
>>>> simplified a lot by introducing sub-node spread by changing 
>>>> build_node_to_cpumask()
>>>> and adding nr_sub_nodes.
>>>
>>> Do you mean using cluster as "NUMA" nodes to spread CPU, instead of two
>>> level NUMA-cluster spreading?
>>
>> Yes, I think the change may be minimized by introducing sub-numa-node for
>> covering it, what do you think of this approach?
>>
>> However, it is bad to use cluster as sub-numa-node at default, because 
>> cluster
>> is aligned with CPUs sharing L2 cache, so there could be too many 
>> clusters
>> for many systems in which one cluster just includes two CPUs, then the 
>> finally
>> calculated mapping crosses clusters inevitably because nr_queues is
>> less than nr_clusters.
>>
>> I'd suggest to map CPUs sharing L3 cache into one sub-numa-node.
>>
>> For your case, either adding one kernel parameter, or adding 
>> group_cpus_cluster()
>> API for the unusual case by sharing single code path.
> 
> Yes, it will make the change simpler, but no matter using cluster or L3 
> cache as sub-numa-node, as CPU core count increase, there is potential 
> risk that nr_queues < nr_sub_numa_node, which will make CPU spreading no 
> affinity at all.
> 
> I think we need to keep two level NUMA/sub-NUMA spreading to avoid such 
> regression.
> 
> For most platforms, there is no need for 2nd level spreading since L2 is 
> not shared between physical cores and L3 mapping is the same as NUMA. I 
> think we can add a platform specified configuration: by default, only 
> NUMA spreading is used; if platform like Intel Xeon E or some AMD 
> platform need second level spreading, we can config it to use L3 or 
> cluster as sub-numa-node for 2nd level spreading.
> 
> How does that sound to you?

Hi Ming,

Hope you're doing well.

Just wanted to kindly follow up on my previous email. Please let me know 
if you have any further feedback/guidance on the proposed changes or if 
there’s anything I can do to help move this forward.

Thanks for your time and guidance!

BR
Wangyang


