public inbox for linux-kernel@vger.kernel.org
* [PATCH] lib/group_cpus: make group CPU cluster aware
@ 2025-10-24  2:30 Wangyang Guo
  2025-12-21 19:10 ` Andrew Morton
From: Wangyang Guo @ 2025-10-24  2:30 UTC
  To: Andrew Morton, Thomas Gleixner, linux-kernel
  Cc: Wangyang Guo, Tianyou Li, Tim Chen, Dan Liang

As CPU core counts increase, the number of NVMe IRQs may be smaller than
the total number of CPUs. This forces multiple CPUs to share the same
IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
performance penalty can be observed on some platforms.

This patch improves IRQ affinity by grouping CPUs by cluster within each
NUMA domain, ensuring better locality between CPUs and their assigned
NVMe IRQs.
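
For example (hypothetical topology): a node with 8 active CPUs in two
clusters {0-3} and {4-7}, split into 3 groups. A purely
enumeration-order split yields {0,1,2} {3,4,5} {6,7}, where the middle
group straddles both clusters. With cluster-aware grouping, groups are
first distributed across clusters (2 to one cluster, 1 to the other),
giving {0,1} {2,3} {4,5,6,7}, so every group stays inside one cluster.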

Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Tested-by: Dan Liang <dan.liang@intel.com>
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
 lib/group_cpus.c | 269 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 204 insertions(+), 65 deletions(-)

diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 6d08ac05f371..56ca6193736d 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -114,48 +114,15 @@ static int ncpus_cmp_func(const void *l, const void *r)
 	return ln->ncpus - rn->ncpus;
 }
 
-/*
- * Allocate group number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated number is <= active CPU number of this node
- *
- * The actual allocated total groups may be less than @numgrps when
- * active total CPU number is less than @numgrps.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_groups(unsigned int numgrps,
-			       cpumask_var_t *node_to_cpumask,
-			       const struct cpumask *cpu_mask,
-			       const nodemask_t nodemsk,
-			       struct cpumask *nmsk,
-			       struct node_groups *node_groups)
+static void alloc_groups_to_nodes(unsigned int numgrps,
+				  unsigned int numcpus,
+				  struct node_groups *node_groups,
+				  unsigned int num_nodes)
 {
-	unsigned n, remaining_ncpus = 0;
-
-	for (n = 0; n < nr_node_ids; n++) {
-		node_groups[n].id = n;
-		node_groups[n].ncpus = UINT_MAX;
-	}
-
-	for_each_node_mask(n, nodemsk) {
-		unsigned ncpus;
-
-		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
-		ncpus = cpumask_weight(nmsk);
+	unsigned int n, remaining_ncpus = numcpus;
+	unsigned int ngroups, ncpus;
 
-		if (!ncpus)
-			continue;
-		remaining_ncpus += ncpus;
-		node_groups[n].ncpus = ncpus;
-	}
-
-	numgrps = min_t(unsigned, remaining_ncpus, numgrps);
-
-	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
+	sort(node_groups, num_nodes, sizeof(node_groups[0]),
 	     ncpus_cmp_func, NULL);
 
 	/*
@@ -226,9 +193,8 @@ static void alloc_nodes_groups(unsigned int numgrps,
 	 * finally for each node X: grps(X) <= ncpu(X).
 	 *
 	 */
-	for (n = 0; n < nr_node_ids; n++) {
-		unsigned ngroups, ncpus;
 
+	for (n = 0; n < num_nodes; n++) {
 		if (node_groups[n].ncpus == UINT_MAX)
 			continue;
 
@@ -246,12 +212,199 @@ static void alloc_nodes_groups(unsigned int numgrps,
 	}
 }
 
+/*
+ * Allocate group number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= active CPU number of this node
+ *
+ * The actual allocated total groups may be less than @numgrps when
+ * active total CPU number is less than @numgrps.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_groups(unsigned int numgrps,
+			       cpumask_var_t *node_to_cpumask,
+			       const struct cpumask *cpu_mask,
+			       const nodemask_t nodemsk,
+			       struct cpumask *nmsk,
+			       struct node_groups *node_groups)
+{
+	unsigned int n, numcpus = 0;
+
+	for (n = 0; n < nr_node_ids; n++) {
+		node_groups[n].id = n;
+		node_groups[n].ncpus = UINT_MAX;
+	}
+
+	for_each_node_mask(n, nodemsk) {
+		unsigned int ncpus;
+
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		ncpus = cpumask_weight(nmsk);
+
+		if (!ncpus)
+			continue;
+		numcpus += ncpus;
+		node_groups[n].ncpus = ncpus;
+	}
+
+	numgrps = min_t(unsigned int, numcpus, numgrps);
+	alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
+}
+
+static void assign_cpus_to_groups(unsigned int ncpus,
+				  struct cpumask *nmsk,
+				  struct node_groups *nv,
+				  struct cpumask *masks,
+				  unsigned int *curgrp,
+				  unsigned int last_grp)
+{
+	unsigned int v, cpus_per_grp, extra_grps;
+	/* Account for rounding errors */
+	extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
+
+	/* Spread allocated groups on CPUs of the current node */
+	for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
+		cpus_per_grp = ncpus / nv->ngroups;
+
+		/* Account for extra groups to compensate rounding errors */
+		if (extra_grps) {
+			cpus_per_grp++;
+			--extra_grps;
+		}
+
+		/*
+		 * wrapping has to be considered given 'startgrp'
+		 * may start anywhere
+		 */
+		if (*curgrp >= last_grp)
+			*curgrp = 0;
+		grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp);
+	}
+}
+
+static int alloc_cluster_groups(unsigned int ncpus,
+				unsigned int ngroups,
+				struct cpumask *node_cpumask,
+				cpumask_var_t msk,
+				const struct cpumask ***clusters_ptr,
+				struct node_groups **cluster_groups_ptr)
+{
+	unsigned int ncluster = 0;
+	unsigned int cpu, nc, n;
+	const struct cpumask *cluster_mask;
+	const struct cpumask **clusters;
+	struct node_groups *cluster_groups;
+
+	cpumask_copy(msk, node_cpumask);
+
+	/* Probe how many clusters in this node. */
+	while (1) {
+		cpu = cpumask_first(msk);
+		if (cpu >= nr_cpu_ids)
+			break;
+
+		cluster_mask = topology_cluster_cpumask(cpu);
+		/* Clean out CPUs on the same cluster. */
+		cpumask_andnot(msk, msk, cluster_mask);
+		ncluster++;
+	}
+
+	/* If ngroups < ncluster, cross-cluster groups are inevitable, skip. */
+	if (ncluster == 0 || ncluster > ngroups)
+		goto no_cluster;
+
+	/* Allocate memory based on cluster number. */
+	clusters = kcalloc(ncluster, sizeof(struct cpumask *), GFP_KERNEL);
+	if (!clusters)
+		goto no_cluster;
+	cluster_groups = kcalloc(ncluster, sizeof(struct node_groups), GFP_KERNEL);
+	if (!cluster_groups)
+		goto fail_cluster_groups;
+
+	/* Fill in cluster info for later processing. */
+	cpumask_copy(msk, node_cpumask);
+	for (n = 0; n < ncluster; n++) {
+		cpu = cpumask_first(msk);
+		cluster_mask = topology_cluster_cpumask(cpu);
+		nc = cpumask_weight_and(cluster_mask, node_cpumask);
+		clusters[n] = cluster_mask;
+		cluster_groups[n].id = n;
+		cluster_groups[n].ncpus = nc;
+		cpumask_andnot(msk, msk, cluster_mask);
+	}
+
+	alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster);
+
+	*clusters_ptr = clusters;
+	*cluster_groups_ptr = cluster_groups;
+	return ncluster;
+
+ fail_cluster_groups:
+	kfree(clusters);
+ no_cluster:
+	return 0;
+}
+
+/*
+ * Try to group CPUs evenly for cluster locality within a NUMA node.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool __try_group_cluster_cpus(unsigned int ncpus,
+				     unsigned int ngroups,
+				     struct cpumask *node_cpumask,
+				     struct cpumask *masks,
+				     unsigned int *curgrp,
+				     unsigned int last_grp)
+{
+	struct node_groups *cluster_groups;
+	const struct cpumask **clusters;
+	unsigned int ncluster;
+	bool ret = false;
+	cpumask_var_t nmsk;
+	unsigned int i, nc;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		goto fail_nmsk_alloc;
+
+	ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
+					&clusters, &cluster_groups);
+
+	if (ncluster == 0)
+		goto fail_no_clusters;
+
+	for (i = 0; i < ncluster; i++) {
+		struct node_groups *nv = &cluster_groups[i];
+
+		/* Get the CPUs on this cluster. */
+		cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
+		nc = cpumask_weight(nmsk);
+		if (!nc)
+			continue;
+		WARN_ON_ONCE(nv->ngroups > nc);
+
+		assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
+	}
+
+	ret = true;
+	kfree(cluster_groups);
+	kfree(clusters);
+ fail_no_clusters:
+	free_cpumask_var(nmsk);
+ fail_nmsk_alloc:
+	return ret;
+}
+
 static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 			       cpumask_var_t *node_to_cpumask,
 			       const struct cpumask *cpu_mask,
 			       struct cpumask *nmsk, struct cpumask *masks)
 {
-	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+	unsigned int i, n, nodes, done = 0;
 	unsigned int last_grp = numgrps;
 	unsigned int curgrp = startgrp;
 	nodemask_t nodemsk = NODE_MASK_NONE;
@@ -287,7 +440,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
 			   nodemsk, nmsk, node_groups);
 	for (i = 0; i < nr_node_ids; i++) {
-		unsigned int ncpus, v;
+		unsigned int ncpus;
 		struct node_groups *nv = &node_groups[i];
 
 		if (nv->ngroups == UINT_MAX)
@@ -301,28 +454,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 
 		WARN_ON_ONCE(nv->ngroups > ncpus);
 
-		/* Account for rounding errors */
-		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
-
-		/* Spread allocated groups on CPUs of the current node */
-		for (v = 0; v < nv->ngroups; v++, curgrp++) {
-			cpus_per_grp = ncpus / nv->ngroups;
-
-			/* Account for extra groups to compensate rounding errors */
-			if (extra_grps) {
-				cpus_per_grp++;
-				--extra_grps;
-			}
-
-			/*
-			 * wrapping has to be considered given 'startgrp'
-			 * may start anywhere
-			 */
-			if (curgrp >= last_grp)
-				curgrp = 0;
-			grp_spread_init_one(&masks[curgrp], nmsk,
-						cpus_per_grp);
+		if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
+					     masks, &curgrp, last_grp)) {
+			done += nv->ngroups;
+			continue;
 		}
+
+		assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
+				      last_grp);
 		done += nv->ngroups;
 	}
 	kfree(node_groups);
-- 
2.47.3



* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2025-10-24  2:30 [PATCH] lib/group_cpus: make group CPU cluster aware Wangyang Guo
@ 2025-12-21 19:10 ` Andrew Morton
  2025-12-22  3:03   ` Guo, Wangyang
From: Andrew Morton @ 2025-12-21 19:10 UTC
  To: Wangyang Guo
  Cc: Thomas Gleixner, linux-kernel, Tianyou Li, Tim Chen, Dan Liang

On Fri, 24 Oct 2025 10:30:38 +0800 Wangyang Guo <wangyang.guo@intel.com> wrote:

> As CPU core counts increase, the number of NVMe IRQs may be smaller than
> the total number of CPUs. This forces multiple CPUs to share the same
> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
> performance penalty can be observed on some platforms.

It would be helpful to quantify "performance penalty".  At least give
readers some approximate understanding of how serious this issue is,
please.

> This patch improves IRQ affinity by grouping CPUs by cluster within each
> NUMA domain, ensuring better locality between CPUs and their assigned
> NVMe IRQs.
>
> Reviewed-by: Tianyou Li <tianyou.li@intel.com>
> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Tested-by: Dan Liang <dan.liang@intel.com>
> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>

Patch hasn't attracted additional review so I'll queue this version for
some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
note-to-self that a changelog addition is desirable.

Thanks.


* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2025-12-21 19:10 ` Andrew Morton
@ 2025-12-22  3:03   ` Guo, Wangyang
  2026-01-09 19:13     ` Radu Rendec
From: Guo, Wangyang @ 2025-12-22  3:03 UTC
  To: Andrew Morton
  Cc: Thomas Gleixner, linux-kernel, Tianyou Li, Tim Chen, Dan Liang

On 12/22/2025 3:10 AM, Andrew Morton wrote:
> On Fri, 24 Oct 2025 10:30:38 +0800 Wangyang Guo <wangyang.guo@intel.com> wrote:
> 
>> As CPU core counts increase, the number of NVMe IRQs may be smaller than
>> the total number of CPUs. This forces multiple CPUs to share the same
>> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
>> performance penalty can be observed on some platforms.
> 
> It would be helpful to quantify "performance penalty".  At least give
> readers some approximate understanding of how serious this issue is,
> please.
> 
Thanks for the reminder, I will update the changelog in the next
version. We see a 15%+ performance difference in FIO
(libaio/randread/bs=8k).
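
For reference, an fio invocation along these lines exercises that
workload (illustrative only: the device path, iodepth, job count and
runtime below are placeholders, not the exact parameters we used):

    fio --name=randread --filename=/dev/nvme0n1 --ioengine=libaio \
        --rw=randread --bs=8k --direct=1 --iodepth=32 --numjobs=16 \
        --runtime=60 --time_based --group_reporting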

>> This patch improves IRQ affinity by grouping CPUs by cluster within each
>> NUMA domain, ensuring better locality between CPUs and their assigned
>> NVMe IRQs.
>>
>> Reviewed-by: Tianyou Li <tianyou.li@intel.com>
>> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Tested-by: Dan Liang <dan.liang@intel.com>
>> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> 
> Patch hasn't attracted additional review so I'll queue this version for
> some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
> note-to-self that a changelog addition is desirable.


Thanks a lot for your time and support! Please let me know if you have 
any further comments or guidance. Any feedback would be appreciated.


BR
Wangyang


* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2025-12-22  3:03   ` Guo, Wangyang
@ 2026-01-09 19:13     ` Radu Rendec
  2026-01-09 22:47       ` Andrew Morton
  2026-01-10  2:24       ` Guo, Wangyang
From: Radu Rendec @ 2026-01-09 19:13 UTC
  To: Guo, Wangyang, Andrew Morton
  Cc: Thomas Gleixner, linux-kernel, Tianyou Li, Tim Chen, Dan Liang

Hi all,

On Mon, 2025-12-22 at 11:03 +0800, Guo, Wangyang wrote:
> On 12/22/2025 3:10 AM, Andrew Morton wrote:
> > On Fri, 24 Oct 2025 10:30:38 +0800 Wangyang Guo <wangyang.guo@intel.com> wrote:
> > 
> > > As CPU core counts increase, the number of NVMe IRQs may be smaller than
> > > the total number of CPUs. This forces multiple CPUs to share the same
> > > IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
> > > performance penalty can be observed on some platforms.
> > 
> > It would be helpful to quantify "performance penalty".  At least give
> > readers some approximate understanding of how serious this issue is,
> > please.
> > 
> Thanks for the reminder, I will update the changelog in the next
> version. We see a 15%+ performance difference in FIO
> (libaio/randread/bs=8k).
> 
> > > This patch improves IRQ affinity by grouping CPUs by cluster within each
> > > NUMA domain, ensuring better locality between CPUs and their assigned
> > > NVMe IRQs.
> > > 
> > > Reviewed-by: Tianyou Li <tianyou.li@intel.com>
> > > Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
> > > Tested-by: Dan Liang <dan.liang@intel.com>
> > > Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> > 
> > Patch hasn't attracted additional review so I'll queue this version for
> > some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
> > note-to-self that a changelog addition is desirable.
> 
> 
> Thanks a lot for your time and support! Please let me know if you have 
> any further comments or guidance. Any feedback would be appreciated.

With this patch applied, I see a weird issue in a qemu x86_64 vm if I
start it with a higher number of max CPUs than active CPUs, for example
`-smp 4,maxcpus=8` on the qemu command line.

What I see is the `while (1)` loop in alloc_cluster_groups() spinning
forever. Removing the `maxcpus=8` from the qemu command line fixes the
issue but so does reverting the patch :)

FWIW, this is my full qemu command line, in case it helps reproduce the
issue.

qemu-system-x86_64 -nographic -m 1G -accel kvm -machine q35,hpet=off \
        -cpu host -smp 4,maxcpus=8 \
        -netdev bridge,br=vbr-nat,id=net0 -device virtio-net-pci,netdev=net0 \
        -drive file=root.img,format=raw,if=virtio \
        -kernel bzImage \
        -append "console=ttyS0 root=/dev/vda1 rw nokaslr" -s

And this is a stack trace extracted from gdb attached to qemu's gdb
server.

#0  check_region_inline (addr=0xffffffff833b7d40 <__per_cpu_offset+32>, 
    size=8, write=false, ret_ip=18446744071582127502)
    at mm/kasan/generic.c:185
#1  __asan_load8 (addr=0xffffffff833b7d40 <__per_cpu_offset+32>)
    at mm/kasan/generic.c:273
#2  0xffffffff8132158e in cpu_l2c_shared_mask (cpu=cpu@entry=4)
    at arch/x86/include/asm/smp.h:149
#3  cpu_clustergroup_mask (cpu=cpu@entry=4)
    at arch/x86/kernel/smpboot.c:672
#4  0xffffffff81e9d6b6 in alloc_cluster_groups (ncpus=ncpus@entry=4, 
    ngroups=ngroups@entry=4, 
    node_cpumask=node_cpumask@entry=0xffff88800d7a28e0, 
    msk=0xffff88800d7a24e0, 
    clusters_ptr=clusters_ptr@entry=0xffffc9000001eef0, 
    cluster_groups_ptr=cluster_groups_ptr@entry=0xffffc9000001eed0)
    at lib/group_cpus.c:310
#5  0xffffffff81e9da22 in __try_group_cluster_cpus (ncpus=ncpus@entry=4, 
    ngroups=4, node_cpumask=node_cpumask@entry=0xffff88800d7a28e0, 
    masks=masks@entry=0xffff88800e9a8000, 
    curgrp=curgrp@entry=0xffffc9000001f000, last_grp=last_grp@entry=4)
    at lib/group_cpus.c:374
#6  0xffffffff81e9decf in __group_cpus_evenly (startgrp=startgrp@entry=0, 
    numgrps=numgrps@entry=4, 
    node_to_cpumask=node_to_cpumask@entry=0xffff88800d7a28a0, 
    cpu_mask=<optimized out>, nmsk=<optimized out>, 
    masks=masks@entry=0xffff88800e9a8000)
    at lib/group_cpus.c:457
#7  0xffffffff81e9e406 in group_cpus_evenly (numgrps=numgrps@entry=4, 
    nummasks=nummasks@entry=0xffffc9000001f248)
    at lib/group_cpus.c:547
#8  0xffffffff8121690c in irq_create_affinity_masks (nvecs=nvecs@entry=5, 
    affd=affd@entry=0xffffc9000001f738)
    at kernel/irq/affinity.c:74
#9  0xffffffff81efc253 in msix_setup_interrupts (dev=<optimized out>, 
    entries=<optimized out>, nvec=5, affd=0xffffc9000001f738)
    at drivers/pci/msi/msi.c:694
#10 msix_capability_init (dev=0xffff88800e014000, entries=<optimized out>, 
    nvec=5, affd=<optimized out>)
    at drivers/pci/msi/msi.c:738
#11 __pci_enable_msix_range (dev=dev@entry=0xffff88800e014000, 
    entries=entries@entry=0x0, minvec=minvec@entry=5, maxvec=maxvec@entry=5, 
    affd=affd@entry=0xffffc9000001f738, flags=flags@entry=12)
    at drivers/pci/msi/msi.c:846
#12 0xffffffff81ef9ef3 in pci_alloc_irq_vectors_affinity (
    dev=0xffff88800e014000, min_vecs=min_vecs@entry=5, 
    max_vecs=max_vecs@entry=5, flags=flags@entry=12, 
    affd=affd@entry=0xffffc9000001f738)
    at drivers/pci/msi/api.c:268
#13 0xffffffff82039d1f in vp_request_msix_vectors (
    vdev=vdev@entry=0xffff88800e86a000, nvectors=5, 
    per_vq_vectors=per_vq_vectors@entry=true, desc=<optimized out>, 
    desc@entry=0xffffc9000001f738)
    at drivers/virtio/virtio_pci_common.c:160
#14 0xffffffff8203a615 in vp_find_vqs_msix (
    vdev=vdev@entry=0xffff88800e86a000, nvqs=nvqs@entry=4, 
    vqs=vqs@entry=0xffff88800d739ac0, 
    vqs_info=vqs_info@entry=0xffff88800d761100, 
    vector_policy=vector_policy@entry=VP_VQ_VECTOR_POLICY_EACH, 
    desc=desc@entry=0xffffc9000001f738)
    at drivers/virtio/virtio_pci_common.c:417
#15 0xffffffff8203ac0a in vp_find_vqs (vdev=vdev@entry=0xffff88800e86a000, 
    nvqs=4, vqs=0xffff88800d739ac0, vqs_info=0xffff88800d761100, 
    desc=0xffffc9000001f738)
    at drivers/virtio/virtio_pci_common.c:525
#16 0xffffffff82036553 in vp_modern_find_vqs (vdev=0xffff88800e86a000, 
    nvqs=<optimized out>, vqs=<optimized out>, vqs_info=<optimized out>, 
    desc=<optimized out>)
    at drivers/virtio/virtio_pci_modern.c:751
#17 0xffffffff82152ea9 in virtio_find_vqs (vdev=0xffff88800e86a000, nvqs=4, 
    vqs=0xffff88800d739ac0, vqs_info=0xffff88800d761100, 
    desc=0xffffc9000001f738)
    at include/linux/virtio_config.h:298
#18 init_vq (vblk=vblk@entry=0xffff88800e7fe800)
    at drivers/block/virtio_blk.c:1017
#19 0xffffffff82153212 in virtblk_probe (vdev=0xffff88800e86a000)
    at drivers/block/virtio_blk.c:1470
#20 0xffffffff82024133 in virtio_dev_probe (_d=0xffff88800e86a098)
    at drivers/virtio/virtio.c:347
#21 0xffffffff82114166 in call_driver_probe (dev=dev@entry=0xffff88800e86a098, 
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>)
    at drivers/base/dd.c:581
#22 0xffffffff82115038 in really_probe (dev=dev@entry=0xffff88800e86a098, 
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>)
    at drivers/base/dd.c:659
#23 0xffffffff821153d9 in __driver_probe_device (
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>, 
    dev=dev@entry=0xffff88800e86a098)
    at drivers/base/dd.c:801
#24 0xffffffff8211590a in driver_probe_device (
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>, 
    dev=dev@entry=0xffff88800e86a098)
    at drivers/base/dd.c:831
#25 0xffffffff82115c46 in __driver_attach (dev=0xffff88800e86a098, 
    data=0xffffffff840a5020 <virtio_blk>)
    at drivers/base/dd.c:1225
#26 0xffffffff82111700 in bus_for_each_dev (bus=<optimized out>, 
    start=start@entry=0x0, data=data@entry=0xffffffff840a5020 <virtio_blk>, 
    fn=fn@entry=0xffffffff82115b30 <__driver_attach>)
    at drivers/base/bus.c:383
#27 0xffffffff82114738 in driver_attach (
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>)
    at drivers/base/dd.c:1243
#28 0xffffffff82113860 in bus_add_driver (
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>)
    at drivers/base/bus.c:715
#29 0xffffffff821171f6 in driver_register (
    drv=drv@entry=0xffffffff840a5020 <virtio_blk>)
    at drivers/base/driver.c:249
#30 0xffffffff8202306b in __register_virtio_driver (
    driver=driver@entry=0xffffffff840a5020 <virtio_blk>, owner=owner@entry=0x0)
    at drivers/virtio/virtio.c:456
#31 0xffffffff84b19134 in virtio_blk_init ()
    at drivers/block/virtio_blk.c:1707
#32 0xffffffff812884eb in do_one_initcall (
    fn=0xffffffff84b190e0 <virtio_blk_init>)
    at init/main.c:1379
#33 0xffffffff84a82ba7 in do_initcall_level (level=6, 
    command_line=0xffff8880090e1880 "console")
    at init/main.c:1441
#34 do_initcalls ()
    at init/main.c:1457
#35 0xffffffff84a82f24 in do_basic_setup ()
    at init/main.c:1476
#36 kernel_init_freeable ()
    at init/main.c:1689
#37 0xffffffff8277ad98 in kernel_init (unused=<optimized out>)
    at init/main.c:1579
#38 0xffffffff812e11fb in ret_from_fork (prev=<optimized out>, 
    regs=0xffffc9000001ff48, fn=0xffffffff8277ad80 <kernel_init>, fn_arg=0x0)
    at arch/x86/kernel/process.c:158
#39 0xffffffff8129390a in ret_from_fork_asm ()
    at arch/x86/entry/entry_64.S:246

-- 
Best regards,
Radu



* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2026-01-09 19:13     ` Radu Rendec
@ 2026-01-09 22:47       ` Andrew Morton
  2026-01-13  2:37         ` Guo, Wangyang
  2026-01-10  2:24       ` Guo, Wangyang
From: Andrew Morton @ 2026-01-09 22:47 UTC
  To: Radu Rendec
  Cc: Guo, Wangyang, Thomas Gleixner, linux-kernel, Tianyou Li,
	Tim Chen, Dan Liang

On Fri, 09 Jan 2026 14:13:32 -0500 Radu Rendec <rrendec@redhat.com> wrote:

> > > > This patch improves IRQ affinity by grouping CPUs by cluster within each
> > > > NUMA domain, ensuring better locality between CPUs and their assigned
> > > > NVMe IRQs.
> > > > 
> > > > Reviewed-by: Tianyou Li <tianyou.li@intel.com>
> > > > Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
> > > > Tested-by: Dan Liang <dan.liang@intel.com>
> > > > Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> > > 
> > > Patch hasn't attracted additional review so I'll queue this version for
> > > some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
> > > note-to-self that a changelog addition is desirable.
> > 
> > 
> > Thanks a lot for your time and support! Please let me know if you have 
> > any further comments or guidance. Any feedback would be appreciated.
> 
> With this patch applied, I see a weird issue in a qemu x86_64 vm if I
> start it with a higher number of max CPUs than active CPUs, for example
> `-smp 4,maxcpus=8` on the qemu command line.
> 
> What I see is the `while (1)` loop in alloc_cluster_groups() spinning
> forever. Removing the `maxcpus=8` from the qemu command line fixes the
> issue but so does reverting the patch :)

Great, thanks, I'll drop the patch.

I have notes here that an updated version is expected anyway.  Perhaps
due to planned changelog updates.  And perhaps hopes for additional
info regarding the runtime effects of the change.



* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2026-01-09 19:13     ` Radu Rendec
  2026-01-09 22:47       ` Andrew Morton
@ 2026-01-10  2:24       ` Guo, Wangyang
  2026-01-13  1:59         ` Guo, Wangyang
From: Guo, Wangyang @ 2026-01-10  2:24 UTC
  To: Radu Rendec, Andrew Morton
  Cc: Thomas Gleixner, linux-kernel, Tianyou Li, Tim Chen, Dan Liang

On 1/10/2026 3:13 AM, Radu Rendec wrote:
> Hi all,
> 
> On Mon, 2025-12-22 at 11:03 +0800, Guo, Wangyang wrote:
>> On 12/22/2025 3:10 AM, Andrew Morton wrote:
>>> On Fri, 24 Oct 2025 10:30:38 +0800 Wangyang Guo <wangyang.guo@intel.com> wrote:
>>>
>>>> As CPU core counts increase, the number of NVMe IRQs may be smaller than
>>>> the total number of CPUs. This forces multiple CPUs to share the same
>>>> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
>>>> performance penalty can be observed on some platforms.
>>>
>>> It would be helpful to quantify "performance penalty".  At least give
>>> readers some approximate understanding of how serious this issue is,
>>> please.
>>>
>> Thanks for your reminder, will update changelog in next version. We see
>> 15%+ performance difference in FIO libaio/randread/bs=8k.
>>
>>>> This patch improves IRQ affinity by grouping CPUs by cluster within each
>>>> NUMA domain, ensuring better locality between CPUs and their assigned
>>>> NVMe IRQs.
>>>>
>>>> Reviewed-by: Tianyou Li <tianyou.li@intel.com>
>>>> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
>>>> Tested-by: Dan Liang <dan.liang@intel.com>
>>>> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
>>>
>>> Patch hasn't attracted additional review so I'll queue this version for
>>> some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
>>> note-to-self that a changelog addition is desirable.
>>
>>
>> Thanks a lot for your time and support! Please let me know if you have
>> any further comments or guidance. Any feedback would be appreciated.
> 
> With this patch applied, I see a weird issue in a qemu x86_64 vm if I
> start it with a higher number of max CPUs than active CPUs, for example
> `-smp 4,maxcpus=8` on the qemu command line.
> 
> What I see is the `while (1)` loop in alloc_cluster_groups() spinning
> forever. Removing the `maxcpus=8` from the qemu command line fixes the
> issue but so does reverting the patch :)

Thanks for the report. I will investigate this problem.

BR
Wangyang


* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2026-01-10  2:24       ` Guo, Wangyang
@ 2026-01-13  1:59         ` Guo, Wangyang
From: Guo, Wangyang @ 2026-01-13  1:59 UTC
  To: Radu Rendec, Andrew Morton
  Cc: Thomas Gleixner, linux-kernel, Tianyou Li, Tim Chen, Dan Liang

On 1/10/2026 10:24 AM, Guo, Wangyang wrote:
> On 1/10/2026 3:13 AM, Radu Rendec wrote:
>> Hi all,
>>
>> On Mon, 2025-12-22 at 11:03 +0800, Guo, Wangyang wrote:
>>> On 12/22/2025 3:10 AM, Andrew Morton wrote:
>>>> On Fri, 24 Oct 2025 10:30:38 +0800 Wangyang Guo 
>>>> <wangyang.guo@intel.com> wrote:
>>>>
>>>>> As CPU core counts increase, the number of NVMe IRQs may be smaller 
>>>>> than
>>>>> the total number of CPUs. This forces multiple CPUs to share the same
>>>>> IRQ. If the IRQ affinity and the CPU’s cluster do not align, a
>>>>> performance penalty can be observed on some platforms.
>>>>
>>>> It would be helpful to quantify "performance penalty".  At least give
>>>> readers some approximate understanding of how serious this issue is,
>>>> please.
>>>>
>>> Thanks for the reminder, I will update the changelog in the next
>>> version. We see a 15%+ performance difference in FIO
>>> (libaio/randread/bs=8k).
>>>
>>>>> This patch improves IRQ affinity by grouping CPUs by cluster within 
>>>>> each
>>>>> NUMA domain, ensuring better locality between CPUs and their assigned
>>>>> NVMe IRQs.
>>>>>
>>>>> Reviewed-by: Tianyou Li <tianyou.li@intel.com>
>>>>> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
>>>>> Tested-by: Dan Liang <dan.liang@intel.com>
>>>>> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
>>>>
>>>> Patch hasn't attracted additional review so I'll queue this version for
>>>> some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
>>>> note-to-self that a changelog addition is desirable.
>>>
>>>
>>> Thanks a lot for your time and support! Please let me know if you have
>>> any further comments or guidance. Any feedback would be appreciated.
>>
>> With this patch applied, I see a weird issue in a qemu x86_64 vm if I
>> start it with a higher number of max CPUs than active CPUs, for example
>> `-smp 4,maxcpus=8` on the qemu command line.
>>
>> What I see is the `while (1)` loop in alloc_cluster_groups() spinning
>> forever. Removing the `maxcpus=8` from the qemu command line fixes the
>> issue but so does reverting the patch :)
> 
> Thanks for the report. I will investigate this problem.
The problem happens in this loop:

	/* Probe how many clusters in this node. */
	while (1) {
		cpu = cpumask_first(msk);
		if (cpu >= nr_cpu_ids)
			break;

		cluster_mask = topology_cluster_cpumask(cpu);
		/* Clean out CPUs on the same cluster. */
		cpumask_andnot(msk, msk, cluster_mask);
		ncluster++;
	}

In this case, topology_cluster_cpumask(cpu) returns an empty
cluster_mask, which makes the subsequent cpumask_andnot() a no-op, so
the loop never terminates.

It can be fixed by checking the returned cluster_mask:

                 cluster_mask = topology_cluster_cpumask(cpu);
+               if (!cpumask_weight(cluster_mask))
+                       goto no_cluster;
                 /* Clean out CPUs on the same cluster. */
                 cpumask_andnot(msk, msk, cluster_mask);
                 ncluster++;
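
With the check folded in, the probe loop would read (sketch; the
no_cluster label already exists in alloc_cluster_groups()):

	/* Probe how many clusters in this node. */
	while (1) {
		cpu = cpumask_first(msk);
		if (cpu >= nr_cpu_ids)
			break;

		cluster_mask = topology_cluster_cpumask(cpu);
		/*
		 * An empty cluster mask (seen here when maxcpus exceeds
		 * the online CPU count) makes cpumask_andnot() a no-op
		 * and the loop endless, so fall back to node grouping.
		 */
		if (!cpumask_weight(cluster_mask))
			goto no_cluster;
		/* Clean out CPUs on the same cluster. */
		cpumask_andnot(msk, msk, cluster_mask);
		ncluster++;
	}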

BR
Wangyang



* Re: [PATCH] lib/group_cpus: make group CPU cluster aware
  2026-01-09 22:47       ` Andrew Morton
@ 2026-01-13  2:37         ` Guo, Wangyang
From: Guo, Wangyang @ 2026-01-13  2:37 UTC
  To: Andrew Morton, Radu Rendec
  Cc: Thomas Gleixner, linux-kernel, Tianyou Li, Tim Chen, Dan Liang

On 1/10/2026 6:47 AM, Andrew Morton wrote:
> On Fri, 09 Jan 2026 14:13:32 -0500 Radu Rendec <rrendec@redhat.com> wrote:
> 
>>>>> This patch improves IRQ affinity by grouping CPUs by cluster within each
>>>>> NUMA domain, ensuring better locality between CPUs and their assigned
>>>>> NVMe IRQs.
>>>>>
>>>>> Reviewed-by: Tianyou Li <tianyou.li@intel.com>
>>>>> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
>>>>> Tested-by: Dan Liang <dan.liang@intel.com>
>>>>> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
>>>>
>>>> Patch hasn't attracted additional review so I'll queue this version for
>>>> some testing in mm.git's mm-nonmm-unstable branch.  I'll add a
>>>> note-to-self that a changelog addition is desirable.
>>>
>>>
>>> Thanks a lot for your time and support! Please let me know if you have
>>> any further comments or guidance. Any feedback would be appreciated.
>>
>> With this patch applied, I see a weird issue in a qemu x86_64 vm if I
>> start it with a higher number of max CPUs than active CPUs, for example
>> `-smp 4,maxcpus=8` on the qemu command line.
>>
>> What I see is the `while (1)` loop in alloc_cluster_groups() spinning
>> forever. Removing the `maxcpus=8` from the qemu command line fixes the
>> issue but so does reverting the patch :)
> 
> Great, thanks, I'll drop the patch.
> 
> I have notes here that an updated version is expected anyway.  Perhaps
> due to planned changelog updates.  And perhaps hopes for additional
> info regarding the runtime effects of the change.

This issue has been fixed and the changelog updated in the new version:
https://lore.kernel.org/all/20260113022958.3379650-1-wangyang.guo@intel.com/T/#u

BR
Wangyang

