From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from desiato.infradead.org (desiato.infradead.org [90.155.92.199]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7BC46285C9F for ; Wed, 25 Feb 2026 16:32:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.92.199 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772037179; cv=none; b=Sd4cHchGw8GrHw+RzBw04M+WSQb47q1qvkJP+37dVHxfaek6G2eVsaPTUCdL4CNQ/HQTXq8Psxcg0URQSQhl4imwe5OLj83ZpLjcx9McyCowoxGdsIVPnrqmF4Wv0mF6m3e3KvPcAyHo1d+gG4CoIuP8DC7sSpa8jJDN0H8Cq60= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772037179; c=relaxed/simple; bh=I9luGRtaNsnAIkXfZmW1VSDXW3rCTc1yVsU0WxXjQBU=; h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version: Content-Type:Content-Disposition:In-Reply-To; b=BexfYS0e0h71EFqMnoAYLS/0+IFKPLp18tGEW2bSCl9+uoLwtx9t0BLuOzaerKMY3yKUc1pGLjRD8ky2/jSaYXOHMPxoe69/1BjhJ3pEpEP5Fw9PX/YCa0IklA+pyu7geX4+W/eKYFDre780+H15VXl8PswH5wjnf5qa5Lrj8g0= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=l3u1zmIj; arc=none smtp.client-ip=90.155.92.199 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="l3u1zmIj" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=desiato.20200630; h=In-Reply-To:Content-Type:MIME-Version: References:Message-ID:Subject:Cc:To:From:Date:Sender:Reply-To: 
Content-Transfer-Encoding:Content-ID:Content-Description; bh=qI4A86O5o02WZ19ztIpedZuQ69fuePZjBEMpXPMH4e8=; b=l3u1zmIj2dt/K8WROPQCot1ety w1Mq4keNiHacSA2pDppmmQNY9gNzv3l91cD6QvP81a4+ODZC1TctUJLvklzKpMo0X5vf4VMUVd0Oc ty2GFKhUcQIZvkRF+mBZV9S+Gao33ga7in3Y5S1csjtJWnHAWKI51d8DZrcodDwyNH5/vbmQ45hx6 rkIVw6/Yg2PNUe1lmRaV7UMHoel3TfUBoaDN0kVYSCgMpVn9porqJoN3pcMHfzt4tgZO+/8fjlRIy rib790BdB//k0zyIMMv15mRCmVVmRfx4znTJPb+PCpC4xIgHSUiS0S/8//kVFsgyM10NdDAjCJ4s+ YGyELdNw==; Received: from 2001-1c00-8d85-5700-266e-96ff-fe07-7dcc.cable.dynamic.v6.ziggo.nl ([2001:1c00:8d85:5700:266e:96ff:fe07:7dcc] helo=noisy.programming.kicks-ass.net) by desiato.infradead.org with esmtpsa (Exim 4.98.2 #2 (Red Hat Linux)) id 1vvHoe-00000009QtC-0uSg; Wed, 25 Feb 2026 16:32:48 +0000 Received: by noisy.programming.kicks-ass.net (Postfix, from userid 1000) id DB4A43007C4; Wed, 25 Feb 2026 17:32:46 +0100 (CET) Date: Wed, 25 Feb 2026 17:32:46 +0100 From: Peter Zijlstra To: "Chen, Yu C" Cc: Kyle Meyer , tim.c.chen@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com, mingo@redhat.com, tglx@kernel.org, vinicius.gomes@intel.com, brgerst@gmail.com, hpa@zytor.com, kprateek.nayak@amd.com, linux-kernel@vger.kernel.org, patryk.wlazlyn@linux.intel.com, rafael.j.wysocki@intel.com, russ.anderson@hpe.com, x86@kernel.org, zhao1.liu@intel.com Subject: Re: [PATCH v2] sched/topology: Check average distances to remote packages Message-ID: <20260225163246.GX1395416@noisy.programming.kicks-ass.net> References: <20260223170314.GU1395266@noisy.programming.kicks-ass.net> <20260225123052.GN3016024@noisy.programming.kicks-ass.net> <20260225154409.GD1282955@noisy.programming.kicks-ass.net> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20260225154409.GD1282955@noisy.programming.kicks-ass.net> On Wed, Feb 25, 2026 at 04:44:09PM +0100, Peter Zijlstra wrote: > Yes, so this 
assumes that all u sized clusters on the trace are similar > and 'sane' without verification. That gave me an idea; how's this then? --- diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5cd6950ab672..b1e464fd98c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -513,33 +513,99 @@ static void __init build_sched_topology(void) } #ifdef CONFIG_NUMA -static int sched_avg_remote_distance; -static int avg_remote_numa_distance(void) + +static bool slit_cluster_symmetric(int i, int j, int n) { - int i, j; - int distance, nr_remote, total_distance; + WARN_ON_ONCE((i % n) || (j % n)); - if (sched_avg_remote_distance > 0) - return sched_avg_remote_distance; - - nr_remote = 0; - total_distance = 0; - for_each_node_state(i, N_CPU) { - for_each_node_state(j, N_CPU) { - distance = node_distance(i, j); - - if (distance >= REMOTE_DISTANCE) { - nr_remote++; - total_distance += distance; - } + for (int k = i; k < i + n; k++) { + for (int l = k; l < j + n; l++) { + if (node_distance(k, l) != node_distance(l, k)) + return false; } } - if (nr_remote) - sched_avg_remote_distance = total_distance / nr_remote; - else - sched_avg_remote_distance = REMOTE_DISTANCE; - return sched_avg_remote_distance; + return true; +} + +static bool slit_cluster_match(int i, int j, int x, int y, int n) +{ + WARN_ON_ONCE((i % n) || (j % n) || (x % n) || (y % n)); + + for (int k = 0; k < n; k++) { + for (int l = k; l < n; l++) { + if (node_distance(i + k, j + l) != node_distance(x + k, y + l)) + return false; + } + } + + return true; +} + +/* + * Find the largest symmetric, repeating cluster in an attempt to identify the + * unit size. + */ +static int slit_cluster_size(void) +{ + int nodes = num_possible_nodes(); + + /* + * There are at least 2 packages; so half-nodes is the largest + * possible unit, go down from that. + */ + for (int u = nodes / 2; u; u--) { + /* + * If u doesn't divide nodes, it can't be a unit. 
+ */ + if (nodes % u) + continue; + + /* + * Unit must be symmetric, + */ + if (!slit_cluster_symmetric(0, 0, u)) + continue; + + /* + * and repeating. + */ + if (slit_cluster_match(0, 0, u, u, u)) + return u; + } + + return nodes; +} + +static int slit_cluster_distance(int i, int j) +{ + static int u = 0; + long d = 0; + int x, y; + + if (!u) + u = slit_cluster_size(); + + /* + * Is this a unit cluster on the trace? + */ + if ((i / u) == (j / u)) + return node_distance(i, j); + + /* + * Off-trace cluster, return average of the cluster to force symmetry. + */ + x = i - (i % u); + y = j - (j % u); + + for (i = x; i < x + u; i++) { + for (j = y; j < y + u; j++) { + d += node_distance(i, j); + d += node_distance(j, i); + } + } + + return d / (2*u*u); } int arch_sched_node_distance(int from, int to) @@ -550,8 +616,7 @@ int arch_sched_node_distance(int from, int to) case INTEL_GRANITERAPIDS_X: case INTEL_ATOM_DARKMONT_X: - if (!x86_has_numa_in_package || topology_max_packages() == 1 || - d < REMOTE_DISTANCE) + if (!x86_has_numa_in_package || topology_max_packages() == 1) return d; /* @@ -564,19 +629,8 @@ int arch_sched_node_distance(int from, int to) * in the remote package in the same sched group. * Simplify NUMA domains and avoid extra NUMA levels including * different remote NUMA nodes and local nodes. - * - * GNR and CWF don't expect systems with more than 2 packages - * and more than 2 hops between packages. Single average remote - * distance won't be appropriate if there are more than 2 - * packages as average distance to different remote packages - * could be different. */ - WARN_ONCE(topology_max_packages() > 2, - "sched: Expect only up to 2 packages for GNR or CWF, " - "but saw %d packages when building sched domains.", - topology_max_packages()); - - d = avg_remote_numa_distance(); + return slit_cluster_distance(from, to); } return d; }