linux-doc.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tony Luck <tony.luck@intel.com>
To: Fenghua Yu <fenghua.yu@intel.com>,
	Reinette Chatre <reinette.chatre@intel.com>,
	Peter Newman <peternewman@google.com>,
	Jonathan Corbet <corbet@lwn.net>,
	x86@kernel.org
Cc: Shaopeng Tan <tan.shaopeng@fujitsu.com>,
	James Morse <james.morse@arm.com>,
	Jamie Iles <quic_jiles@quicinc.com>,
	Babu Moger <babu.moger@amd.com>,
	linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org,
	patches@lists.linux.dev, Tony Luck <tony.luck@intel.com>
Subject: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Date: Wed, 21 Jun 2023 10:40:06 -0700	[thread overview]
Message-ID: <20230621174006.42533-8-tony.luck@intel.com> (raw)
In-Reply-To: <20230621174006.42533-1-tony.luck@intel.com>

There isn't a simple hardware enumeration to indicate to software that
a system is running with Sub-NUMA Cluster enabled.

Compare the number of NUMA nodes with the number of L3 caches to calculate
the number of Sub-NUMA nodes per L3 cache.

When Sub-NUMA cluster mode is enabled in BIOS setup the RMID counters
are distributed equally between the SNC nodes within each socket.

E.g. if there are 400 RMID counters, and the system is configured with
two SNC nodes per socket, then RMID counter 0..199 are used on SNC node
0 on the socket, and RMID counter 200..399 on SNC node 1.

A model specific MSR (0xca0) can change the configuration of the RMIDs
when SNC mode is enabled.

The MSR controls the interpretation of the RMID field in the
IA32_PQR_ASSOC MSR so that the appropriate hardware counters
within the SNC node are updated.

Also initialize a per-cpu RMID offset value. Use this
to calculate the value to write to the IA32_QM_EVTSEL MSR when
reading RMID event values.

N.B. this works well for well-behaved NUMA applications that access
memory predominantly from the local memory node. For applications that
access memory across multiple nodes it may be necessary for the user
to read counters for all SNC nodes on a socket and add the values to
get the actual LLC occupancy or memory bandwidth. Perhaps this isn't
all that different from applications that span across multiple sockets
in a legacy system.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/resctrl.h        |  2 +
 arch/x86/kernel/cpu/resctrl/core.c    | 99 ++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/resctrl/monitor.c |  2 +-
 3 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 255a78d9d906..f95e69bacc65 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -35,6 +35,8 @@ DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
 
+DECLARE_PER_CPU(int, rmid_offset);
+
 /*
  * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
  *
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index af3be3c2db96..869cfb46e8e4 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -16,11 +16,14 @@
 
 #define pr_fmt(fmt)	"resctrl: " fmt
 
+#include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpuhotplug.h>
+#include <linux/mod_devicetable.h>
 
+#include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include <asm/resctrl.h>
 #include "internal.h"
@@ -524,6 +527,39 @@ static int get_domain_id(int cpu, enum resctrl_scope scope)
 	}
 }
 
+DEFINE_PER_CPU(int, rmid_offset);
+
+static void set_per_cpu_rmid_offset(int cpu, struct rdt_resource *r)
+{
+	this_cpu_write(rmid_offset, (cpu_to_node(cpu) % snc_ways) * r->num_rmid);
+}
+
+/*
+ * This MSR provides for configuration of RMIDs on Sub-NUMA Cluster
+ * systems.
+ * Bit0 = 1 (default) For legacy configuration
+ * Bit0 = 0 RMIDs are divided evenly between SNC nodes.
+ */
+#define MSR_RMID_SNC_CONFIG   0xCA0
+
+static void snc_add_pkg(void)
+{
+	u64	msrval;
+
+	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+	msrval |= BIT_ULL(0);
+	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+}
+
+static void snc_remove_pkg(void)
+{
+	u64	msrval;
+
+	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+	msrval &= ~BIT_ULL(0);
+	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+}
+
 /*
  * domain_add_cpu - Add a cpu to a resource's domain list.
  *
@@ -555,6 +591,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		cpumask_set_cpu(cpu, &d->cpu_mask);
 		if (r->cache.arch_has_per_cpu_cfg)
 			rdt_domain_reconfigure_cdp(r);
+		if (r->mon_capable)
+			set_per_cpu_rmid_offset(cpu, r);
 		return;
 	}
 
@@ -573,11 +611,17 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		return;
 	}
 
-	if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
-		domain_free(hw_dom);
-		return;
+	if (r->mon_capable) {
+		if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+			domain_free(hw_dom);
+			return;
+		}
+		set_per_cpu_rmid_offset(cpu, r);
 	}
 
+	if (r->pkg_actions)
+		snc_add_pkg();
+
 	list_add_tail(&d->list, add_pos);
 
 	err = resctrl_online_domain(r, d);
@@ -613,6 +657,9 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 			d->plr->d = NULL;
 		domain_free(hw_dom);
 
+		if (r->pkg_actions)
+			snc_remove_pkg();
+
 		return;
 	}
 
@@ -899,11 +946,57 @@ static __init bool get_rdt_resources(void)
 	return (rdt_mon_capable || rdt_alloc_capable);
 }
 
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 0),
+	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, 0),
+	{}
+};
+
+/*
+ * There isn't a simple enumeration bit to show whether SNC mode
+ * is enabled. Look at the ratio of number of NUMA nodes to the
+ * number of distinct L3 caches. Take care to skip memory-only nodes.
+ */
+static __init int find_snc_ways(void)
+{
+	unsigned long *node_caches;
+	int mem_only_nodes = 0;
+	int cpu, node, ret;
+
+	if (!x86_match_cpu(snc_cpu_ids))
+		return 1;
+
+	node_caches = kcalloc(BITS_TO_LONGS(nr_node_ids), sizeof(*node_caches), GFP_KERNEL);
+	if (!node_caches)
+		return 1;
+
+	cpus_read_lock();
+	for_each_node(node) {
+		cpu = cpumask_first(cpumask_of_node(node));
+		if (cpu < nr_cpu_ids)
+			set_bit(get_cpu_cacheinfo_id(cpu, 3), node_caches);
+		else
+			mem_only_nodes++;
+	}
+	cpus_read_unlock();
+
+	ret = (nr_node_ids - mem_only_nodes) / bitmap_weight(node_caches, nr_node_ids);
+	kfree(node_caches);
+
+	if (ret > 1)
+		rdt_resources_all[RDT_RESOURCE_PKG].r_resctrl.pkg_actions = true;
+
+	return ret;
+}
+
 static __init void rdt_init_res_defs_intel(void)
 {
 	struct rdt_hw_resource *hw_res;
 	struct rdt_resource *r;
 
+	snc_ways = find_snc_ways();
+
 	for_each_rdt_resource(r) {
 		hw_res = resctrl_to_arch_res(r);
 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index da3f36212898..74db99d299e1 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -160,7 +160,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
 	 * are error bits.
 	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid + this_cpu_read(rmid_offset));
 	rdmsrl(MSR_IA32_QM_CTR, msr_val);
 
 	if (msr_val & RMID_VAL_ERROR)
-- 
2.40.1


  parent reply	other threads:[~2023-06-21 17:40 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-21 17:39 [PATCH v2 0/7] x86/resctrl: Add support for Sub-NUMA cluster (SNC) systems Tony Luck
2023-06-21 17:40 ` [PATCH v2 1/7] x86/resctrl: Refactor in preparation for node-scoped resources Tony Luck
2023-06-21 17:40 ` [PATCH v2 2/7] x86/resctrl: Remove hard code of RDT_RESOURCE_L3 in monitor.c Tony Luck
2023-06-21 17:40 ` [PATCH v2 3/7] x86/resctrl: Add a new node-scoped resource to rdt_resources_all[] Tony Luck
2023-06-21 17:40 ` [PATCH v2 4/7] x86/resctrl: Add code to setup monitoring at L3 or NODE scope Tony Luck
2023-06-21 17:40 ` [PATCH v2 5/7] x86/resctrl: Add package scoped resource Tony Luck
2023-06-29  7:38   ` Shaopeng Tan (Fujitsu)
2023-06-21 17:40 ` [PATCH v2 6/7] x86/resctrl: Update documentation with Sub-NUMA cluster changes Tony Luck
2023-06-21 17:52   ` Randy Dunlap
2023-06-21 17:40 ` Tony Luck [this message]
2023-06-22 14:25   ` [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize Peter Newman
2023-06-22 16:01     ` Luck, Tony
2023-06-23 15:19       ` Peter Newman
2023-06-23 20:20         ` Tony Luck
2023-06-26 12:18           ` Peter Newman
2023-06-26 12:47   ` Peter Newman
2023-06-26 15:52     ` Luck, Tony
2023-06-26 16:20       ` Luck, Tony
2023-06-28 13:43       ` Peter Newman
2023-06-29  7:34   ` Shaopeng Tan (Fujitsu)
2023-06-29 15:56     ` Luck, Tony
2023-06-29  8:13 ` [PATCH v2 0/7] x86/resctrl: Add support for Sub-NUMA cluster (SNC) systems Shaopeng Tan (Fujitsu)
2023-06-29 16:05   ` Luck, Tony
2023-07-11 20:50     ` Reinette Chatre
2023-07-11 21:23       ` Tony Luck
2023-07-11 21:55         ` Reinette Chatre

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230621174006.42533-8-tony.luck@intel.com \
    --to=tony.luck@intel.com \
    --cc=babu.moger@amd.com \
    --cc=corbet@lwn.net \
    --cc=fenghua.yu@intel.com \
    --cc=james.morse@arm.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=patches@lists.linux.dev \
    --cc=peternewman@google.com \
    --cc=quic_jiles@quicinc.com \
    --cc=reinette.chatre@intel.com \
    --cc=tan.shaopeng@fujitsu.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).