* [PATCH 01/11] x86/intel_cqm: Modify hot cpu notification handling
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 02/11] x86/intel_rapl: " Vikas Shivappa
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
- In cqm_pick_event_reader, use the existing package<->core map instead
of looping through all cpus in cqm_cpumask.
- In intel_cqm_cpu_exit, use the same map instead of looping through
all online cpus. In large systems with a large number of cpus the time
taken to loop may be expensive, and it increases linearly with the cpu count.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 34 +++++++++++++++---------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8..93e54ad 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -62,6 +62,12 @@ static LIST_HEAD(cache_groups);
*/
static cpumask_t cqm_cpumask;
+/*
+ * Temporary cpumask used during hot cpu notification handling. The usage
+ * is serialized by hot cpu locks.
+ */
+static cpumask_t tmp_cpumask;
+
#define RMID_VAL_ERROR (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)
@@ -1244,15 +1250,13 @@ static struct pmu intel_cqm_pmu = {
static inline void cqm_pick_event_reader(int cpu)
{
- int phys_id = topology_physical_package_id(cpu);
- int i;
+ cpumask_and(&tmp_cpumask, &cqm_cpumask, topology_core_cpumask(cpu));
- for_each_cpu(i, &cqm_cpumask) {
- if (phys_id == topology_physical_package_id(i))
- return; /* already got reader for this socket */
- }
-
- cpumask_set_cpu(cpu, &cqm_cpumask);
+ /*
+ * Pick a reader if there isn't one already.
+ */
+ if (cpumask_empty(&tmp_cpumask))
+ cpumask_set_cpu(cpu, &cqm_cpumask);
}
static void intel_cqm_cpu_starting(unsigned int cpu)
@@ -1270,7 +1274,6 @@ static void intel_cqm_cpu_starting(unsigned int cpu)
static void intel_cqm_cpu_exit(unsigned int cpu)
{
- int phys_id = topology_physical_package_id(cpu);
int i;
/*
@@ -1279,15 +1282,12 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
return;
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
+ cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
+ cpumask_clear_cpu(cpu, &tmp_cpumask);
+ i = cpumask_any(&tmp_cpumask);
- if (phys_id == topology_physical_package_id(i)) {
- cpumask_set_cpu(i, &cqm_cpumask);
- break;
- }
- }
+ if (i < nr_cpu_ids)
+ cpumask_set_cpu(i, &cqm_cpumask);
}
static int intel_cqm_cpu_notifier(struct notifier_block *nb,
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 02/11] x86/intel_rapl: Modify hot cpu notification handling
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
2015-09-09 19:24 ` [PATCH 01/11] x86/intel_cqm: Modify hot cpu notification handling Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 03/11] x86/intel_rdt: Cache Allocation documentation Vikas Shivappa
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
- In rapl_cpu_init, use the existing package<->core map instead of
looping through all cpus in rapl_cpumask.
- In rapl_cpu_exit, use the same mapping instead of looping all online
cpus. In large systems with a large number of cpus the time taken to
loop may be expensive, and it increases linearly with the cpu count.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel_rapl.c | 35 ++++++++++++++---------------
1 file changed, 17 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 5cbd4e6..3f3fb4c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -132,6 +132,12 @@ static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
+/*
+ * Temporary cpumask used during hot cpu notification handling. The usage
+ * is serialized by hot cpu locks.
+ */
+static cpumask_t tmp_cpumask;
+
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
@@ -523,18 +529,16 @@ static struct pmu rapl_pmu_class = {
static void rapl_cpu_exit(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
- int i, phys_id = topology_physical_package_id(cpu);
int target = -1;
+ int i;
/* find a new cpu on same package */
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
- if (phys_id == topology_physical_package_id(i)) {
- target = i;
- break;
- }
- }
+ cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
+ cpumask_clear_cpu(cpu, &tmp_cpumask);
+ i = cpumask_any(&tmp_cpumask);
+ if (i < nr_cpu_ids)
+ target = i;
+
/*
* clear cpu from cpumask
* if was set in cpumask and still some cpu on package,
@@ -556,15 +560,10 @@ static void rapl_cpu_exit(int cpu)
static void rapl_cpu_init(int cpu)
{
- int i, phys_id = topology_physical_package_id(cpu);
-
- /* check if phys_is is already covered */
- for_each_cpu(i, &rapl_cpu_mask) {
- if (phys_id == topology_physical_package_id(i))
- return;
- }
- /* was not found, so add it */
- cpumask_set_cpu(cpu, &rapl_cpu_mask);
+ /* check if cpu's package is already covered. If not, add it. */
+ cpumask_and(&tmp_cpumask, &rapl_cpu_mask, topology_core_cpumask(cpu));
+ if (cpumask_empty(&tmp_cpumask))
+ cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
static __init void rapl_hsw_server_quirk(void)
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 03/11] x86/intel_rdt: Cache Allocation documentation
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
2015-09-09 19:24 ` [PATCH 01/11] x86/intel_cqm: Modify hot cpu notification handling Vikas Shivappa
2015-09-09 19:24 ` [PATCH 02/11] x86/intel_rapl: " Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 04/11] x86/intel_rdt: Add support for Cache Allocation detection Vikas Shivappa
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
Adds a description of Cache allocation technology, overview of kernel
framework implementation. The framework has APIs to manage class of
service, capacity bitmask(CBM), scheduling support and other
architecture specific implementation. The APIs are used to build the
cgroup interface in later patches.
Cache allocation is a sub-feature of Resource Director Technology (RDT)
or Platform Shared resource control which provides support to control
Platform shared resources like L3 cache.
Cache Allocation Technology provides a way for the Software (OS/VMM) to
restrict cache allocation to a defined 'subset' of cache which may be
overlapping with other 'subsets'. This feature is used when allocating a
line in cache ie when pulling new data into the cache. The tasks are
grouped into CLOS (class of service). OS uses MSR writes to indicate the
CLOSid of the thread when scheduling in and to indicate the cache
capacity associated with the CLOSid. Currently cache allocation is
supported for L3 cache.
More information can be found in the Intel SDM June 2015, Volume 3,
section 17.16.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
Documentation/x86/intel_rdt.txt | 109 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 109 insertions(+)
create mode 100644 Documentation/x86/intel_rdt.txt
diff --git a/Documentation/x86/intel_rdt.txt b/Documentation/x86/intel_rdt.txt
new file mode 100644
index 0000000..05ec819
--- /dev/null
+++ b/Documentation/x86/intel_rdt.txt
@@ -0,0 +1,109 @@
+ Intel RDT
+ ---------
+
+Copyright (C) 2014 Intel Corporation
+Written by vikas.shivappa@linux.intel.com
+
+CONTENTS:
+=========
+
+1. Cache Allocation Technology
+ 1.1 What is RDT and Cache allocation ?
+ 1.2 Why is Cache allocation needed ?
+ 1.3 Cache allocation implementation overview
+ 1.4 Assignment of CBM and CLOS
+ 1.5 Scheduling and Context Switch
+
+1. Cache Allocation Technology
+===================================
+
+1.1 What is RDT and Cache allocation
+------------------------------------
+
+Cache allocation is a sub-feature of Resource Director Technology (RDT)
+Allocation or Platform Shared resource control which provides support to
+control Platform shared resources like L3 cache. Currently L3 Cache is
+the only resource that is supported in RDT. More information can be
+found in the Intel SDM June 2015, Volume 3, section 17.16.
+
+Cache Allocation Technology provides a way for the Software (OS/VMM) to
+restrict cache allocation to a defined 'subset' of cache which may be
+overlapping with other 'subsets'. This feature is used when allocating a
+line in cache ie when pulling new data into the cache. The programming
+of the h/w is done via programming MSRs.
+
+The different cache subsets are identified by CLOS identifier (class of
+service) and each CLOS has a CBM (cache bit mask). The CBM is a
+contiguous set of bits which defines the amount of cache resource that
+is available for each 'subset'.
+
+1.2 Why is Cache allocation needed
+----------------------------------
+
+In today's new processors the number of cores is continuously increasing,
+especially in large scale usage models where VMs are used, like
+webservers and datacenters. An increase in the number of cores increases
+the number of threads or workloads that can be run simultaneously. When
+multi-threaded-applications, VMs, workloads run concurrently they
+compete for shared resources including L3 cache.
+
+The architecture also allows dynamically changing these subsets during
+runtime to further optimize the performance of the higher priority
+application with minimal degradation to the low priority app.
+Additionally, resources can be rebalanced for system throughput benefit.
+
+This technique may be useful in managing large computer server systems
+with large L3 cache, in the cloud and container context. Examples may be
+large servers running instances of webservers or database servers. In
+such complex systems, these subsets can be used for more careful placing
+of the available cache resources by a centralized root accessible
+interface.
+
+A specific use case may be to solve the noisy neighbour issue when an app
+which is constantly copying data, like a streaming app, is using a large
+amount of cache which could have otherwise been used by a high priority
+computing application. Using the cache allocation feature, the streaming
+application can be confined to use a smaller cache and the high priority
+application be awarded a larger amount of cache space.
+
+1.3 Cache allocation implementation Overview
+--------------------------------------------
+
+Kernel has a new field in the task_struct called 'closid' which
+represents the Class of service ID of the task.
+
+There is a 1:1 CLOSid <-> CBM (capacity bit mask) mapping. A CLOS (Class
+of service) is represented by a CLOSid. Each closid would have one CBM
+and would just represent one cache 'subset'. The tasks would get to
+fill the L3 cache represented by the capacity bit mask or CBM.
+
+The APIs to manage the closid and CBM can be used to develop user
+interfaces.
+
+1.4 Assignment of CBM, CLOS
+---------------------------
+
+The framework provides APIs to manage the closid and CBM which can be
+used to develop user/kernel mode interfaces.
+
+1.5 Scheduling and Context Switch
+---------------------------------
+
+During a context switch the kernel applies the task's cache allocation by
+writing the CLOSid of the task to the CPU's IA32_PQR_ASSOC MSR. The MSR is
+only written when there is a change in the CLOSid for the CPU in order to
+minimize the latency incurred during context switch.
+
+The following measures are taken for the PQR MSR write so that it
+has minimal impact on the scheduling hot path:
+ - This path doesn't exist on any non-intel platforms.
+ - On Intel platforms, this would not exist by default unless INTEL_RDT
+ is enabled.
+ - remains a no-op when INTEL_RDT is enabled and intel hardware does
+ not support the feature.
+ - When feature is available, does not do MSR write till the user
+ starts using the feature *and* assigns a new cache capacity mask.
+ - per cpu PQR values are cached and the MSR write is only done when
+ a task with a different PQR is scheduled on the CPU. Typically
+ if the task groups are bound to be scheduled on a set of CPUs, the
+ number of MSR writes is greatly reduced.
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 04/11] x86/intel_rdt: Add support for Cache Allocation detection
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (2 preceding siblings ...)
2015-09-09 19:24 ` [PATCH 03/11] x86/intel_rdt: Cache Allocation documentation Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 05/11] x86/intel_rdt: Add Class of service management Vikas Shivappa
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
This patch includes CPUID enumeration routines for Cache allocation and
new values to track resources to the cpuinfo_x86 structure.
Cache allocation provides a way for the Software (OS/VMM) to restrict
cache allocation to a defined 'subset' of cache which may be overlapping
with other 'subsets'. This feature is used when allocating a line in
cache ie when pulling new data into the cache. The programming of the
hardware is done via programming MSRs (model specific registers).
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/include/asm/cpufeature.h | 6 +++++-
arch/x86/include/asm/processor.h | 3 +++
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 15 +++++++++++++++
arch/x86/kernel/cpu/intel_rdt.c | 40 +++++++++++++++++++++++++++++++++++++++
init/Kconfig | 12 ++++++++++++
6 files changed, 76 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/kernel/cpu/intel_rdt.c
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606f..ae5ae9d 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/disabled-features.h>
#endif
-#define NCAPINTS 13 /* N 32-bit words worth of info */
+#define NCAPINTS 14 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -229,6 +229,7 @@
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT ( 9*32+15) /* Resource Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
@@ -252,6 +253,9 @@
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+/* Intel-defined CPU features, CPUID level 0x00000010:0 (ebx), word 13 */
+#define X86_FEATURE_CAT_L3 (13*32 + 1) /* Cache Allocation L3 */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 944f178..0a1a1bc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -120,6 +120,9 @@ struct cpuinfo_x86 {
int x86_cache_occ_scale; /* scale to bytes */
int x86_power;
unsigned long loops_per_jiffy;
+ /* Cache Allocation values: */
+ u16 x86_cache_max_cbm_len;
+ u16 x86_cache_max_closid;
/* cpuid returned max cores value: */
u16 x86_max_cores;
u16 apicid;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 9bff687..a6ea2b4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_nhmex.o
endif
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb9e5df..5bb46d9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -653,6 +653,21 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
}
}
+ /* Additional Intel-defined flags: level 0x00000010 */
+ if (c->cpuid_level >= 0x00000010) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x00000010, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[13] = ebx;
+
+ if (cpu_has(c, X86_FEATURE_CAT_L3)) {
+
+ cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_cache_max_closid = edx + 1;
+ c->x86_cache_max_cbm_len = eax + 1;
+ }
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 0000000..f49e970
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,40 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2014 Intel Corporation
+ *
+ * 2015-05-25 Written by
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2015, volume 3, section 17.15.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+
+static int __init intel_rdt_late_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (!cpu_has(c, X86_FEATURE_CAT_L3))
+ return -ENODEV;
+
+ pr_info("Intel cache allocation detected\n");
+
+ return 0;
+}
+
+late_initcall(intel_rdt_late_init);
diff --git a/init/Kconfig b/init/Kconfig
index af09b4f..6a067ca 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -937,6 +937,18 @@ menuconfig CGROUPS
Say N if unsure.
+config INTEL_RDT
+ bool "Intel Resource Director Technology support"
+ depends on X86_64 && CPU_SUP_INTEL
+ help
+ This option provides support for Cache allocation which is a
+ sub-feature of Intel Resource Director Technology(RDT).
+ Current implementation supports L3 cache allocation.
+ Using this feature a user can specify the amount of L3 cache space
+ into which an application can fill.
+
+ Say N if unsure.
+
if CGROUPS
config CGROUP_DEBUG
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 05/11] x86/intel_rdt: Add Class of service management
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (3 preceding siblings ...)
2015-09-09 19:24 ` [PATCH 04/11] x86/intel_rdt: Add support for Cache Allocation detection Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 06/11] x86/intel_rdt: Add L3 cache capacity bitmask management Vikas Shivappa
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
Adds some data-structures and APIs to support Class of service
management(closid). There is a new clos_cbm table which keeps a 1:1
mapping between closid and capacity bit mask (cbm)
and a count of usage of closid. Each task would be associated with a
Closid at a time and this patch adds a new field closid to task_struct
to keep track of the same.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/include/asm/intel_rdt.h | 12 ++++++
arch/x86/kernel/cpu/intel_rdt.c | 85 +++++++++++++++++++++++++++++++++++++++-
include/linux/sched.h | 3 ++
3 files changed, 98 insertions(+), 2 deletions(-)
create mode 100644 arch/x86/include/asm/intel_rdt.h
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
new file mode 100644
index 0000000..88b7643
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -0,0 +1,12 @@
+#ifndef _RDT_H_
+#define _RDT_H_
+
+#ifdef CONFIG_INTEL_RDT
+
+struct clos_cbm_table {
+ unsigned long l3_cbm;
+ unsigned int clos_refcnt;
+};
+
+#endif
+#endif
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index f49e970..cc988b1 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -24,17 +24,98 @@
#include <linux/slab.h>
#include <linux/err.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * cctable maintains 1:1 mapping between CLOSid and cache bitmask.
+ */
+static struct clos_cbm_table *cctable;
+/*
+ * closid availability bit map.
+ */
+unsigned long *closmap;
+static DEFINE_MUTEX(rdt_group_mutex);
+
+static inline void closid_get(u32 closid)
+{
+ struct clos_cbm_table *cct = &cctable[closid];
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ cct->clos_refcnt++;
+}
+
+static int closid_alloc(u32 *closid)
+{
+ u32 maxid;
+ u32 id;
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ maxid = boot_cpu_data.x86_cache_max_closid;
+ id = find_first_zero_bit(closmap, maxid);
+ if (id == maxid)
+ return -ENOSPC;
+
+ set_bit(id, closmap);
+ closid_get(id);
+ *closid = id;
+
+ return 0;
+}
+
+static inline void closid_free(u32 closid)
+{
+ clear_bit(closid, closmap);
+ cctable[closid].l3_cbm = 0;
+}
+
+static void closid_put(u32 closid)
+{
+ struct clos_cbm_table *cct = &cctable[closid];
+
+ lockdep_assert_held(&rdt_group_mutex);
+ if (WARN_ON(!cct->clos_refcnt))
+ return;
+
+ if (!--cct->clos_refcnt)
+ closid_free(closid);
+}
static int __init intel_rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 maxid, max_cbm_len;
+ int err = 0, size, i;
if (!cpu_has(c, X86_FEATURE_CAT_L3))
return -ENODEV;
- pr_info("Intel cache allocation detected\n");
+ maxid = c->x86_cache_max_closid;
+ max_cbm_len = c->x86_cache_max_cbm_len;
- return 0;
+ size = maxid * sizeof(struct clos_cbm_table);
+ cctable = kzalloc(size, GFP_KERNEL);
+ if (!cctable) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ for (i = 0; i < maxid; i++)
+ cctable[i].l3_cbm = (1ULL << max_cbm_len) - 1;
+
+ size = BITS_TO_LONGS(maxid) * sizeof(long);
+ closmap = kzalloc(size, GFP_KERNEL);
+ if (!closmap) {
+ kfree(cctable);
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ pr_info("Intel cache allocation enabled\n");
+out_err:
+
+ return err;
}
late_initcall(intel_rdt_late_init);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04b5ada..b1b5bf7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1636,6 +1636,9 @@ struct task_struct {
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
+#ifdef CONFIG_INTEL_RDT
+ u32 closid;
+#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 06/11] x86/intel_rdt: Add L3 cache capacity bitmask management
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (4 preceding siblings ...)
2015-09-09 19:24 ` [PATCH 05/11] x86/intel_rdt: Add Class of service management Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 07/11] x86/intel_rdt: Implement scheduling support for Intel RDT Vikas Shivappa
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
This patch adds different APIs to manage the L3 cache capacity bitmask.
The capacity bit mask(CBM) needs to have only contiguous bits set. The
current implementation has a global CBM for each class of service id.
There are APIs added to update the CBM via MSR write to IA32_L3_MASK_n
on all packages. Other APIs are to read and write entries to the
clos_cbm_table.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/include/asm/intel_rdt.h | 4 ++
arch/x86/kernel/cpu/intel_rdt.c | 122 +++++++++++++++++++++++++++++++++++++++
2 files changed, 126 insertions(+)
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 88b7643..4f45dc8 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -3,6 +3,10 @@
#ifdef CONFIG_INTEL_RDT
+#define MAX_CBM_LENGTH 32
+#define IA32_L3_CBM_BASE 0xc90
+#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
+
struct clos_cbm_table {
unsigned long l3_cbm;
unsigned int clos_refcnt;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index cc988b1..c9db0ed 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -34,6 +34,15 @@ static struct clos_cbm_table *cctable;
* closid availability bit map.
*/
unsigned long *closmap;
+/*
+ * Mask of CPUs for writing CBM values. We only need one CPU per-socket.
+ */
+static cpumask_t rdt_cpumask;
+/*
+ * Temporary cpumask used during hot cpu notificaiton handling. The usage
+ * is serialized by hot cpu locks.
+ */
+static cpumask_t tmp_cpumask;
static DEFINE_MUTEX(rdt_group_mutex);
static inline void closid_get(u32 closid)
@@ -82,6 +91,117 @@ static void closid_put(u32 closid)
closid_free(closid);
}
+static bool cbm_validate(unsigned long var)
+{
+ u32 max_cbm_len = boot_cpu_data.x86_cache_max_cbm_len;
+ unsigned long first_bit, zero_bit;
+ u64 max_cbm;
+
+ if (bitmap_weight(&var, max_cbm_len) < 1)
+ return false;
+
+ max_cbm = (1ULL << max_cbm_len) - 1;
+ if (var & ~max_cbm)
+ return false;
+
+ first_bit = find_first_bit(&var, max_cbm_len);
+ zero_bit = find_next_zero_bit(&var, max_cbm_len, first_bit);
+
+ if (find_next_bit(&var, max_cbm_len, zero_bit) < max_cbm_len)
+ return false;
+
+ return true;
+}
+
+static int clos_cbm_table_read(u32 closid, unsigned long *l3_cbm)
+{
+ u32 maxid = boot_cpu_data.x86_cache_max_closid;
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ if (closid >= maxid)
+ return -EINVAL;
+
+ *l3_cbm = cctable[closid].l3_cbm;
+
+ return 0;
+}
+
+/*
+ * clos_cbm_table_update() - Update a clos cbm table entry.
+ * @closid: the closid whose cbm needs to be updated
+ * @cbm: the new cbm value that has to be updated
+ *
+ * This assumes the cbm is validated as per the interface requirements
+ * and the cache allocation requirements(through the cbm_validate).
+ */
+static int clos_cbm_table_update(u32 closid, unsigned long cbm)
+{
+ u32 maxid = boot_cpu_data.x86_cache_max_closid;
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ if (closid >= maxid)
+ return -EINVAL;
+
+ cctable[closid].l3_cbm = cbm;
+
+ return 0;
+}
+
+static bool cbm_search(unsigned long cbm, u32 *closid)
+{
+ u32 maxid = boot_cpu_data.x86_cache_max_closid;
+ u32 i;
+
+ for (i = 0; i < maxid; i++) {
+ if (cctable[i].clos_refcnt &&
+ bitmap_equal(&cbm, &cctable[i].l3_cbm, MAX_CBM_LENGTH)) {
+ *closid = i;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void closcbm_map_dump(void)
+{
+ u32 i;
+
+ pr_debug("CBMMAP\n");
+ for (i = 0; i < boot_cpu_data.x86_cache_max_closid; i++) {
+ pr_debug("l3_cbm: 0x%x,clos_refcnt: %u\n",
+ (unsigned int)cctable[i].l3_cbm, cctable[i].clos_refcnt);
+ }
+}
+
+static void cbm_cpu_update(void *info)
+{
+ u32 closid = (u32) info;
+
+ wrmsrl(CBM_FROM_INDEX(closid), cctable[closid].l3_cbm);
+}
+
+/*
+ * cbm_update_all() - Update the cache bit mask for all packages.
+ */
+static void cbm_update_all(u32 closid)
+{
+ on_each_cpu_mask(&rdt_cpumask, cbm_cpu_update, (void *)closid, 1);
+}
+
+static inline bool rdt_cpumask_update(int cpu)
+{
+ cpumask_and(&tmp_cpumask, &rdt_cpumask, topology_core_cpumask(cpu));
+ if (cpumask_empty(&tmp_cpumask)) {
+ cpumask_set_cpu(cpu, &rdt_cpumask);
+ return true;
+ }
+
+ return false;
+}
+
static int __init intel_rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -112,6 +232,8 @@ static int __init intel_rdt_late_init(void)
goto out_err;
}
+ for_each_online_cpu(i)
+ rdt_cpumask_update(i);
pr_info("Intel cache allocation enabled\n");
out_err:
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 07/11] x86/intel_rdt: Implement scheduling support for Intel RDT
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (5 preceding siblings ...)
2015-09-09 19:24 ` [PATCH 06/11] x86/intel_rdt: Add L3 cache capacity bitmask management Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:24 ` [PATCH 08/11] x86/intel_rdt: Hot cpu support for Cache Allocation Vikas Shivappa
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
Adds support for IA32_PQR_ASSOC MSR writes during task scheduling. For
Cache Allocation, MSR write would let the task fill in the cache
'subset' represented by the task's capacity bit mask.
The high 32 bits in the per processor MSR IA32_PQR_ASSOC represent the
CLOSid. During context switch the kernel implements this by writing the
CLOSid of the task to the CPU's IA32_PQR_ASSOC MSR.
This patch also implements a common software cache for IA32_PQR_MSR
(RMID 0:9, CLOSId 32:63) to be used by both Cache monitoring (CMT) and
Cache allocation. CMT updates the RMID whereas cache_alloc updates the
CLOSid in the software cache. During scheduling when the new RMID/CLOSid
value is different from the cached values, IA32_PQR_MSR is updated.
Since the measured rdmsr latency for IA32_PQR_MSR is very high (~250
cycles) this software cache is necessary to avoid reading the MSR to
compare the current CLOSid value.
The following considerations are done for the PQR MSR write so that it
minimally impacts scheduler hot path:
- This path does not exist on any non-intel platforms.
- On Intel platforms, this would not exist by default unless INTEL_RDT
is enabled.
- remains a no-op when INTEL_RDT is enabled and intel SKU does not
support the feature.
- When feature is available and enabled, never does MSR write till the
user manually starts using one of the capacity bit masks.
- MSR write is only done when a task with a different CLOSid is
scheduled on the CPU. Typically if the task groups are bound to be
scheduled on a set of CPUs, the number of MSR writes is greatly
reduced.
- A per CPU cache of CLOSids is maintained to do the check so that we
don't have to do a rdmsr which actually costs a lot of cycles.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/include/asm/intel_rdt.h | 26 ++++++++++++++++++++++++++
arch/x86/include/asm/pqr_common.h | 27 +++++++++++++++++++++++++++
arch/x86/kernel/cpu/intel_rdt.c | 17 +++++++++++++++++
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 26 +++-----------------------
arch/x86/kernel/process_64.c | 6 ++++++
5 files changed, 79 insertions(+), 23 deletions(-)
create mode 100644 arch/x86/include/asm/pqr_common.h
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 4f45dc8..384add9 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -7,10 +7,36 @@
#define IA32_L3_CBM_BASE 0xc90
#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
+extern struct static_key rdt_enable_key;
+void __intel_rdt_sched_in(void);
+
struct clos_cbm_table {
unsigned long l3_cbm;
unsigned int clos_refcnt;
};
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports L3 cache allocation.
+ * - Caches the per cpu CLOSid values and does the MSR write only
+ * when a task with a different CLOSid is scheduled in.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+ /*
+ * Call the schedule in code only when RDT is enabled.
+ */
+ if (static_key_false(&rdt_enable_key))
+ __intel_rdt_sched_in();
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
#endif
#endif
diff --git a/arch/x86/include/asm/pqr_common.h b/arch/x86/include/asm/pqr_common.h
new file mode 100644
index 0000000..11e985c
--- /dev/null
+++ b/arch/x86/include/asm/pqr_common.h
@@ -0,0 +1,27 @@
+#ifndef _X86_RDT_H_
+#define _X86_RDT_H_
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid: The cached Resource Monitoring ID
+ * @closid: The cached Class Of Service ID
+ * @rmid_usecnt: The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 rmid;
+ u32 closid;
+ int rmid_usecnt;
+};
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+#endif
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index c9db0ed..1f06c68 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -24,6 +24,8 @@
#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/sched.h>
+#include <asm/pqr_common.h>
#include <asm/intel_rdt.h>
/*
@@ -44,6 +46,19 @@ static cpumask_t rdt_cpumask;
*/
static cpumask_t tmp_cpumask;
static DEFINE_MUTEX(rdt_group_mutex);
+struct static_key __read_mostly rdt_enable_key = STATIC_KEY_INIT_FALSE;
+
+void __intel_rdt_sched_in(void)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u32 closid = current->closid;
+
+ if (closid == state->closid)
+ return;
+
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
+ state->closid = closid;
+}
static inline void closid_get(u32 closid)
{
@@ -234,6 +249,8 @@ static int __init intel_rdt_late_init(void)
for_each_online_cpu(i)
rdt_cpumask_update(i);
+
+ static_key_slow_inc(&rdt_enable_key);
pr_info("Intel cache allocation enabled\n");
out_err:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 93e54ad..04a696f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -7,41 +7,22 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
+#include <asm/pqr_common.h>
#include "perf_event.h"
-#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid: The cached Resource Monitoring ID
- * @closid: The cached Class Of Service ID
- * @rmid_usecnt: The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
- u32 rmid;
- u32 closid;
- int rmid_usecnt;
-};
-
/*
* The cached intel_pqr_state is strictly per CPU and can never be
* updated from a remote CPU. Both functions which modify the state
* (intel_cqm_event_start and intel_cqm_event_stop) are called with
* interrupts disabled, which is sufficient for the protection.
*/
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -408,9 +389,9 @@ static void __intel_cqm_event_count(void *info);
*/
static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
- struct perf_event *event;
struct list_head *head = &group->hw.cqm_group_entry;
u32 old_rmid = group->hw.cqm_rmid;
+ struct perf_event *event;
lockdep_assert_held(&cache_mutex);
@@ -1265,7 +1246,6 @@ static void intel_cqm_cpu_starting(unsigned int cpu)
struct cpuinfo_x86 *c = &cpu_data(cpu);
state->rmid = 0;
- state->closid = 0;
state->rmid_usecnt = 0;
WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f6b9163..8c42b64 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -48,6 +48,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/intel_rdt.h>
asmlinkage extern void ret_from_fork(void);
@@ -445,6 +446,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
loadsegment(ss, __KERNEL_DS);
}
+ /*
+ * Load the Intel cache allocation PQR MSR.
+ */
+ intel_rdt_sched_in();
+
return prev_p;
}
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 08/11] x86/intel_rdt: Hot cpu support for Cache Allocation
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (6 preceding siblings ...)
2015-09-09 19:24 ` [PATCH 07/11] x86/intel_rdt: Implement scheduling support for Intel RDT Vikas Shivappa
@ 2015-09-09 19:24 ` Vikas Shivappa
2015-09-09 19:25 ` [PATCH 09/11] x86/intel_rdt: Intel haswell Cache Allocation enumeration Vikas Shivappa
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:24 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
This patch adds hot cpu support for Intel Cache allocation. Support
includes updating the cache bitmask MSRs IA32_L3_QOS_n when a new CPU
package comes online or goes offline. The IA32_L3_QOS_n MSRs are one per
Class of service on each CPU package. The new package's MSRs are
synchronized with the values of existing MSRs. Also the software cache
for the IA32_PQR_ASSOC MSR is reset during hot cpu notifications.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/kernel/cpu/intel_rdt.c | 72 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 1f06c68..38fa6ac 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/cpu.h>
#include <linux/sched.h>
#include <asm/pqr_common.h>
#include <asm/intel_rdt.h>
@@ -217,6 +218,71 @@ static inline bool rdt_cpumask_update(int cpu)
return false;
}
+/*
+ * cbm_update_msrs() - Updates all the existing IA32_L3_MASK_n MSRs
+ * which are one per CLOSid on the current package.
+ *
+ * Walks every CLOSid below x86_cache_max_closid and hands each one that
+ * has a non-zero reference count in cctable to cbm_cpu_update().
+ * @info is unused; it is only present so that this function can be used
+ * as an smp_call_function_single() callback.
+ */
+static void cbm_update_msrs(void *info)
+{
+ unsigned int maxid = boot_cpu_data.x86_cache_max_closid;
+ unsigned int i;
+
+ for (i = 0; i < maxid; i++) {
+ /*
+ * Widen to unsigned long before the pointer cast: int and
+ * void * differ in size on x86_64.
+ */
+ if (cctable[i].clos_refcnt)
+ cbm_cpu_update((void *)(unsigned long)i);
+ }
+}
+
+static inline void intel_rdt_cpu_start(int cpu)
+{
+ struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+
+ state->closid = 0;
+ mutex_lock(&rdt_group_mutex);
+ if (rdt_cpumask_update(cpu))
+ smp_call_function_single(cpu, cbm_update_msrs, NULL, 1);
+ mutex_unlock(&rdt_group_mutex);
+}
+
+static void intel_rdt_cpu_exit(unsigned int cpu)
+{
+ int i;
+
+ mutex_lock(&rdt_group_mutex);
+ if (!cpumask_test_and_clear_cpu(cpu, &rdt_cpumask)) {
+ mutex_unlock(&rdt_group_mutex);
+ return;
+ }
+
+ cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
+ cpumask_clear_cpu(cpu, &tmp_cpumask);
+ i = cpumask_any(&tmp_cpumask);
+
+ if (i < nr_cpu_ids)
+ cpumask_set_cpu(i, &rdt_cpumask);
+ mutex_unlock(&rdt_group_mutex);
+}
+
+static int intel_rdt_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ intel_rdt_cpu_start(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ intel_rdt_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
static int __init intel_rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -247,9 +313,15 @@ static int __init intel_rdt_late_init(void)
goto out_err;
}
+ cpu_notifier_register_begin();
+
for_each_online_cpu(i)
rdt_cpumask_update(i);
+ __hotcpu_notifier(intel_rdt_cpu_notifier, 0);
+
+ cpu_notifier_register_done();
+
static_key_slow_inc(&rdt_enable_key);
pr_info("Intel cache allocation enabled\n");
out_err:
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 09/11] x86/intel_rdt: Intel haswell Cache Allocation enumeration
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (7 preceding siblings ...)
2015-09-09 19:24 ` [PATCH 08/11] x86/intel_rdt: Hot cpu support for Cache Allocation Vikas Shivappa
@ 2015-09-09 19:25 ` Vikas Shivappa
2015-09-09 19:25 ` [PATCH 10/11] x86,cgroup/intel_rdt : Add intel_rdt cgroup documentation Vikas Shivappa
2015-09-09 19:25 ` [PATCH 11/11] x86,cgroup/intel_rdt : Add a cgroup interface to manage Intel cache allocation Vikas Shivappa
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:25 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
This patch is specific to Intel haswell (hsw) server SKUs. Cache
Allocation on hsw server needs to be enumerated separately as HSW does
not have support for CPUID enumeration for Cache Allocation. This patch
does a probe by writing a CLOSid (Class of service id) into the high 32 bits
of IA32_PQR_MSR and checking whether the bits stick. The probe is only done
after confirming that the CPU is a HSW server. Other hardcoded values are:
- L3 cache bit mask must be at least two bits.
- Maximum CLOSids supported is always 4.
- Maximum bits support in cache bit mask is always 20.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/kernel/cpu/intel_rdt.c | 58 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 56 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 38fa6ac..5608f49 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -38,6 +38,10 @@ static struct clos_cbm_table *cctable;
*/
unsigned long *closmap;
/*
+ * Minimum bits required in Cache bitmask.
+ */
+static unsigned int min_bitmask_len = 1;
+/*
* Mask of CPUs for writing CBM values. We only need one CPU per-socket.
*/
static cpumask_t rdt_cpumask;
@@ -49,6 +53,56 @@ static cpumask_t tmp_cpumask;
static DEFINE_MUTEX(rdt_group_mutex);
struct static_key __read_mostly rdt_enable_key = STATIC_KEY_INIT_FALSE;
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as it does not have CPUID enumeration support for Cache allocation.
+ *
+ * Probes by writing to the high 32 bits(CLOSid) of the IA32_PQR_MSR and
+ * testing if the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline bool cache_alloc_hsw_probe(void)
+{
+ u32 l, h_old, h_new, h_tmp;
+
+ if (rdmsr_safe(MSR_IA32_PQR_ASSOC, &l, &h_old))
+ return false;
+
+ /*
+ * Default value is always 0 if feature is present.
+ */
+ h_tmp = h_old ^ 0x1U;
+ if (wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_tmp) ||
+ rdmsr_safe(MSR_IA32_PQR_ASSOC, &l, &h_new))
+ return false;
+
+ if (h_tmp != h_new)
+ return false;
+
+ wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_old);
+
+ boot_cpu_data.x86_cache_max_closid = 4;
+ boot_cpu_data.x86_cache_max_cbm_len = 20;
+ min_bitmask_len = 2;
+
+ return true;
+}
+
+static inline bool cache_alloc_supported(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_CAT_L3))
+ return true;
+
+ /*
+ * Probe for Haswell server CPUs.
+ */
+ if (c->x86 == 0x6 && c->x86_model == 0x3f)
+ return cache_alloc_hsw_probe();
+
+ return false;
+}
+
void __intel_rdt_sched_in(void)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
@@ -113,7 +167,7 @@ static bool cbm_validate(unsigned long var)
unsigned long first_bit, zero_bit;
u64 max_cbm;
- if (bitmap_weight(&var, max_cbm_len) < 1)
+ if (bitmap_weight(&var, max_cbm_len) < min_bitmask_len)
return false;
max_cbm = (1ULL << max_cbm_len) - 1;
@@ -289,7 +343,7 @@ static int __init intel_rdt_late_init(void)
u32 maxid, max_cbm_len;
int err = 0, size, i;
- if (!cpu_has(c, X86_FEATURE_CAT_L3))
+ if (!cache_alloc_supported(c))
return -ENODEV;
maxid = c->x86_cache_max_closid;
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 10/11] x86,cgroup/intel_rdt : Add intel_rdt cgroup documentation
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (8 preceding siblings ...)
2015-09-09 19:25 ` [PATCH 09/11] x86/intel_rdt: Intel haswell Cache Allocation enumeration Vikas Shivappa
@ 2015-09-09 19:25 ` Vikas Shivappa
2015-09-09 19:25 ` [PATCH 11/11] x86,cgroup/intel_rdt : Add a cgroup interface to manage Intel cache allocation Vikas Shivappa
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:25 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
Add documentation on using the cache allocation cgroup interface with
examples.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
Documentation/cgroups/rdt.txt | 133 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 133 insertions(+)
create mode 100644 Documentation/cgroups/rdt.txt
diff --git a/Documentation/cgroups/rdt.txt b/Documentation/cgroups/rdt.txt
new file mode 100644
index 0000000..bc6b20a
--- /dev/null
+++ b/Documentation/cgroups/rdt.txt
@@ -0,0 +1,133 @@
+ RDT
+ ---
+
+Copyright (C) 2014 Intel Corporation
+Written by vikas.shivappa@linux.intel.com
+
+CONTENTS:
+=========
+
+1. Cache Allocation Technology
+ 1.1 Why is Cache allocation needed?
+2. Usage Examples and Syntax
+
+1. Cache Allocation Technology
+===================================
+
+1.1 Why is Cache allocation needed
+----------------------------------
+
+In today's new processors the number of cores is continuously increasing,
+especially in large scale usage models where VMs are used, like
+webservers and datacenters. A higher core count increases the number of
+threads or workloads that can simultaneously be run. When
+multi-threaded applications, VMs and workloads run concurrently they
+compete for shared resources including the L3 cache.
+
+The architecture also allows dynamically changing these subsets during
+runtime to further optimize the performance of the higher priority
+application with minimal degradation to the low priority app.
+Additionally, resources can be rebalanced for system throughput benefit.
+This technique may be useful in managing large computer systems with
+a large L3 cache.
+
+Cloud/Container use case:
+The key use case scenarios are in large server clusters in a typical
+cloud or container context. A central 'managing agent' would control
+resource allocations to a set of VMs or containers. In today's resource
+management, cgroups are widely used already and a significant amount of
+plumbing in user space is already done to perform tasks like
+allocating/configuring resources dynamically and statically. An
+important example is Docker using systemd, and systemd in turn using
+cgroups in its core to manage resources. This makes the cgroup interface
+an easily adaptable interface for cache allocation.
+
+Noisy neighbour use case:
+A more specific use case may be when a streaming app is constantly
+copying data and accessing a linear space larger than the L3 cache,
+and hence evicting a large amount of cache which could have
+otherwise been used by a high priority computing application. Using the
+cache allocation feature, the 'noisy neighbours' like the streaming
+application can be confined to use a smaller cache and the high priority
+application be awarded a larger amount of cache space. A managing agent
+can monitor the cache allocation using cache monitoring through libperf
+and be able to make resource adjustments either statically or
+dynamically.
+This interface hence helps in maintaining a resource policy to
+provide the quality of service requirements like number of requests
+handled, response time.
+
+More information can be found in the Intel SDM June 2015, Volume 3,
+section 17.16. More information on kernel implementation details can be
+found in Documentation/x86/intel_rdt.txt.
+
+2. Usage examples and syntax
+============================
+
+Following is an example on how a system administrator/root user can
+configure L3 cache allocation to threads.
+
+To enable the cache allocation during compile time set the
+CONFIG_INTEL_RDT=y.
+
+To check if Cache allocation was enabled on your system
+ $ dmesg | grep -i intel_rdt
+ intel_rdt: Intel cache allocation enabled
+
+ $ cat /proc/cpuinfo
+output would have 'rdt' (if rdt is enabled) and 'cat_l3' (if L3
+cache allocation is enabled).
+
+example1: Following would mount the cache allocation cgroup subsystem
+and create 2 directories.
+
+ $ cd /sys/fs/cgroup
+ $ mkdir rdt
+ $ mount -t cgroup -ointel_rdt intel_rdt /sys/fs/cgroup/rdt
+ $ cd rdt
+ $ mkdir group1
+ $ mkdir group2
+
+Following are some of the Files in the directory
+
+ $ ls
+ intel_rdt.l3_cbm
+ tasks
+
+Assume the cache is 4MB (looked up from /proc/cpuinfo) and the max cbm is 16
+bits (indicated by the root node's cbm). The following assigns 1MB of cache
+each to group1 and group2, exclusive between them.
+
+ $ cd group1
+ $ /bin/echo 0xf > intel_rdt.l3_cbm
+
+ $ cd group2
+ $ /bin/echo 0xf0 > intel_rdt.l3_cbm
+
+Assign tasks to the group2
+
+ $ /bin/echo PID1 > tasks
+ $ /bin/echo PID2 > tasks
+
+Now threads PID1 and PID2 get to fill the 1MB of cache that was
+allocated to group2. Similarly assign tasks to group1.
+
+example2: Below commands allocate '1MB L3 cache on socket1 to group1'
+and '2MB of L3 cache on socket2 to group2'.
+This mounts both cpuset and intel_rdt and hence the ls would list the
+files in both the subsystems.
+ $ mount -t cgroup -ocpuset,intel_rdt cpuset,intel_rdt rdt/
+
+Assign the cache
+ $ /bin/echo 0xf > /sys/fs/cgroup/rdt/group1/intel_rdt.l3_cbm
+ $ /bin/echo 0xff > /sys/fs/cgroup/rdt/group2/intel_rdt.l3_cbm
+
+Assign tasks for group1 and group2
+ $ /bin/echo PID1 > /sys/fs/cgroup/rdt/group1/tasks
+ $ /bin/echo PID2 > /sys/fs/cgroup/rdt/group1/tasks
+ $ /bin/echo PID3 > /sys/fs/cgroup/rdt/group2/tasks
+ $ /bin/echo PID4 > /sys/fs/cgroup/rdt/group2/tasks
+
+Tie the group1 to socket1 and group2 to socket2
+ $ /bin/echo <cpumask for socket1> > /sys/fs/cgroup/rdt/group1/cpuset.cpus
+ $ /bin/echo <cpumask for socket2> > /sys/fs/cgroup/rdt/group2/cpuset.cpus
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 11/11] x86,cgroup/intel_rdt : Add a cgroup interface to manage Intel cache allocation
2015-09-09 19:24 [PATCH V14 0/9] Intel cache allocation and Hot cpu handling changes to cqm, rapl Vikas Shivappa
` (9 preceding siblings ...)
2015-09-09 19:25 ` [PATCH 10/11] x86,cgroup/intel_rdt : Add intel_rdt cgroup documentation Vikas Shivappa
@ 2015-09-09 19:25 ` Vikas Shivappa
10 siblings, 0 replies; 12+ messages in thread
From: Vikas Shivappa @ 2015-09-09 19:25 UTC (permalink / raw)
To: vikas.shivappa
Cc: vikas.shivappa, x86, linux-kernel, hpa, tglx, mingo, tj, peterz,
matt.fleming, will.auld, h.peter.anvin, glenn.p.williamson,
kanaka.d.juvva, bruce.schlobohm
Adds a new cgroup 'intel_rdt' to manage cache allocation. Each cgroup
directory is associated with a class of service id(closid). To map a
task with closid during scheduling, this patch removes the closid field
from task_struct and uses the already existing 'cgroups' field in
task_struct.
The cgroup has a file 'l3_cbm' which represents the L3 cache capacity
bitmask(CBM). The CBM is global for the whole system currently. The
capacity bitmask needs to have only contiguous bits set, and the number of
bits set must not exceed the maximum supported CBM length. The
tasks belonging to a cgroup get to fill in the L3 cache represented by
the capacity bitmask of the cgroup. For ex: if the max bits in the CBM
is 10 and the cache size is 10MB, each bit represents 1MB of cache
capacity.
Root cgroup always has all the bits set in the l3_cbm. User can create
more cgroups with mkdir syscall. By default the child cgroups inherit
the capacity bitmask(CBM) from parent. User can change the CBM specified
in hex for each cgroup. Each unique bitmask is associated with a class
of service ID and an -ENOSPC is returned once we run out of
closids.
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
arch/x86/include/asm/intel_rdt.h | 38 +++++++-
arch/x86/kernel/cpu/intel_rdt.c | 192 +++++++++++++++++++++++++++++++++++++--
include/linux/cgroup_subsys.h | 3 +
include/linux/sched.h | 3 -
init/Kconfig | 4 +-
5 files changed, 228 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 384add9..8244bc1 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -3,6 +3,8 @@
#ifdef CONFIG_INTEL_RDT
+#include <linux/cgroup.h>
+
#define MAX_CBM_LENGTH 32
#define IA32_L3_CBM_BASE 0xc90
#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
@@ -10,20 +12,54 @@
extern struct static_key rdt_enable_key;
void __intel_rdt_sched_in(void);
+struct intel_rdt {
+ struct cgroup_subsys_state css;
+ u32 closid;
+};
+
struct clos_cbm_table {
unsigned long l3_cbm;
unsigned int clos_refcnt;
};
/*
+ * Return rdt group corresponding to this container.
+ */
+static inline struct intel_rdt *css_rdt(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct intel_rdt, css) : NULL;
+}
+
+static inline struct intel_rdt *parent_rdt(struct intel_rdt *ir)
+{
+ return css_rdt(ir->css.parent);
+}
+
+/*
+ * Return rdt group to which this task belongs.
+ */
+static inline struct intel_rdt *task_rdt(struct task_struct *task)
+{
+ return css_rdt(task_css(task, intel_rdt_cgrp_id));
+}
+
+/*
* intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
*
* Following considerations are made so that this has minimal impact
* on scheduler hot path:
* - This will stay as no-op unless we are running on an Intel SKU
* which supports L3 cache allocation.
+ * - When support is present and enabled, does not do any
+ * IA32_PQR_MSR writes until the user really starts using the feature,
+ * i.e. creates a rdt cgroup directory and assigns a cache_mask that's
+ * different from the root cgroup's cache_mask.
* - Caches the per cpu CLOSid values and does the MSR write only
- * when a task with a different CLOSid is scheduled in.
+ * when a task with a different CLOSid is scheduled in. That
+ * means the task belongs to a different cgroup.
+ * - CLOSids are allocated so that different cgroup directories
+ * with the same cache_mask get the same CLOSid. This minimizes CLOSids
+ * used and reduces MSR write frequency.
*/
static inline void intel_rdt_sched_in(void)
{
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5608f49..3245408 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -53,6 +53,10 @@ static cpumask_t tmp_cpumask;
static DEFINE_MUTEX(rdt_group_mutex);
struct static_key __read_mostly rdt_enable_key = STATIC_KEY_INIT_FALSE;
+struct intel_rdt rdt_root_group;
+#define rdt_for_each_child(pos_css, parent_ir) \
+ css_for_each_child((pos_css), &(parent_ir)->css)
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as it does not have CPUID enumeration support for Cache allocation.
@@ -106,13 +110,13 @@ static inline bool cache_alloc_supported(struct cpuinfo_x86 *c)
void __intel_rdt_sched_in(void)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
- u32 closid = current->closid;
+ struct intel_rdt *ir = task_rdt(current);
- if (closid == state->closid)
+ if (ir->closid == state->closid)
return;
- wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
- state->closid = closid;
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, ir->closid);
+ state->closid = ir->closid;
}
static inline void closid_get(u32 closid)
@@ -337,15 +341,174 @@ static int intel_rdt_cpu_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}
+static struct cgroup_subsys_state *
+intel_rdt_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct intel_rdt *parent = css_rdt(parent_css);
+ struct intel_rdt *ir;
+
+ /*
+ * cgroup_init cannot handle failures gracefully.
+ * Always return rdt_root_group.css instead of failing,
+ * even when Cache allocation is not supported.
+ */
+ if (!parent)
+ return &rdt_root_group.css;
+
+ ir = kzalloc(sizeof(struct intel_rdt), GFP_KERNEL);
+ if (!ir)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&rdt_group_mutex);
+ ir->closid = parent->closid;
+ closid_get(ir->closid);
+ mutex_unlock(&rdt_group_mutex);
+
+ return &ir->css;
+}
+
+static void intel_rdt_css_free(struct cgroup_subsys_state *css)
+{
+ struct intel_rdt *ir = css_rdt(css);
+
+ mutex_lock(&rdt_group_mutex);
+ closid_put(ir->closid);
+ kfree(ir);
+ mutex_unlock(&rdt_group_mutex);
+}
+
+static int intel_cache_alloc_cbm_read(struct seq_file *m, void *v)
+{
+ struct intel_rdt *ir = css_rdt(seq_css(m));
+ unsigned long l3_cbm = 0;
+
+ clos_cbm_table_read(ir->closid, &l3_cbm);
+ seq_printf(m, "%08lx\n", l3_cbm);
+
+ return 0;
+}
+
+/*
+ * cbm_validate_rdt_cgroup() - Checks whether @cbmvalue may be assigned
+ * to the cgroup @ir.
+ *
+ * The new mask must pass cbm_validate(), must be a subset of the mask of
+ * the parent of @ir, and the mask of every child of @ir must in turn be a
+ * subset of the new mask.
+ *
+ * Returns 0 when all checks pass and -EINVAL otherwise.
+ */
+static int cbm_validate_rdt_cgroup(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+ struct cgroup_subsys_state *css;
+ struct intel_rdt *par, *c;
+ unsigned long cbm_tmp = 0;
+ int err = 0;
+
+ if (!cbm_validate(cbmvalue)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ par = parent_rdt(ir);
+ clos_cbm_table_read(par->closid, &cbm_tmp);
+ if (!bitmap_subset(&cbmvalue, &cbm_tmp, MAX_CBM_LENGTH)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rcu_read_lock();
+ rdt_for_each_child(css, ir) {
+ c = css_rdt(css);
+ /* Compare against the child's own mask, not the parent's. */
+ clos_cbm_table_read(c->closid, &cbm_tmp);
+ if (!bitmap_subset(&cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
+ rcu_read_unlock();
+ err = -EINVAL;
+ goto out_err;
+ }
+ }
+ rcu_read_unlock();
+out_err:
+
+ return err;
+}
+
+/*
+ * intel_cache_alloc_cbm_write() - Validates and writes the
+ * cache bit mask(cbm) to the IA32_L3_MASK_n
+ * and also store the same in the cctable.
+ *
+ * CLOSids are reused for cgroups which have same bitmask.
+ * This helps to use the scant CLOSids optimally. This also
+ * implies that at context switch write to PQR-MSR is done
+ * only when a task with a different bitmask is scheduled in.
+ */
+static int intel_cache_alloc_cbm_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 cbmvalue)
+{
+ struct intel_rdt *ir = css_rdt(css);
+ unsigned long ccbm = 0;
+ int err = 0;
+ u32 closid;
+
+ if (ir == &rdt_root_group)
+ return -EPERM;
+
+ /*
+ * Need global mutex as cbm write may allocate a closid.
+ */
+ mutex_lock(&rdt_group_mutex);
+
+ clos_cbm_table_read(ir->closid, &ccbm);
+ if (cbmvalue == ccbm)
+ goto out;
+
+ err = cbm_validate_rdt_cgroup(ir, cbmvalue);
+ if (err)
+ goto out;
+
+ /*
+ * Try to get a reference for a different CLOSid and release the
+ * reference to the current CLOSid.
+ * Need to put down the reference here and get it back in case we
+ * run out of closids. Otherwise we run into a problem when
+ * we could be using the last closid that could have been available.
+ */
+ closid_put(ir->closid);
+ if (cbm_search(cbmvalue, &closid)) {
+ ir->closid = closid;
+ closid_get(closid);
+ } else {
+ closid = ir->closid;
+ err = closid_alloc(&ir->closid);
+ if (err) {
+ closid_get(ir->closid);
+ goto out;
+ }
+
+ clos_cbm_table_update(ir->closid, cbmvalue);
+ cbm_update_all(ir->closid);
+ }
+ closcbm_map_dump();
+out:
+ mutex_unlock(&rdt_group_mutex);
+
+ return err;
+}
+
+/*
+ * rdt_cgroup_init() - Sets up the root cgroup's CLOSid.
+ *
+ * Allocates a CLOSid for rdt_root_group (expected to be 0) and records a
+ * cache bitmask with all x86_cache_max_cbm_len bits set for it.
+ */
+static void rdt_cgroup_init(void)
+{
+ int max_cbm_len = boot_cpu_data.x86_cache_max_cbm_len;
+ u32 closid = 0;
+ int err;
+
+ /*
+ * closid_alloc() returns an error code and can fail; keep closid
+ * initialized so a failure never leads to reading an unset value.
+ */
+ err = closid_alloc(&closid);
+
+ WARN_ON(err || closid != 0);
+
+ rdt_root_group.closid = closid;
+ clos_cbm_table_update(closid, (1ULL << max_cbm_len) - 1);
+}
+
static int __init intel_rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
u32 maxid, max_cbm_len;
int err = 0, size, i;
- if (!cache_alloc_supported(c))
+ if (!cache_alloc_supported(c)) {
+ rdt_root_group.css.ss->disabled = 1;
return -ENODEV;
-
+ }
maxid = c->x86_cache_max_closid;
max_cbm_len = c->x86_cache_max_cbm_len;
@@ -375,6 +538,7 @@ static int __init intel_rdt_late_init(void)
__hotcpu_notifier(intel_rdt_cpu_notifier, 0);
cpu_notifier_register_done();
+ rdt_cgroup_init();
static_key_slow_inc(&rdt_enable_key);
pr_info("Intel cache allocation enabled\n");
@@ -384,3 +548,19 @@ out_err:
}
late_initcall(intel_rdt_late_init);
+
+static struct cftype rdt_files[] = {
+ {
+ .name = "l3_cbm",
+ .seq_show = intel_cache_alloc_cbm_read,
+ .write_u64 = intel_cache_alloc_cbm_write,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys intel_rdt_cgrp_subsys = {
+ .css_alloc = intel_rdt_css_alloc,
+ .css_free = intel_rdt_css_free,
+ .legacy_cftypes = rdt_files,
+ .early_init = 0,
+};
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb..5702f55 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -47,6 +47,9 @@ SUBSYS(net_prio)
SUBSYS(hugetlb)
#endif
+#if IS_ENABLED(CONFIG_INTEL_RDT)
+SUBSYS(intel_rdt)
+#endif
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b1b5bf7..04b5ada 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1636,9 +1636,6 @@ struct task_struct {
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
-#ifdef CONFIG_INTEL_RDT
- u32 closid;
-#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
diff --git a/init/Kconfig b/init/Kconfig
index 6a067ca..d44ac06 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -937,6 +937,8 @@ menuconfig CGROUPS
Say N if unsure.
+if CGROUPS
+
config INTEL_RDT
bool "Intel Resource Director Technology support"
depends on X86_64 && CPU_SUP_INTEL
@@ -949,8 +951,6 @@ config INTEL_RDT
Say N if unsure.
-if CGROUPS
-
config CGROUP_DEBUG
bool "Example debug cgroup subsystem"
default n
--
1.9.1
^ permalink raw reply related [flat|nested] 12+ messages in thread