* [PATCH v4 1/3] cpufreq: intel_pstate: Enforce _PPC limits
2016-04-27 22:48 [PATCH v4 0/3] Enforce _PPC limits Srinivas Pandruvada
@ 2016-04-27 22:48 ` Srinivas Pandruvada
2016-04-27 22:48 ` [PATCH v4 2/3] cpufreq: intel_pstate: Adjust policy->max Srinivas Pandruvada
2016-04-27 22:48 ` [PATCH v4 3/3] cpufreq: intel_pstate: Enable PPC enforcement for servers Srinivas Pandruvada
2 siblings, 0 replies; 4+ messages in thread
From: Srinivas Pandruvada @ 2016-04-27 22:48 UTC (permalink / raw)
To: rjw; +Cc: khlebnikov, linux-pm, Srinivas Pandruvada
Use ACPI _PPC notification to limit max P state driver will request.
ACPI _PPC change notification is sent by BIOS to limit max P state
in several cases:
- Reduce impact of platform thermal condition
- When Config TDP feature is used, a changed _PPC is sent to
follow TDP change
- Remote node managers in server want to control platform power
via baseboard management controller (BMC)
This change registers with ACPI processor performance lib so that
_PPC changes are notified to cpufreq core, which in turns will
result in call to .setpolicy() callback. Also the way _PSS
table identifies a turbo frequency is not compatible to max turbo
frequency in intel_pstate, so the very first entry in _PSS needs
to be adjusted.
This feature can be turned on by using kernel parameters:
intel_pstate=support_acpi_ppc
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
Documentation/kernel-parameters.txt | 2 +
drivers/cpufreq/Kconfig.x86 | 1 +
drivers/cpufreq/intel_pstate.c | 135 +++++++++++++++++++++++++++++++++++-
3 files changed, 136 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0b3de80..4199a1b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1661,6 +1661,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
hwp_only
Only load intel_pstate on systems which support
hardware P state control (HWP) if available.
+ support_acpi_ppc
+ Enforce ACPI _PPC performance limits.
intremap= [X86-64, Intel-IOMMU]
on enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index c59bdcb..adbd1de 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -5,6 +5,7 @@
config X86_INTEL_PSTATE
bool "Intel P state control"
depends on X86
+ select ACPI_PROCESSOR if ACPI
help
This driver provides a P state for Intel core processors.
The driver implements an internal governor and will become
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 30fe323..fc37446 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -39,6 +39,10 @@
#define ATOM_TURBO_RATIOS 0x66c
#define ATOM_TURBO_VIDS 0x66d
+#ifdef CONFIG_ACPI
+#include <acpi/processor.h>
+#endif
+
#define FRAC_BITS 8
#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
#define fp_toint(X) ((X) >> FRAC_BITS)
@@ -172,6 +176,8 @@ struct _pid {
* @prev_cummulative_iowait: IO Wait time difference from last and
* current sample
* @sample: Storage for storing last Sample data
+ * @acpi_perf_data: Stores ACPI perf information read from _PSS
+ * @valid_pss_table: Set to true for valid ACPI _PSS entries found
*
* This structure stores per CPU instance data for all CPUs.
*/
@@ -190,6 +196,10 @@ struct cpudata {
u64 prev_tsc;
u64 prev_cummulative_iowait;
struct sample sample;
+#ifdef CONFIG_ACPI
+ struct acpi_processor_performance acpi_perf_data;
+ bool valid_pss_table;
+#endif
};
static struct cpudata **all_cpu_data;
@@ -258,6 +268,9 @@ static struct pstate_adjust_policy pid_params;
static struct pstate_funcs pstate_funcs;
static int hwp_active;
+#ifdef CONFIG_ACPI
+static bool acpi_ppc;
+#endif
/**
* struct perf_limits - Store user and policy limits
@@ -331,6 +344,110 @@ static struct perf_limits *limits = &performance_limits;
static struct perf_limits *limits = &powersave_limits;
#endif
+#ifdef CONFIG_ACPI
+/*
+ * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
+ * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
+ * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state
+ * ratio, out of it only high 8 bits are used. For example 0x1700 is setting
+ * target ratio 0x17. The _PSS control value stores in a format which can be
+ * directly written to PERF_CTL MSR. But in intel_pstate driver this shift
+ * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()).
+ * This function converts the _PSS control value to intel pstate driver format
+ * for comparison and assignment.
+ */
+static int convert_to_native_pstate_format(struct cpudata *cpu, int index)
+{
+ return cpu->acpi_perf_data.states[index].control >> 8;
+}
+
+static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
+{
+ struct cpudata *cpu;
+ int turbo_pss_ctl;
+ int ret;
+ int i;
+
+ if (!acpi_ppc)
+ return;
+
+ cpu = all_cpu_data[policy->cpu];
+
+ ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
+ policy->cpu);
+ if (ret)
+ return;
+
+ /*
+ * Check if the control value in _PSS is for PERF_CTL MSR, which should
+ * guarantee that the states returned by it map to the states in our
+ * list directly.
+ */
+ if (cpu->acpi_perf_data.control_register.space_id !=
+ ACPI_ADR_SPACE_FIXED_HARDWARE)
+ goto err_unreg_perf;
+
+ /*
+ * If there is only one entry _PSS, simply ignore _PSS and continue as
+ * usual without taking _PSS into account
+ */
+ if (cpu->acpi_perf_data.state_count < 2)
+ goto err_unreg_perf;
+
+ pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
+ for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
+ pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n",
+ (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
+ (u32) cpu->acpi_perf_data.states[i].core_frequency,
+ (u32) cpu->acpi_perf_data.states[i].power,
+ (u32) cpu->acpi_perf_data.states[i].control);
+ }
+
+ /*
+ * The _PSS table doesn't contain whole turbo frequency range.
+ * This just contains +1 MHZ above the max non turbo frequency,
+ * with control value corresponding to max turbo ratio. But
+ * when cpufreq set policy is called, it will call with this
+ * max frequency, which will cause a reduced performance as
+ * this driver uses real max turbo frequency as the max
+ * frequency. So correct this frequency in _PSS table to
+ * correct max turbo frequency based on the turbo ratio.
+ * Also need to convert to MHz as _PSS freq is in MHz.
+ */
+ turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0);
+ if (turbo_pss_ctl > cpu->pstate.max_pstate)
+ cpu->acpi_perf_data.states[0].core_frequency =
+ policy->cpuinfo.max_freq / 1000;
+ cpu->valid_pss_table = true;
+ pr_info("_PPC limits will be enforced\n");
+
+ return;
+err_unreg_perf:
+ cpu->valid_pss_table = false;
+ acpi_processor_unregister_performance(policy->cpu);
+}
+
+static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+{
+ struct cpudata *cpu;
+
+ cpu = all_cpu_data[policy->cpu];
+ if (!cpu->valid_pss_table)
+ return;
+
+ acpi_processor_unregister_performance(policy->cpu);
+}
+
+#else
+static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
+{
+}
+
+static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+{
+}
+#endif
+
static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
int deadband, int integral) {
pid->setpoint = int_tofp(setpoint);
@@ -1396,18 +1513,27 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
policy->cpuinfo.max_freq =
cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+ intel_pstate_init_acpi_perf_limits(policy);
policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
cpumask_set_cpu(policy->cpu, policy->cpus);
return 0;
}
+static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+{
+ intel_pstate_exit_perf_limits(policy);
+
+ return 0;
+}
+
static struct cpufreq_driver intel_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS,
.verify = intel_pstate_verify_policy,
.setpolicy = intel_pstate_set_policy,
.get = intel_pstate_get,
.init = intel_pstate_cpu_init,
+ .exit = intel_pstate_cpu_exit,
.stop_cpu = intel_pstate_stop_cpu,
.name = "intel_pstate",
};
@@ -1451,8 +1577,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs)
}
-#if IS_ENABLED(CONFIG_ACPI)
-#include <acpi/processor.h>
+#ifdef CONFIG_ACPI
static bool intel_pstate_no_acpi_pss(void)
{
@@ -1658,6 +1783,12 @@ static int __init intel_pstate_setup(char *str)
force_load = 1;
if (!strcmp(str, "hwp_only"))
hwp_only = 1;
+
+#ifdef CONFIG_ACPI
+ if (!strcmp(str, "support_acpi_ppc"))
+ acpi_ppc = true;
+#endif
+
return 0;
}
early_param("intel_pstate", intel_pstate_setup);
--
2.5.5
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH v4 2/3] cpufreq: intel_pstate: Adjust policy->max
2016-04-27 22:48 [PATCH v4 0/3] Enforce _PPC limits Srinivas Pandruvada
2016-04-27 22:48 ` [PATCH v4 1/3] cpufreq: intel_pstate: " Srinivas Pandruvada
@ 2016-04-27 22:48 ` Srinivas Pandruvada
2016-04-27 22:48 ` [PATCH v4 3/3] cpufreq: intel_pstate: Enable PPC enforcement for servers Srinivas Pandruvada
2 siblings, 0 replies; 4+ messages in thread
From: Srinivas Pandruvada @ 2016-04-27 22:48 UTC (permalink / raw)
To: rjw; +Cc: khlebnikov, linux-pm, Srinivas Pandruvada
When policy->max is changed via _PPC or sysfs and is more than the max non
turbo frequency, it does not really change resulting performance in some
processors. When policy->max results in a P-State ratio more than the
turbo activation ratio, then processor can choose any P-State up to max
turbo. So the user or _PPC setting has no value, but this can cause
undesirable side effects like:
- Showing reduced max percentage in Intel P-State sysfs
- It can cause reduced max performance under certain boundary conditions:
The requested max scaling frequency either via _PPC or via cpufreq-sysfs,
will be converted into a fixed floating point max percent scale. In
majority of the cases this will result in correct max. But not 100% of the
time. If the _PPC is requested at a point where the calculation lead to a
lower max, this can result in a lower P-State then expected and it will
impact performance.
Example of this condition using a Broadwell laptop with config TDP.
ACPI _PSS table from a Broadwell laptop
2301000 2300000 2200000 2000000 1900000 1800000 1700000 1500000 1400000
1300000 1100000 1000000 900000 800000 600000 500000
The actual results by disabling config TDP so that we can get what is
requested on or below 2300000Khz.
scaling_max_freq Max Requested P-State Resultant scaling
max
---------------------------------------- ----------------------
2400000 18 2900000 (max
turbo)
2300000 17 2300000 (max
physical non turbo)
2200000 15 2100000
2100000 15 2100000
2000000 13 1900000
1900000 13 1900000
1800000 12 1800000
1700000 11 1700000
1600000 10 1600000
1500000 f 1500000
1400000 e 1400000
1300000 d 1300000
1200000 c 1200000
1100000 a 1000000
1000000 a 1000000
900000 9 900000
800000 8 800000
700000 7 700000
600000 6 600000
500000 5 500000
------------------------------------------------------------------
Now set the config TDP level 1 ratio as 0x0b (equivalent to 1100000KHz)
in BIOS (not every system will let you adjust this).
The turbo activation ratio will be set to one less than that, which will
be 0x0a (So any request above 1000000KHz should result in turbo region
assuming no thermal limits).
Here _PPC will request max to 1100000KHz (which basically should still
result in turbo as this is more than the turbo activation ratio up to
max allowable turbo frequency), but actual calculation resulted in a max
ceiling P-State which is 0x0a. So under any load condition, this driver
will not request turbo P-States. This will be a huge performance hit.
When config TDP feature is ON, if the _PPC points to a frequency above
turbo activation ratio, the performance can still reach max turbo. In this
case we don't need to treat this as the reduced frequency in set_policy
callback.
In this change when config TDP is active (by checking if the physical max
non turbo ratio is more than the current max non turbo ratio), any request
above current max non turbo is treated as full performance.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
drivers/cpufreq/intel_pstate.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fc37446..c4894ff 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1413,11 +1413,23 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
+ struct cpudata *cpu;
+
if (!policy->cpuinfo.max_freq)
return -ENODEV;
intel_pstate_clear_update_util_hook(policy->cpu);
+ cpu = all_cpu_data[0];
+ if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
+ if (policy->max < policy->cpuinfo.max_freq &&
+ policy->max > (cpu->pstate.max_pstate *
+ cpu->pstate.scaling)) {
+ pr_info("policy->max > max non turbo frequency\n");
+ policy->max = policy->cpuinfo.max_freq;
+ }
+ }
+
if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
limits = &performance_limits;
if (policy->max >= policy->cpuinfo.max_freq) {
--
2.5.5
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH v4 3/3] cpufreq: intel_pstate: Enable PPC enforcement for servers
2016-04-27 22:48 [PATCH v4 0/3] Enforce _PPC limits Srinivas Pandruvada
2016-04-27 22:48 ` [PATCH v4 1/3] cpufreq: intel_pstate: " Srinivas Pandruvada
2016-04-27 22:48 ` [PATCH v4 2/3] cpufreq: intel_pstate: Adjust policy->max Srinivas Pandruvada
@ 2016-04-27 22:48 ` Srinivas Pandruvada
2 siblings, 0 replies; 4+ messages in thread
From: Srinivas Pandruvada @ 2016-04-27 22:48 UTC (permalink / raw)
To: rjw; +Cc: khlebnikov, linux-pm, Srinivas Pandruvada
For platforms which are controlled via remove node manager, enable _PPC by
default. These platforms are mostly categorized as enterprise server or
performance servers. These platforms needs to go through some
certifications tests, which tests control via _PPC.
The relative risk of enabling by default is low as this is is less likely
that these systems have broken _PSS table.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
Documentation/kernel-parameters.txt | 5 ++++-
drivers/cpufreq/intel_pstate.c | 12 +++++++++++-
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4199a1b..52292b2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1662,7 +1662,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Only load intel_pstate on systems which support
hardware P state control (HWP) if available.
support_acpi_ppc
- Enforce ACPI _PPC performance limits.
+ Enforce ACPI _PPC performance limits. If the Fixed ACPI
+ Description Table, specifies preferred power management
+ profile as "Enterprise Server" or "Performance Server",
+ then this feature is turned on by default.
intremap= [X86-64, Intel-IOMMU]
on enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c4894ff..7da7672 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -345,6 +345,16 @@ static struct perf_limits *limits = &powersave_limits;
#endif
#ifdef CONFIG_ACPI
+
+static bool intel_pstate_get_ppc_enable_status(void)
+{
+ if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
+ acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
+ return true;
+
+ return acpi_ppc;
+}
+
/*
* The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
* in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
@@ -368,7 +378,7 @@ static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
int ret;
int i;
- if (!acpi_ppc)
+ if (!intel_pstate_get_ppc_enable_status())
return;
cpu = all_cpu_data[policy->cpu];
--
2.5.5
^ permalink raw reply related [flat|nested] 4+ messages in thread