* [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver
@ 2015-04-23 14:00 Wei Wang
2015-04-25 18:05 ` Julien Grall
0 siblings, 1 reply; 6+ messages in thread
From: Wei Wang @ 2015-04-23 14:00 UTC (permalink / raw)
To: jbeulich, andrew.cooper3, ian.campbell, kevin.tian, eddie.dong,
xen-devel
Cc: yang.z.zhang, Wei Wang
The intel_pstate driver is ported from the kernel, following the logic of
the kernel code (commit: 93f0822d).
In the kernel, a user can adjust the limits via sysfs
(limits.min_sysfs_pct/max_sysfs_pct). In Xen,
policy->min_perf_pct/max_perf_pct serve as the intermediate storage for
these limits. A user adjusts them via xenpm.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
xen/arch/x86/acpi/cpufreq/Makefile | 1 +
xen/arch/x86/acpi/cpufreq/intel_pstate.c | 843 ++++++++++++++++++++++++++++++
xen/include/acpi/cpufreq/cpufreq.h | 6 +
xen/include/acpi/cpufreq/processor_perf.h | 1 +
xen/include/asm-x86/cpufeature.h | 1 +
xen/include/asm-x86/msr-index.h | 3 +
6 files changed, 855 insertions(+)
create mode 100644 xen/arch/x86/acpi/cpufreq/intel_pstate.c
diff --git a/xen/arch/x86/acpi/cpufreq/Makefile b/xen/arch/x86/acpi/cpufreq/Makefile
index f75da9b..99fa9f4 100644
--- a/xen/arch/x86/acpi/cpufreq/Makefile
+++ b/xen/arch/x86/acpi/cpufreq/Makefile
@@ -1,2 +1,3 @@
obj-y += cpufreq.o
+obj-y += intel_pstate.o
obj-y += powernow.o
diff --git a/xen/arch/x86/acpi/cpufreq/intel_pstate.c b/xen/arch/x86/acpi/cpufreq/intel_pstate.c
new file mode 100644
index 0000000..f95026f
--- /dev/null
+++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c
@@ -0,0 +1,843 @@
+#include <xen/kernel.h>
+#include <xen/types.h>
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/cpumask.h>
+#include <xen/timer.h>
+#include <asm/msr.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/div64.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+/* MSR indices read by the byt_* helpers in this file. */
+#define BYT_RATIOS 0x66a
+#define BYT_VIDS 0x66b
+#define BYT_TURBO_RATIOS 0x66c
+#define BYT_TURBO_VIDS 0x66d
+
+/*
+ * Fixed-point helpers: values carry FRAC_BITS fractional bits.
+ * int_tofp() widens its argument to int64_t before shifting left;
+ * fp_toint() drops the fractional bits with a right shift.
+ */
+#define FRAC_BITS 8
+#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
+#define fp_toint(X) ((X) >> FRAC_BITS)
+
+/*
+ * Multiply two fixed-point values. The product is formed in 64 bits and
+ * then shifted right by FRAC_BITS to return to fixed-point scale.
+ */
+static inline int32_t mul_fp(int32_t x, int32_t y)
+{
+    int64_t product = (int64_t)x * (int64_t)y;
+
+    return product >> FRAC_BITS;
+}
+
+/*
+ * Divide fixed-point x by fixed-point y. The dividend is pre-scaled by
+ * FRAC_BITS in 64 bits so the quotient stays in fixed-point scale.
+ * The divisor is not checked for zero here.
+ */
+static inline int32_t div_fp(int32_t x, int32_t y)
+{
+    int64_t dividend = (int64_t)x << FRAC_BITS;
+
+    return div_s64(dividend, y);
+}
+
+/*
+ * Convert a fixed-point value to an integer, adding one when any of the
+ * FRAC_BITS fractional bits are set.
+ */
+static inline int ceiling_fp(int32_t x)
+{
+    int whole = fp_toint(x);
+
+    if (x & ((1 << FRAC_BITS) - 1))
+        whole++;
+
+    return whole;
+}
+
+/* One utilisation sample, filled in by intel_pstate_sample(). */
+struct sample {
+ int32_t core_pct_busy; /* fixed-point aperf * 100 / mperf */
+ u64 aperf; /* APERF delta since the previous sample */
+ u64 mperf; /* MPERF delta since the previous sample */
+ int freq; /* derived from max_pstate, scaling and core_pct */
+ s_time_t time; /* get_s_time() value when the sample was taken */
+};
+
+/* P-state range of a CPU, read through the pstate_funcs callbacks. */
+struct pstate_data {
+ int current_pstate; /* last value applied by intel_pstate_set_pstate() */
+ int min_pstate; /* from pstate_funcs.get_min() */
+ int max_pstate; /* from pstate_funcs.get_max() */
+ int scaling; /* from pstate_funcs.get_scaling(); pstate * scaling gives policy min/max */
+ int turbo_pstate; /* from pstate_funcs.get_turbo() */
+};
+
+/* Voltage ID data; only filled in when pstate_funcs.get_vid is set. */
+struct vid_data {
+ int min; /* stored in fixed point (int_tofp) by byt_get_vid() */
+ int max; /* stored in fixed point (int_tofp) by byt_get_vid() */
+ int turbo; /* raw 7-bit MSR field, not fixed point */
+ int32_t ratio; /* fixed-point (max - min) / (max_pstate - min_pstate) */
+};
+
+/* State of the PID controller evaluated by pid_calc(). */
+struct _pid {
+ int setpoint; /* plain integer; converted with int_tofp() on use */
+ int32_t integral; /* fixed-point accumulated error, clamped in pid_calc() */
+ int32_t p_gain; /* fixed-point percent / 100 */
+ int32_t i_gain; /* fixed-point percent / 100 */
+ int32_t d_gain; /* fixed-point percent / 100 */
+ int deadband; /* plain integer; errors within it produce no output */
+ int32_t last_err; /* fixed-point error of the previous step */
+};
+
+/* Per-CPU driver state, allocated in intel_pstate_init_cpu(). */
+struct cpudata {
+ int cpu; /* CPU number this entry belongs to */
+
+ struct timer timer; /* periodic sampling timer (intel_pstate_timer_func) */
+
+ struct pstate_data pstate;
+ struct vid_data vid;
+ struct _pid pid;
+
+ s_time_t last_sample_time; /* sample.time of the previous sample */
+ u64 prev_aperf; /* raw APERF reading at the previous sample */
+ u64 prev_mperf; /* raw MPERF reading at the previous sample */
+ struct sample sample;
+};
+
+/* Array of per-CPU data pointers, indexed by CPU number. */
+static struct cpudata **all_cpu_data;
+
+/* Tunables for the sampling timer and the PID controller. */
+struct pstate_adjust_policy {
+ int sample_rate_ms; /* timer period in milliseconds */
+ int deadband;
+ int setpoint;
+ int p_gain_pct; /* gains are given in percent */
+ int d_gain_pct;
+ int i_gain_pct;
+};
+
+/*
+ * Model-specific callbacks. get_vid is optional: callers test it for
+ * NULL before use; the other callbacks are called unconditionally.
+ */
+struct pstate_funcs {
+ int (*get_max)(void);
+ int (*get_min)(void);
+ int (*get_turbo)(void);
+ int (*get_scaling)(void);
+ void (*set)(struct cpudata*, int pstate);
+ void (*get_vid)(struct cpudata *);
+};
+
+/* Per-model defaults referenced from the intel_pstate_cpu_ids table. */
+struct cpu_defaults {
+ struct pstate_adjust_policy pid_policy;
+ struct pstate_funcs funcs;
+};
+
+/*
+ * Active tunables and callbacks, copied from the matched cpu_defaults
+ * entry by copy_pid_params() and copy_cpu_funcs().
+ */
+static struct pstate_adjust_policy pid_params;
+static struct pstate_funcs pstate_funcs;
+
+/* Performance limits shared by all CPUs handled by this driver. */
+struct perf_limits {
+ int no_turbo; /* turbo turned off on request (see intel_pstate_turbo_update) */
+ int turbo_disabled; /* turbo unavailable (see update_turbo_state) */
+ int max_perf_pct;
+ int min_perf_pct;
+ int32_t max_perf; /* fixed-point max_perf_pct / 100 */
+ int32_t min_perf; /* fixed-point min_perf_pct / 100 */
+ int max_policy_pct; /* policy->max as a percentage of cpuinfo.max_freq */
+ int min_policy_pct; /* policy->min as a percentage of cpuinfo.max_freq */
+};
+
+/* Initial limits: full range (0% .. 100%) with turbo allowed. */
+static struct perf_limits limits = {
+ .no_turbo = 0,
+ .turbo_disabled = 0,
+ .max_perf_pct = 100,
+ .max_perf = int_tofp(1),
+ .min_perf_pct = 0,
+ .min_perf = 0,
+ .max_policy_pct = 100,
+ .min_policy_pct = 0,
+};
+
+/*
+ * Re-initialise the controller state. setpoint and deadband are stored as
+ * plain integers; integral and the initial error (setpoint - busy) are
+ * stored in fixed point.
+ */
+static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
+                             int deadband, int integral)
+{
+    pid->last_err = int_tofp(setpoint) - int_tofp(busy);
+    pid->integral = int_tofp(integral);
+    pid->deadband = deadband;
+    pid->setpoint = setpoint;
+}
+
+/* Store the proportional gain, given in percent, as fixed-point percent/100. */
+static inline void pid_p_gain_set(struct _pid *pid, int percent)
+{
+    int32_t gain = div_fp(int_tofp(percent), int_tofp(100));
+
+    pid->p_gain = gain;
+}
+
+/* Store the integral gain, given in percent, as fixed-point percent/100. */
+static inline void pid_i_gain_set(struct _pid *pid, int percent)
+{
+    int32_t gain = div_fp(int_tofp(percent), int_tofp(100));
+
+    pid->i_gain = gain;
+}
+
+/* Store the derivative gain, given in percent, as fixed-point percent/100. */
+static inline void pid_d_gain_set(struct _pid *pid, int percent)
+{
+    int32_t gain = div_fp(int_tofp(percent), int_tofp(100));
+
+    pid->d_gain = gain;
+}
+
+/*
+ * Run one step of the PID controller.
+ *
+ * @busy is a fixed-point value; the error is int_tofp(setpoint) - busy.
+ * Returns 0 when the error is within the deadband (in which case neither
+ * the integral nor last_err is updated), otherwise the sum of the P, I and
+ * D terms rounded to the nearest integer.
+ */
+static signed int pid_calc(struct _pid *pid, int32_t busy)
+{
+ signed int result;
+ int32_t pterm, dterm, fp_error;
+ int32_t integral_limit;
+
+ fp_error = int_tofp(pid->setpoint) - busy;
+
+ if (abs(fp_error) <= int_tofp(pid->deadband))
+ return 0;
+
+ pterm = mul_fp(pid->p_gain, fp_error);
+
+ pid->integral += fp_error;
+
+ /*
+ * We limit the integral here so that it will never
+ * get higher than 30. This prevents it from becoming
+ * too large an input over long periods of time and allows
+ * it to get factored out sooner.
+ * The value of 30 was chosen through experimentation.
+ */
+ integral_limit = int_tofp(30);
+ if (pid->integral > integral_limit)
+ pid->integral = integral_limit;
+ if (pid->integral < -integral_limit)
+ pid->integral = -integral_limit;
+
+ /* The derivative term uses the change in error since the last step. */
+ dterm = mul_fp(pid->d_gain, fp_error - pid->last_err);
+ pid->last_err = fp_error;
+
+ result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
+ /* Add one half (in fixed point) so the shift below rounds to nearest. */
+ result = result + (1 << (FRAC_BITS-1));
+ return (signed int)fp_toint(result);
+}
+
+/*
+ * Load the gains from pid_params into this CPU's controller and reset its
+ * state, using 100 as the initial busy value and 0 as the initial integral.
+ */
+static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu)
+{
+    struct _pid *pid = &cpu->pid;
+
+    pid_p_gain_set(pid, pid_params.p_gain_pct);
+    pid_d_gain_set(pid, pid_params.d_gain_pct);
+    pid_i_gain_set(pid, pid_params.i_gain_pct);
+
+    pid_reset(pid, pid_params.setpoint, 100, pid_params.deadband, 0);
+}
+
+/* Reset the PID controller of every online CPU that has driver data. */
+static inline void intel_pstate_reset_all_pid(void)
+{
+    unsigned int cpu;
+
+    for_each_online_cpu(cpu) {
+        struct cpudata *data = all_cpu_data[cpu];
+
+        if (!data)
+            continue;
+
+        intel_pstate_busy_pid_reset(data);
+    }
+}
+
+/*
+ * Refresh limits.turbo_disabled: set when MSR_IA32_MISC_ENABLE has the
+ * turbo-disable bit set, or when CPU 0's max and turbo P-states are equal.
+ *
+ * NOTE(review): this always dereferences all_cpu_data[0], whichever CPU is
+ * being handled -- confirm CPU 0 is always initialised before any other.
+ */
+static inline void update_turbo_state(void)
+{
+    const struct cpudata *cpu0 = all_cpu_data[0];
+    u64 misc_en;
+    int no_turbo_range;
+
+    rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
+    no_turbo_range = (cpu0->pstate.max_pstate == cpu0->pstate.turbo_pstate);
+
+    limits.turbo_disabled =
+        ((misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE) || no_turbo_range);
+}
+
+/* Return bits 14:8 of the BYT_RATIOS MSR. */
+static int byt_get_min_pstate(void)
+{
+    u64 ratios;
+
+    rdmsrl(BYT_RATIOS, ratios);
+    ratios >>= 8;
+
+    return ratios & 0x7F;
+}
+
+/* Return bits 22:16 of the BYT_RATIOS MSR. */
+static int byt_get_max_pstate(void)
+{
+    u64 ratios;
+
+    rdmsrl(BYT_RATIOS, ratios);
+    ratios >>= 16;
+
+    return ratios & 0x7F;
+}
+
+/* Return bits 6:0 of the BYT_TURBO_RATIOS MSR. */
+static int byt_get_turbo_pstate(void)
+{
+    u64 turbo_ratios;
+
+    rdmsrl(BYT_TURBO_RATIOS, turbo_ratios);
+
+    return turbo_ratios & 0x7F;
+}
+
+/*
+ * Write the requested pstate, together with a matching voltage ID, to
+ * MSR_IA32_PERF_CTL. The pstate goes into bits 8 and up and the VID is
+ * OR-ed into the low bits; bit 32 is set when turbo has been turned off
+ * on request while still being available.
+ */
+static void byt_set_pstate(struct cpudata *cpudata, int pstate)
+{
+ u64 val;
+ int32_t vid_fp;
+ u32 vid;
+
+ val = pstate << 8;
+ if (limits.no_turbo && !limits.turbo_disabled)
+ val |= (u64)1 << 32;
+
+ /* Interpolate the VID linearly from the pstate's offset above min. */
+ vid_fp = cpudata->vid.min + mul_fp(
+ int_tofp(pstate - cpudata->pstate.min_pstate),
+ cpudata->vid.ratio);
+
+ /* vid.min and vid.max are fixed point, so the clamp is done in fixed point. */
+ vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
+ vid = ceiling_fp(vid_fp);
+
+ /* Above the max non-turbo pstate the dedicated turbo VID is used. */
+ if (pstate > cpudata->pstate.max_pstate)
+ vid = cpudata->vid.turbo;
+
+ val |= vid;
+
+ wrmsrl(MSR_IA32_PERF_CTL, val);
+}
+
+#define BYT_BCLK_FREQS 5
+/* Table indexed by the frequency-select field of MSR_FSB_FREQ. */
+static const int byt_freq_table[BYT_BCLK_FREQS] = { 833, 1000, 1333, 1167, 800 };
+
+/*
+ * Return the scaling factor for this part: the byt_freq_table entry
+ * selected by the low bits of MSR_FSB_FREQ, multiplied by 100.
+ *
+ * NOTE(review): the 0x3 mask can only select entries 0..3, so the fifth
+ * table entry is unreachable -- confirm the field width against the
+ * processor documentation.
+ */
+static int byt_get_scaling(void)
+{
+    u64 value;
+    unsigned int i;
+
+    rdmsrl(MSR_FSB_FREQ, value);
+    i = value & 0x3;
+
+    /* Valid indices are 0 .. BYT_BCLK_FREQS - 1, so the bound is inclusive. */
+    BUG_ON(i >= BYT_BCLK_FREQS);
+
+    return byt_freq_table[i] * 100;
+}
+
+/*
+ * Read the voltage IDs from BYT_VIDS / BYT_TURBO_VIDS. min and max are
+ * stored in fixed point; ratio is the fixed-point VID step per pstate.
+ * Relies on pstate.min_pstate and pstate.max_pstate being set already.
+ *
+ * NOTE(review): the divisor is zero when max_pstate == min_pstate and is
+ * not guarded here -- confirm that cannot happen on supported parts.
+ */
+static void byt_get_vid(struct cpudata *cpudata)
+{
+ u64 value;
+
+ rdmsrl(BYT_VIDS, value);
+ cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
+ cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
+ cpudata->vid.ratio = div_fp(
+ cpudata->vid.max - cpudata->vid.min,
+ int_tofp(cpudata->pstate.max_pstate -
+ cpudata->pstate.min_pstate));
+
+ rdmsrl(BYT_TURBO_VIDS, value);
+ cpudata->vid.turbo = value & 0x7f;
+}
+
+/* Return bits 47:40 of MSR_INTEL_PLATFORM_INFO. */
+static int core_get_min_pstate(void)
+{
+    u64 platform_info;
+
+    rdmsrl(MSR_INTEL_PLATFORM_INFO, platform_info);
+    platform_info >>= 40;
+
+    return platform_info & 0xFF;
+}
+
+/* Return bits 15:8 of MSR_INTEL_PLATFORM_INFO. */
+static int core_get_max_pstate(void)
+{
+    u64 platform_info;
+
+    rdmsrl(MSR_INTEL_PLATFORM_INFO, platform_info);
+    platform_info >>= 8;
+
+    return platform_info & 0xFF;
+}
+
+/*
+ * Return the low byte of MSR_NHM_TURBO_RATIO_LIMIT, but never less than
+ * the value reported by core_get_max_pstate().
+ */
+static int core_get_turbo_pstate(void)
+{
+    u64 ratio_limit;
+    int max_nonturbo, turbo;
+
+    rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, ratio_limit);
+    max_nonturbo = core_get_max_pstate();
+    turbo = ratio_limit & 255;
+
+    return (turbo <= max_nonturbo) ? max_nonturbo : turbo;
+}
+
+/*
+ * Fixed scaling factor for the core_params models; multiplied by a pstate
+ * to obtain the policy min/max values.
+ */
+static inline int core_get_scaling(void)
+{
+ return 100000;
+}
+
+/*
+ * Write the requested pstate to MSR_IA32_PERF_CTL (pstate in bits 8 and
+ * up). Bit 32 is set when turbo has been turned off on request while still
+ * being available -- presumably the turbo-disengage bit; confirm against
+ * the SDM. The cpudata argument is not used by this variant.
+ */
+static void core_set_pstate(struct cpudata *cpudata, int pstate)
+{
+    u64 perf_ctl = pstate << 8;
+
+    if (limits.no_turbo && !limits.turbo_disabled)
+        perf_ctl |= (u64)1 << 32;
+
+    wrmsrl(MSR_IA32_PERF_CTL, perf_ctl);
+}
+
+/*
+ * Defaults for the models mapped to core_params in intel_pstate_cpu_ids.
+ * No get_vid callback is provided, so VID handling is skipped.
+ */
+static struct cpu_defaults core_params = {
+ .pid_policy = {
+ .sample_rate_ms = 10,
+ .deadband = 0,
+ .setpoint = 97,
+ .p_gain_pct = 20,
+ .d_gain_pct = 0,
+ .i_gain_pct = 0,
+ },
+ .funcs = {
+ .get_max = core_get_max_pstate,
+ .get_min = core_get_min_pstate,
+ .get_turbo = core_get_turbo_pstate,
+ .get_scaling = core_get_scaling,
+ .set = core_set_pstate,
+ },
+};
+
+/*
+ * Defaults for the models mapped to byt_params in intel_pstate_cpu_ids.
+ * Unlike core_params, this provides a get_vid callback and a non-zero
+ * integral gain.
+ */
+static struct cpu_defaults byt_params = {
+ .pid_policy = {
+ .sample_rate_ms = 10,
+ .deadband = 0,
+ .setpoint = 97,
+ .p_gain_pct = 14,
+ .d_gain_pct = 0,
+ .i_gain_pct = 4,
+ },
+ .funcs = {
+ .get_max = byt_get_max_pstate,
+ .get_min = byt_get_min_pstate,
+ .get_turbo = byt_get_turbo_pstate,
+ .set = byt_set_pstate,
+ .get_scaling = byt_get_scaling,
+ .get_vid = byt_get_vid,
+ },
+};
+
+/*
+ * Compute the pstate range currently allowed for @cpu.
+ *
+ * The upper reference is turbo_pstate, or max_pstate when turbo is off or
+ * unavailable. *max is that reference scaled by limits.max_perf and clamped
+ * to [min_pstate, turbo_pstate]; *min is the reference scaled by
+ * limits.min_perf and clamped to [min_pstate, reference].
+ */
+static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
+{
+ int max_perf = cpu->pstate.turbo_pstate;
+ int max_perf_adj;
+ int min_perf;
+
+ if (limits.no_turbo || limits.turbo_disabled)
+ max_perf = cpu->pstate.max_pstate;
+
+ /* performance can be limited by user through xenpm */
+ max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf));
+ *max = clamp_t(int, max_perf_adj,
+ cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
+
+ min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits.min_perf));
+ *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
+}
+
+/*
+ * Apply a new pstate to @cpu.
+ *
+ * Refreshes the turbo state and mirrors it into the CPU's cpufreq policy,
+ * clamps @pstate to the currently allowed range, and, if the result
+ * differs from the current pstate, records it and hands it to the
+ * model-specific set() callback.
+ */
+static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
+{
+    int max_perf, min_perf;
+    struct cpufreq_policy *policy;
+
+    policy = per_cpu(cpufreq_cpu_policy, cpu->cpu);
+
+    update_turbo_state();
+
+    if (limits.turbo_disabled)
+        policy->turbo = CPUFREQ_TURBO_UNSUPPORTED;
+    else if (limits.no_turbo)
+        policy->turbo = CPUFREQ_TURBO_DISABLED;
+    else
+        policy->turbo = CPUFREQ_TURBO_ENABLED;
+
+    intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
+
+    pstate = clamp_t(int, pstate, min_perf, max_perf);
+
+    if (pstate == cpu->pstate.current_pstate)
+        return;
+
+    cpu->pstate.current_pstate = pstate;
+    /*
+     * Use the model's scaling factor rather than a hard-coded 100000, so
+     * that policy->cur is in the same units as policy->min/max (which are
+     * computed as pstate * scaling) on models whose scaling differs.
+     */
+    policy->cur = pstate * cpu->pstate.scaling;
+
+    pstate_funcs.set(cpu, pstate);
+}
+
+/*
+ * Read the pstate range and scaling of @cpu through the model callbacks,
+ * fetch the VIDs when the model provides get_vid, then drop the CPU to its
+ * minimum pstate. The fields must be filled in before the final call,
+ * which reads them.
+ */
+static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
+{
+ cpu->pstate.min_pstate = pstate_funcs.get_min();
+ cpu->pstate.max_pstate = pstate_funcs.get_max();
+ cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
+ cpu->pstate.scaling = pstate_funcs.get_scaling();
+
+ if (pstate_funcs.get_vid)
+ pstate_funcs.get_vid(cpu);
+ intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
+}
+
+/*
+ * Derive core_pct_busy and freq from the APERF/MPERF deltas stored in
+ * cpu->sample. core_pct is the fixed-point value of aperf * 100 / mperf.
+ *
+ * NOTE(review): the division is not guarded against an MPERF delta of
+ * zero -- confirm that cannot occur between two samples.
+ */
+static inline void intel_pstate_calc_busy(struct cpudata *cpu)
+{
+ struct sample *sample = &cpu->sample;
+ int64_t core_pct;
+
+ core_pct = int_tofp(sample->aperf) * int_tofp(100);
+ core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
+
+ sample->freq = fp_toint(
+ mul_fp(int_tofp(
+ cpu->pstate.max_pstate * cpu->pstate.scaling / 100),
+ core_pct));
+
+ sample->core_pct_busy = (int32_t)core_pct;
+}
+
+/*
+ * Take a new utilisation sample: read APERF and MPERF with interrupts
+ * disabled, store the deltas against the previous readings together with
+ * the sample time, compute the busy figures, and remember the raw readings
+ * for the next sample.
+ */
+static inline void intel_pstate_sample(struct cpudata *cpu)
+{
+    unsigned long flags;
+    u64 aperf_now, mperf_now;
+
+    local_irq_save(flags);
+    rdmsrl(MSR_IA32_APERF, aperf_now);
+    rdmsrl(MSR_IA32_MPERF, mperf_now);
+    local_irq_restore(flags);
+
+    cpu->last_sample_time = cpu->sample.time;
+    cpu->sample.time = get_s_time();
+    cpu->sample.aperf = aperf_now - cpu->prev_aperf;
+    cpu->sample.mperf = mperf_now - cpu->prev_mperf;
+
+    intel_pstate_calc_busy(cpu);
+
+    cpu->prev_aperf = aperf_now;
+    cpu->prev_mperf = mperf_now;
+}
+
+/* Re-arm the sampling timer to fire pid_params.sample_rate_ms from now. */
+static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
+{
+ set_timer(&cpu->timer, NOW() + MILLISECS(pid_params.sample_rate_ms));
+}
+
+/*
+ * Return the fixed-point busy percentage of the last sample, normalised to
+ * the pstate that was requested, and scaled down when the time between the
+ * last two samples exceeded three sample intervals.
+ */
+static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+{
+ int32_t core_busy, max_pstate, current_pstate, sample_ratio;
+ u32 duration_us;
+ u32 sample_time_us;
+
+ /*
+ * core_busy is the ratio of actual performance to max
+ * max_pstate is the max non turbo pstate available
+ * current_pstate was the pstate that was requested during
+ * the last sample period.
+ *
+ * We normalize core_busy, which was our actual percent
+ * performance to what we requested during the last sample
+ * period. The result will be a percentage of busy at a
+ * specified pstate.
+ */
+ core_busy = cpu->sample.core_pct_busy;
+ max_pstate = int_tofp(cpu->pstate.max_pstate);
+ current_pstate = int_tofp(cpu->pstate.current_pstate);
+ core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
+
+ /*
+ * Since we have a deferred timer, it will not fire unless
+ * we are in C0. So, determine if the actual elapsed time
+ * is significantly greater (3x) than our sample interval. If it
+ * is, then we were idle for a long enough period of time
+ * to adjust our busyness.
+ */
+ sample_time_us = pid_params.sample_rate_ms * 1000ULL;
+ /* The sample times are divided by 1000 to obtain microseconds. */
+ duration_us = (u32)((s_time_t)(cpu->sample.time - cpu->last_sample_time)
+ / 1000);
+ if (duration_us > sample_time_us * 3) {
+ sample_ratio = div_fp(int_tofp(sample_time_us),
+ int_tofp(duration_us));
+ core_busy = mul_fp(core_busy, sample_ratio);
+ }
+
+ return core_busy;
+}
+
+/*
+ * Feed the scaled busy figure into the PID controller and apply its output
+ * to the current pstate. The controller output is subtracted, so a
+ * negative output raises the pstate and a positive one lowers it.
+ */
+static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
+{
+    int32_t busy_scaled = intel_pstate_get_scaled_busy(cpu);
+    signed int ctl = pid_calc(&cpu->pid, busy_scaled);
+
+    intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);
+}
+
+/*
+ * Sampling timer handler. __data is the struct cpudata registered with
+ * init_timer(); each run samples, adjusts the pstate and re-arms the timer.
+ */
+static void intel_pstate_timer_func(void *__data)
+{
+    struct cpudata *cpu = __data;
+
+    intel_pstate_sample(cpu);
+    intel_pstate_adjust_busy_pstate(cpu);
+    intel_pstate_set_sample_time(cpu);
+}
+
+/*
+ * Build an x86_cpu_id entry for an Intel family 6 model that requires
+ * X86_FEATURE_APERFMPERF; driver_data carries the address of the model's
+ * cpu_defaults.
+ */
+#define ICPU(model, policy) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
+ (unsigned long)&policy }
+
+/* Supported models and their defaults; terminated by an empty entry. */
+static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
+ ICPU(0x2a, core_params),
+ ICPU(0x2d, core_params),
+ ICPU(0x37, byt_params),
+ ICPU(0x3a, core_params),
+ ICPU(0x3c, core_params),
+ ICPU(0x3d, core_params),
+ ICPU(0x3e, core_params),
+ ICPU(0x3f, core_params),
+ ICPU(0x45, core_params),
+ ICPU(0x46, core_params),
+ ICPU(0x47, core_params),
+ ICPU(0x4c, byt_params),
+ ICPU(0x4e, core_params),
+ ICPU(0x4f, core_params),
+ ICPU(0x56, core_params),
+ {}
+};
+
+/*
+ * Set up driver state for CPU @cpunum: allocate its cpudata on first use,
+ * read its pstate range, reset the PID controller, take an initial sample
+ * and arm the sampling timer.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int intel_pstate_init_cpu(unsigned int cpunum)
+{
+ struct cpudata *cpu;
+ s_time_t expires;
+
+ /* An existing entry is reused rather than reallocated. */
+ if (!all_cpu_data[cpunum])
+ all_cpu_data[cpunum] = xzalloc(struct cpudata);
+ if (!all_cpu_data[cpunum])
+ return -ENOMEM;
+
+ cpu = all_cpu_data[cpunum];
+
+ cpu->cpu = cpunum;
+ intel_pstate_get_cpu_pstates(cpu);
+
+ init_timer(&cpu->timer, intel_pstate_timer_func, cpu, cpunum);
+ /* The first expiry uses a fixed 10ms rather than sample_rate_ms. */
+ expires = NOW() + MILLISECS(10);
+
+ intel_pstate_busy_pid_reset(cpu);
+ intel_pstate_sample(cpu);
+
+ set_timer(&cpu->timer, expires);
+
+ return 0;
+}
+
+/*
+ * Translate a cpufreq policy into the driver's global limits.
+ *
+ * A performance policy whose max reaches cpuinfo.max_freq pins both limits
+ * to 100% and re-enables turbo. Otherwise the min/max percentages are
+ * derived from policy->min/max relative to cpuinfo.max_freq and combined
+ * with policy->min_perf_pct/max_perf_pct; the resulting percentages are
+ * written back into the policy.
+ *
+ * Returns 0 on success or -ENODEV if cpuinfo.max_freq is zero.
+ */
+static int intel_pstate_set_policy(struct cpufreq_policy *policy)
+{
+ if (!policy->cpuinfo.max_freq)
+ return -ENODEV;
+
+ if (policy->policy == CPUFREQ_POLICY_PERFORMANCE &&
+ policy->max >= policy->cpuinfo.max_freq) {
+ limits.min_policy_pct = 100;
+ limits.min_perf_pct = 100;
+ limits.min_perf = int_tofp(1);
+ limits.max_policy_pct = 100;
+ limits.max_perf_pct = 100;
+ limits.max_perf = int_tofp(1);
+ limits.no_turbo = 0;
+ return 0;
+ }
+
+ /* The effective minimum is the larger of the two requested minimums. */
+ limits.min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
+ limits.min_policy_pct = clamp_t(int, limits.min_policy_pct, 0, 100);
+ limits.min_perf_pct = max(limits.min_policy_pct, policy->min_perf_pct);
+ policy->min_perf_pct = limits.min_perf_pct;
+ limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
+
+ /* The effective maximum is the smaller of the two requested maximums. */
+ limits.max_policy_pct = (policy->max * 100) / policy->cpuinfo.max_freq;
+ limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0, 100);
+ limits.max_perf_pct = min(limits.max_policy_pct, policy->max_perf_pct);
+ policy->max_perf_pct = limits.max_perf_pct;
+ limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
+
+ return 0;
+}
+
+/*
+ * Constrain the policy to the cpuinfo frequency range and accept only the
+ * powersave and performance policies.
+ *
+ * Returns 0 if the policy type is supported, -EINVAL otherwise.
+ */
+static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
+{
+    cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+                                 policy->cpuinfo.max_freq);
+
+    if (policy->policy == CPUFREQ_POLICY_POWERSAVE ||
+        policy->policy == CPUFREQ_POLICY_PERFORMANCE)
+        return 0;
+
+    return -EINVAL;
+}
+
+/*
+ * Stop managing the policy's CPU: kill its sampling timer and drop it to
+ * the minimum pstate. The per-CPU data is left allocated. Always returns 0.
+ */
+static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+{
+    int cpu_num = policy->cpu;
+    struct cpudata *cpu = all_cpu_data[cpu_num];
+
+    kill_timer(&cpu->timer);
+    intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
+
+    return 0;
+}
+
+/*
+ * Apply a turbo on/off request carried in policy->turbo.
+ *
+ * Fails with -EINVAL when turbo is unavailable. Otherwise updates
+ * limits.no_turbo, recomputes cpuinfo.max_freq from CPU 0's max or turbo
+ * pstate, and clamps policy->max into the cpuinfo range. The cpuid
+ * argument is not used.
+ */
+static int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy *policy)
+{
+    struct cpudata *cpu = all_cpu_data[0];
+    int top_pstate;
+
+    update_turbo_state();
+    if (limits.turbo_disabled) {
+        printk("Turbo disabled by BIOS or not supported on processor\n");
+        return -EINVAL;
+    }
+
+    limits.no_turbo = (policy->turbo == CPUFREQ_TURBO_ENABLED) ? 0 : 1;
+
+    top_pstate = limits.no_turbo ? cpu->pstate.max_pstate
+                                 : cpu->pstate.turbo_pstate;
+    policy->cpuinfo.max_freq = top_pstate * cpu->pstate.scaling;
+
+    policy->max = clamp_t(unsigned int, policy->max,
+                          policy->cpuinfo.min_freq, policy->cpuinfo.max_freq);
+
+    return 0;
+}
+
+/*
+ * Return the percentage of CPU 0's pstates (min..turbo, inclusive) that
+ * lie above the maximum non-turbo pstate.
+ */
+static int get_turbo_pct(void)
+{
+    const struct cpudata *cpu = all_cpu_data[0];
+    int all_states = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
+    int nonturbo_states = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
+    uint32_t nonturbo_share;
+
+    nonturbo_share = div_fp(int_tofp(nonturbo_states), int_tofp(all_states));
+
+    return 100 - fp_toint(mul_fp(nonturbo_share, int_tofp(100)));
+}
+
+/* Return the number of pstates of CPU 0, from min to turbo inclusive. */
+static unsigned int get_pstates_num(void)
+{
+    const struct pstate_data *ps = &all_cpu_data[0]->pstate;
+    int count = ps->turbo_pstate - ps->min_pstate + 1;
+
+    return count;
+}
+
+/*
+ * cpufreq init callback: set up the driver state for policy->cpu and fill
+ * in the policy's initial values.
+ *
+ * Returns 0 on success or the error from intel_pstate_init_cpu().
+ */
+static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
+{
+ struct cpudata *cpu;
+ int rc;
+
+ rc = intel_pstate_init_cpu(policy->cpu);
+ if (rc)
+ return rc;
+
+ cpu = all_cpu_data[policy->cpu];
+ /* Both limits at 100% correspond to the performance policy. */
+ if (limits.min_perf_pct == 100 && limits.max_perf_pct == 100)
+ policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+ else
+ policy->policy = CPUFREQ_POLICY_POWERSAVE;
+
+ policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
+ policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+ policy->min_perf_pct = 0;
+ policy->max_perf_pct = 100;
+ policy->turbo_pct = get_turbo_pct();
+ policy->pstates_num = get_pstates_num();
+
+ /* cpuinfo and default policy values */
+ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
+ policy->cpuinfo.max_freq =
+ cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+ policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ cpumask_set_cpu(policy->cpu, policy->cpus);
+
+ return 0;
+}
+
+/* Driver descriptor registered from intel_pstate_init(). */
+static struct cpufreq_driver intel_pstate_driver = {
+ .verify = intel_pstate_verify_policy,
+ .setpolicy = intel_pstate_set_policy,
+ .init = intel_pstate_cpu_init,
+ .exit = intel_pstate_cpu_exit,
+ .update = intel_pstate_turbo_update,
+ .name = "intel_pstate",
+};
+
+/*
+ * Sanity-check the MSRs the driver relies on.
+ *
+ * Returns -ENODEV when any of the max/min/turbo pstate getters reports 0,
+ * or when APERF or MPERF did not change between two reads; 0 otherwise.
+ */
+static int intel_pstate_msrs_not_valid(void)
+{
+    u64 aperf_before, mperf_before, now;
+
+    rdmsrl(MSR_IA32_APERF, aperf_before);
+    rdmsrl(MSR_IA32_MPERF, mperf_before);
+
+    if (!pstate_funcs.get_max())
+        return -ENODEV;
+    if (!pstate_funcs.get_min())
+        return -ENODEV;
+    if (!pstate_funcs.get_turbo())
+        return -ENODEV;
+
+    /* Both counters must have advanced since the first reads. */
+    rdmsrl(MSR_IA32_APERF, now);
+    if (now == aperf_before)
+        return -ENODEV;
+
+    rdmsrl(MSR_IA32_MPERF, now);
+    if (now == mperf_before)
+        return -ENODEV;
+
+    return 0;
+}
+
+/* Install @policy as the active PID/sampling tunables (all six fields). */
+static void copy_pid_params(struct pstate_adjust_policy *policy)
+{
+    pid_params = *policy;
+}
+
+/* Install @funcs as the active model callbacks (all six pointers). */
+static void copy_cpu_funcs(struct pstate_funcs *funcs)
+{
+    pstate_funcs = *funcs;
+}
+
+/*
+ * Driver entry point.
+ *
+ * Sets the APERFMPERF feature bit when CPUID leaf 6 reports it, matches the
+ * boot CPU against intel_pstate_cpu_ids, installs the matched model's
+ * tunables and callbacks, validates the MSRs, allocates the per-CPU pointer
+ * array and registers the cpufreq driver.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, and
+ * -ENODEV if the CPU is unsupported, the MSR check fails or driver
+ * registration fails.
+ */
+int __init intel_pstate_init(void)
+{
+    unsigned int cpu;
+    int rc;
+    const struct x86_cpu_id *id;
+    struct cpu_defaults *cpu_info;
+
+    if (cpuid_ecx(6) & 0x1)
+        set_bit(X86_FEATURE_APERFMPERF, &boot_cpu_data.x86_capability);
+
+    id = x86_match_cpu(intel_pstate_cpu_ids);
+    if (!id)
+        return -ENODEV;
+
+    cpu_info = (struct cpu_defaults *)id->driver_data;
+
+    copy_pid_params(&cpu_info->pid_policy);
+    copy_cpu_funcs(&cpu_info->funcs);
+
+    if (intel_pstate_msrs_not_valid())
+        return -ENODEV;
+
+    /*
+     * The array is indexed by CPU number, so it must cover every possible
+     * CPU id, not just the number of CPUs online at this point.
+     */
+    all_cpu_data = xzalloc_array(struct cpudata *, nr_cpu_ids);
+    if (!all_cpu_data)
+        return -ENOMEM;
+
+    rc = cpufreq_register_driver(&intel_pstate_driver);
+    if (!rc)
+        return 0;
+
+    /* Registration failed: release anything that was set up. */
+    for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+        if (all_cpu_data[cpu]) {
+            kill_timer(&all_cpu_data[cpu]->timer);
+            xfree(all_cpu_data[cpu]);
+        }
+    }
+    xfree(all_cpu_data);
+    /* Do not leave a dangling pointer behind for later users. */
+    all_cpu_data = NULL;
+
+    return -ENODEV;
+}
diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h
index e288964..9136e5f 100644
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -34,6 +34,12 @@ struct acpi_cpufreq_data {
extern struct acpi_cpufreq_data *cpufreq_drv_data[NR_CPUS];
+/*
+ * Maximum transition latency is in nanoseconds - if it's unknown,
+ * CPUFREQ_ETERNAL shall be used.
+ */
+#define CPUFREQ_ETERNAL (-1)
+
struct cpufreq_cpuinfo {
unsigned int max_freq;
unsigned int second_max_freq; /* P1 if Turbo Mode is on */
diff --git a/xen/include/acpi/cpufreq/processor_perf.h b/xen/include/acpi/cpufreq/processor_perf.h
index d8a1ba6..ebff11d 100644
--- a/xen/include/acpi/cpufreq/processor_perf.h
+++ b/xen/include/acpi/cpufreq/processor_perf.h
@@ -7,6 +7,7 @@
#define XEN_PX_INIT 0x80000000
+int intel_pstate_init(void);
int powernow_cpufreq_init(void);
unsigned int powernow_register_driver(void);
unsigned int get_measured_perf(unsigned int cpu, unsigned int flag);
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 7963a3a..efc9711 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -69,6 +69,7 @@
#define X86_FEATURE_XTOPOLOGY (3*32+13) /* cpu topology enum extensions */
#define X86_FEATURE_CPUID_FAULTING (3*32+14) /* cpuid faulting */
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+15) /* clflush reqd with monitor */
+#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 83f2f70..57945d9 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -52,6 +52,8 @@
#define MSR_IA32_MCG_STATUS 0x0000017a
#define MSR_IA32_MCG_CTL 0x0000017b
+#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
+
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
@@ -319,6 +321,7 @@
#define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18)
#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1<<22)
#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL<<38)
#define MSR_IA32_TSC_DEADLINE 0x000006E0
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
--
1.9.1
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver
2015-04-23 14:00 [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver Wei Wang
@ 2015-04-25 18:05 ` Julien Grall
2015-04-26 3:32 ` Wang, Wei W
0 siblings, 1 reply; 6+ messages in thread
From: Julien Grall @ 2015-04-25 18:05 UTC (permalink / raw)
To: Wei Wang, jbeulich, andrew.cooper3, ian.campbell, kevin.tian,
eddie.dong, xen-devel
Cc: yang.z.zhang
Hi Wei,
On 23/04/2016 18:58, Wei Wang wrote:
> diff --git a/xen/include/acpi/cpufreq/processor_perf.h b/xen/include/acpi/cpufreq/processor_perf.h
> index d8a1ba6..ebff11d 100644
> --- a/xen/include/acpi/cpufreq/processor_perf.h
> +++ b/xen/include/acpi/cpufreq/processor_perf.h
> @@ -7,6 +7,7 @@
>
> #define XEN_PX_INIT 0x80000000
>
> +int intel_pstate_init(void);
The intel pstate driver is x86 specific, whereas xen/include/acpi
contains common headers for common code.
Can you move the declaration to an x86-specific header (i.e. in
xen/include/asm-x86)?
If I'm not mistaken, you have other patches with similar things in this
series.
Regards,
--
Julien Grall
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver
2015-04-25 18:05 ` Julien Grall
@ 2015-04-26 3:32 ` Wang, Wei W
2015-04-26 13:02 ` Julien Grall
0 siblings, 1 reply; 6+ messages in thread
From: Wang, Wei W @ 2015-04-26 3:32 UTC (permalink / raw)
To: Julien Grall, jbeulich@suse.com, andrew.cooper3@citrix.com,
ian.campbell@citrix.com, Tian, Kevin, Dong, Eddie,
xen-devel@lists.xen.org
Cc: Zhang, Yang Z
Hi Julien,
On 24/04/2015 20:57, Julien Grall wrote
> On 23/04/2016 18:58, Wei Wang wrote:
> > diff --git a/xen/include/acpi/cpufreq/processor_perf.h
> > b/xen/include/acpi/cpufreq/processor_perf.h
> > index d8a1ba6..ebff11d 100644
> > --- a/xen/include/acpi/cpufreq/processor_perf.h
> > +++ b/xen/include/acpi/cpufreq/processor_perf.h
> > @@ -7,6 +7,7 @@
> >
> > #define XEN_PX_INIT 0x80000000
> >
> > +int intel_pstate_init(void);
>
> The intel pstate driver is x86 specific. Although xen/include/acpi contains
> common headers for common code.
Thanks for your comments. But I saw "int powernow_cpufreq_init(void);" is put there.
Best,
Wei
> Can you move the declaration in an x86 specific header (i.e in
> xen/include/asm-x86)?
>
> If I'm not mistaken, you have other patch with similar things in this series.
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver
2015-04-26 3:32 ` Wang, Wei W
@ 2015-04-26 13:02 ` Julien Grall
2015-04-27 1:23 ` Wang, Wei W
2015-04-28 13:17 ` Jan Beulich
0 siblings, 2 replies; 6+ messages in thread
From: Julien Grall @ 2015-04-26 13:02 UTC (permalink / raw)
To: Wang, Wei W, Julien Grall, jbeulich@suse.com,
andrew.cooper3@citrix.com, ian.campbell@citrix.com, Tian, Kevin,
Dong, Eddie, xen-devel@lists.xen.org
Cc: Zhang, Yang Z
On 26/04/2015 05:32, Wang, Wei W wrote:
> Hi Julien,
Hi Wei,
> On 24/04/2015 20:57, Julien Grall wrote
>> On 23/04/2016 18:58, Wei Wang wrote:
>>> diff --git a/xen/include/acpi/cpufreq/processor_perf.h
>>> b/xen/include/acpi/cpufreq/processor_perf.h
>>> index d8a1ba6..ebff11d 100644
>>> --- a/xen/include/acpi/cpufreq/processor_perf.h
>>> +++ b/xen/include/acpi/cpufreq/processor_perf.h
>>> @@ -7,6 +7,7 @@
>>>
>>> #define XEN_PX_INIT 0x80000000
>>>
>>> +int intel_pstate_init(void);
>>
>> The intel pstate driver is x86 specific. Although xen/include/acpi contains
>> common headers for common code.
>
> Thanks for your comments. But I saw "int powernow_cpufreq_init(void);" is put there.
FWIW, this prototype doesn't have any implementation even on x86.
While some drivers (such as the x86 powernow) currently define
prototypes in the common header, this is wrong: the common code should
not be able to call those functions.
There is ongoing work to support ACPI on ARM (an RFC was sent a
couple of months ago). Adding new x86 prototypes in this directory
complicates the splitting. Please help us to at least avoid adding new
x86-specific prototypes/code to the common code when possible.
We will take care of moving the current x86 prototypes/code into the
arch-specific directories.
However, I'm not a maintainer; they may have a different opinion on
this point.
Regards,
--
Julien Grall
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver
2015-04-26 13:02 ` Julien Grall
@ 2015-04-27 1:23 ` Wang, Wei W
2015-04-28 13:17 ` Jan Beulich
1 sibling, 0 replies; 6+ messages in thread
From: Wang, Wei W @ 2015-04-27 1:23 UTC (permalink / raw)
To: Julien Grall, jbeulich@suse.com, andrew.cooper3@citrix.com,
ian.campbell@citrix.com, Tian, Kevin, Dong, Eddie,
xen-devel@lists.xen.org
Cc: Zhang, Yang Z
On 26/04/2015 01:33, Julien Grall wrote
Hi Julien,
> On 26/04/2015 05:32, Wang, Wei W wrote:
> > On 24/04/2015 20:57, Julien Grall wrote
> >> On 23/04/2016 18:58, Wei Wang wrote:
> >>> diff --git a/xen/include/acpi/cpufreq/processor_perf.h
> >>> b/xen/include/acpi/cpufreq/processor_perf.h
> >>> index d8a1ba6..ebff11d 100644
> >>> --- a/xen/include/acpi/cpufreq/processor_perf.h
> >>> +++ b/xen/include/acpi/cpufreq/processor_perf.h
> >>> @@ -7,6 +7,7 @@
> >>>
> >>> #define XEN_PX_INIT 0x80000000
> >>>
> >>> +int intel_pstate_init(void);
> >>
> >> The intel pstate driver is x86 specific. Although xen/include/acpi
> >> contains common headers for common code.
> >
> > Thanks for your comments. But I saw "int powernow_cpufreq_init(void);"
> is put there.
>
> FWIW, this prototype doesn't have any implementation even on x86.
>
> While currently some drivers (such as the x86 powernow) may define
> prototype in the common header. This is wrong, the common code should
> not be able to call those functions.
>
> There is an ongoing support on ACPI for ARM (an RFC has been sent a couple
> of months ago). Adding new x86 prototype in this directory complicate the
> splitting. Please help us to at least avoid adding new
> x86 specific prototype/code in the common code when it's possible.
>
> We will take care of moving the current x86 prototype/code in the arch-
> specific directories.
>
> Although, I'm not a maintainer. They may have a different opinion on this
> point.
Sure. I will do it if maintainers don't have a different opinion.
Best,
Wei
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver
2015-04-26 13:02 ` Julien Grall
2015-04-27 1:23 ` Wang, Wei W
@ 2015-04-28 13:17 ` Jan Beulich
1 sibling, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2015-04-28 13:17 UTC (permalink / raw)
To: wei.w.wang
Cc: kevin.tian, ian.campbell, andrew.cooper3, eddie.dong, xen-devel,
julien.grall, yang.z.zhang
>>> Julien Grall <julien.grall@citrix.com> 04/26/15 7:33 PM >>>
>On 26/04/2015 05:32, Wang, Wei W wrote:
>> Thanks for your comments. But I saw "int powernow_cpufreq_init(void);" is put there.
>
>FWIW, this prototype doesn't have any implementation even on x86.
>
>While currently some drivers (such as the x86 powernow) may define
>prototype in the common header. This is wrong, the common code should
>not be able to call those functions.
>
>There is an ongoing support on ACPI for ARM (an RFC has been sent a
>couple of months ago). Adding new x86 prototype in this directory
>complicate the splitting. Please help us to at least avoid adding new
>x86 specific prototype/code in the common code when it's possible.
>
>We will take care of moving the current x86 prototype/code in the
>arch-specific directories.
>
>Although, I'm not a maintainer. They may have a different opinion on
>this point.
I fully agree - bad examples shouldn't lead to more bad stuff getting added.
Jan
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2015-04-28 13:17 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-04-23 14:00 [PATCH 6/9] x86/intel_pstate: the main boby of the intel_pstate driver Wei Wang
2015-04-25 18:05 ` Julien Grall
2015-04-26 3:32 ` Wang, Wei W
2015-04-26 13:02 ` Julien Grall
2015-04-27 1:23 ` Wang, Wei W
2015-04-28 13:17 ` Jan Beulich
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.