- * [PATCH v6 1/6] x86/intel_pstate: add some calculation related support
  2015-10-28  3:21 [PATCH v6 0/6] Porting the intel_pstate driver to Xen Wei Wang
@ 2015-10-28  3:21 ` Wei Wang
  2015-11-17 17:36   ` Jan Beulich
  2015-10-28  3:21 ` [PATCH v6 2/6] x86/intel_pstate: introduce the internal_governor struct Wei Wang
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Wei Wang @ 2015-10-28  3:21 UTC (permalink / raw)
  To: jbeulich, andrew.cooper3, xen-devel, wei.liu2; +Cc: Wei Wang
The added calculation related functions will be used in the intel_pstate.c.
They are copied from the Linux kernel(commit 2418f4f2, f3002134, eb18cba7).
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 changes in v6:
 1) #define clamp() - remove the typecast on the result of max();
 2) add some comments to explain the functionality of some functions;
 3) alignment changes.
 xen/arch/x86/oprofile/op_model_athlon.c |  9 ----
 xen/include/asm-x86/div64.h             | 91 +++++++++++++++++++++++++++++++++
 xen/include/xen/kernel.h                | 23 +++++++++
 3 files changed, 114 insertions(+), 9 deletions(-)
diff --git a/xen/arch/x86/oprofile/op_model_athlon.c b/xen/arch/x86/oprofile/op_model_athlon.c
index c0a81ed..4122eee 100644
--- a/xen/arch/x86/oprofile/op_model_athlon.c
+++ b/xen/arch/x86/oprofile/op_model_athlon.c
@@ -103,15 +103,6 @@ static u64 ibs_op_ctl;
 #define IBS_FETCH_CODE                  13
 #define IBS_OP_CODE                     14
 
-#define clamp(val, min, max) ({			\
-	typeof(val) __val = (val);		\
-	typeof(min) __min = (min);		\
-	typeof(max) __max = (max);		\
-	(void) (&__val == &__min);		\
-	(void) (&__val == &__max);		\
-	__val = __val < __min ? __min: __val;	\
-	__val > __max ? __max: __val; })
-
 /*
  * 16-bit Linear Feedback Shift Register (LFSR)
  */
diff --git a/xen/include/asm-x86/div64.h b/xen/include/asm-x86/div64.h
index dd49f64..b20df2d 100644
--- a/xen/include/asm-x86/div64.h
+++ b/xen/include/asm-x86/div64.h
@@ -11,4 +11,95 @@
     __rem;                                      \
 })
 
+/*
+ * div_u64_rem - unsigned 64bit divide with 32bit divisor
+ * @dividend:  64bit dividend
+ * @divisor:   32bit divisor
+ * @remainder: 32bit remainder
+ */
+static inline uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor,
+                                   uint32_t *remainder)
+{
+    *remainder = do_div(dividend, divisor);
+    return dividend;
+}
+
+static inline uint64_t div_u64(uint64_t dividend, uint32_t  divisor)
+{
+    uint32_t remainder;
+
+    return div_u64_rem(dividend, divisor, &remainder);
+}
+
+/*
+ * div64_u64 - unsigned 64bit divide with 64bit divisor
+ * @dividend: 64bit dividend
+ * @divisor:  64bit divisor
+ *
+ * This implementation is a modified version of the algorithm proposed
+ * by the book 'Hacker's Delight'.  The original source and full proof
+ * can be found here and is available for use without restriction.
+ *
+ * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt'
+ */
+static inline uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
+{
+    uint32_t high = divisor >> 32;
+    uint64_t quot;
+
+    if ( high == 0 )
+        quot = div_u64(dividend, divisor);
+    else
+    {
+        int n = 1 + fls(high);
+
+        quot = div_u64(dividend >> n, divisor >> n);
+
+        if ( quot != 0 )
+            quot--;
+        if ( (dividend - quot * divisor) >= divisor )
+            quot++;
+    }
+    return quot;
+}
+
+/*
+ * div_u64_rem - signed 64bit divide with 32bit divisor
+ * @dividend:  64bit dividend
+ * @divisor:   32bit divisor
+ * @remainder: 32bit remainder
+ */
+static inline int64_t div_s64_rem(int64_t dividend, int32_t divisor,
+                                  int32_t *remainder)
+{
+    int64_t quotient;
+
+    if ( dividend < 0 )
+    {
+        quotient = div_u64_rem(-dividend, ABS(divisor),
+                               (uint32_t *)remainder);
+        *remainder = -*remainder;
+        if ( divisor > 0 )
+            quotient = -quotient;
+    }
+    else
+    {
+        quotient = div_u64_rem(dividend, ABS(divisor),
+                        (uint32_t *)remainder);
+        if ( divisor < 0 )
+            quotient = -quotient;
+    }
+    return quotient;
+}
+
+/*
+ * div_s64 - signed 64bit divide with 32bit divisor
+ */
+static inline int64_t div_s64(int64_t dividend, int32_t divisor)
+{
+    int32_t remainder;
+
+    return div_s64_rem(dividend, divisor, &remainder);
+}
+
 #endif
diff --git a/xen/include/xen/kernel.h b/xen/include/xen/kernel.h
index 548b64d..96c7948 100644
--- a/xen/include/xen/kernel.h
+++ b/xen/include/xen/kernel.h
@@ -43,6 +43,29 @@
 #define MAX(x,y) ((x) > (y) ? (x) : (y))
 
 /**
+ * clamp - return a value clamped to a given range with strict typechecking
+ * @val: current value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
+ *
+ * This macro does strict typechecking of lo/hi to make sure they are of the
+ * same type as val.  See the unnecessary pointer comparisons.
+ */
+#define clamp(val, lo, hi) min(max(val, lo), hi)
+
+/*
+ * clamp_t - return a value clamped to a given range using a given type
+ * @type: the type of variable to use
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of type
+ * 'type' to make all the comparisons.
+ */
+#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
+
+/**
  * container_of - cast a member of a structure out to the containing structure
  *
  * @ptr:	the pointer to the member.
-- 
1.9.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread
- * [PATCH v6 2/6] x86/intel_pstate: introduce the internal_governor struct
  2015-10-28  3:21 [PATCH v6 0/6] Porting the intel_pstate driver to Xen Wei Wang
  2015-10-28  3:21 ` [PATCH v6 1/6] x86/intel_pstate: add some calculation related support Wei Wang
@ 2015-10-28  3:21 ` Wei Wang
  2015-11-20 11:48   ` Jan Beulich
  2015-10-28  3:21 ` [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver Wei Wang
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Wei Wang @ 2015-10-28  3:21 UTC (permalink / raw)
  To: jbeulich, andrew.cooper3, xen-devel, wei.liu2; +Cc: Wei Wang
Introduce a simple internal_governor struct to manage internal
governor related variables. Also, add a condition check in
cpufreq_del_cpu to avoid going through the old ACPI governor
framework when an internal governor is in use.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 changes in v6:
 1) create this patch by spliting it from the next big one.
 xen/drivers/cpufreq/cpufreq.c      | 5 +++--
 xen/include/acpi/cpufreq/cpufreq.h | 7 +++++++
 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/xen/drivers/cpufreq/cpufreq.c b/xen/drivers/cpufreq/cpufreq.c
index 6e666e4..2c1c713 100644
--- a/xen/drivers/cpufreq/cpufreq.c
+++ b/xen/drivers/cpufreq/cpufreq.c
@@ -334,8 +334,9 @@ int cpufreq_del_cpu(unsigned int cpu)
 
     /* for HW_ALL, stop gov for each core of the _PSD domain */
     /* for SW_ALL & SW_ANY, stop gov for the 1st core of the _PSD domain */
-    if (hw_all || (cpumask_weight(cpufreq_dom->map) ==
-                   perf->domain_info.num_processors))
+    if (!policy->internal_gov &&
+        (hw_all || (cpumask_weight(cpufreq_dom->map) ==
+                    perf->domain_info.num_processors)))
         __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 
     cpufreq_statistic_exit(cpu);
diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h
index 48ad1d0..8947368 100644
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -53,6 +53,12 @@ struct perf_limits {
     uint32_t min_policy_pct;
 };
 
+struct internal_governor {
+    char *avail_gov;
+    uint32_t gov_num;
+    uint32_t cur_gov;
+};
+
 struct cpufreq_policy {
     cpumask_var_t       cpus;          /* affected CPUs */
     unsigned int        shared_type;   /* ANY or ALL affected CPUs
@@ -66,6 +72,7 @@ struct cpufreq_policy {
                                  * governors are used */
     struct perf_limits  limits;
     struct cpufreq_governor     *governor;
+    struct internal_governor    *internal_gov;
 
     bool_t              resume; /* flag for cpufreq 1st run
                                  * S3 wakeup, hotplug cpu, etc */
-- 
1.9.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread
- * Re: [PATCH v6 2/6] x86/intel_pstate: introduce the internal_governor struct
  2015-10-28  3:21 ` [PATCH v6 2/6] x86/intel_pstate: introduce the internal_governor struct Wei Wang
@ 2015-11-20 11:48   ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2015-11-20 11:48 UTC (permalink / raw)
  To: Wei Wang; +Cc: andrew.cooper3, wei.liu2, xen-devel
>>> On 28.10.15 at 04:21, <wei.w.wang@intel.com> wrote:
> --- a/xen/drivers/cpufreq/cpufreq.c
> +++ b/xen/drivers/cpufreq/cpufreq.c
> @@ -334,8 +334,9 @@ int cpufreq_del_cpu(unsigned int cpu)
>  
>      /* for HW_ALL, stop gov for each core of the _PSD domain */
>      /* for SW_ALL & SW_ANY, stop gov for the 1st core of the _PSD domain */
> -    if (hw_all || (cpumask_weight(cpufreq_dom->map) ==
> -                   perf->domain_info.num_processors))
> +    if (!policy->internal_gov &&
> +        (hw_all || (cpumask_weight(cpufreq_dom->map) ==
> +                    perf->domain_info.num_processors)))
>          __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
Why do we need such a guard here, but not on any of the other
__cpufreq_governor() invocations? (If they will end up
unreachable by the following patches, then I'd still expect one
or more ASSERT()s to be added to document/enforce this.)
> --- a/xen/include/acpi/cpufreq/cpufreq.h
> +++ b/xen/include/acpi/cpufreq/cpufreq.h
> @@ -53,6 +53,12 @@ struct perf_limits {
>      uint32_t min_policy_pct;
>  };
>  
> +struct internal_governor {
> +    char *avail_gov;
> +    uint32_t gov_num;
> +    uint32_t cur_gov;
> +};
Apart from the fields being meaningless without user I don't think
the gov_ prefix and the _gov suffixes are really helpful here.
Also "num" isn't really unambiguous - if you mean "count", please
use "count". If you mean something else, please pick an
unambiguous field name.
Jan
^ permalink raw reply	[flat|nested] 14+ messages in thread
 
- * [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver
  2015-10-28  3:21 [PATCH v6 0/6] Porting the intel_pstate driver to Xen Wei Wang
  2015-10-28  3:21 ` [PATCH v6 1/6] x86/intel_pstate: add some calculation related support Wei Wang
  2015-10-28  3:21 ` [PATCH v6 2/6] x86/intel_pstate: introduce the internal_governor struct Wei Wang
@ 2015-10-28  3:21 ` Wei Wang
  2015-11-20 14:09   ` Jan Beulich
  2015-10-28  3:21 ` [PATCH v6 4/6] x86/intel_pstate: add a booting param to select the driver to load Wei Wang
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Wei Wang @ 2015-10-28  3:21 UTC (permalink / raw)
  To: jbeulich, andrew.cooper3, xen-devel, wei.liu2; +Cc: Wei Wang
We simply grab the fundamental logic of the intel_pstate driver
from Linux kernel, and customize it to Xen style. In the kernel,
a user can adjust the limits via sysfs
(limits.min_sysfs_pct/max_sysfs_pct). In Xen, the
policy->limits.min_perf_pct/max_perf_pct acts as the transit
station. A user interacts with it via xenpm.
The new xen/include/asm-x86/cpufreq.h header file is added.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 changes in v6:
 1) change some of the unnecessary signed types to be unsigned, as requested by Jan in v2;
 2) remove "__ready_mostly" from the local variable, load, in intel_pstate_init();
 3) coding style changes.
 xen/arch/x86/acpi/cpufreq/Makefile       |   1 +
 xen/arch/x86/acpi/cpufreq/intel_pstate.c | 882 +++++++++++++++++++++++++++++++
 xen/include/acpi/cpufreq/cpufreq.h       |   6 +
 xen/include/asm-x86/cpufreq.h            |  31 ++
 xen/include/asm-x86/msr-index.h          |   3 +
 5 files changed, 923 insertions(+)
 create mode 100644 xen/arch/x86/acpi/cpufreq/intel_pstate.c
 create mode 100644 xen/include/asm-x86/cpufreq.h
diff --git a/xen/arch/x86/acpi/cpufreq/Makefile b/xen/arch/x86/acpi/cpufreq/Makefile
index f75da9b..99fa9f4 100644
--- a/xen/arch/x86/acpi/cpufreq/Makefile
+++ b/xen/arch/x86/acpi/cpufreq/Makefile
@@ -1,2 +1,3 @@
 obj-y += cpufreq.o
+obj-y += intel_pstate.o
 obj-y += powernow.o
diff --git a/xen/arch/x86/acpi/cpufreq/intel_pstate.c b/xen/arch/x86/acpi/cpufreq/intel_pstate.c
new file mode 100644
index 0000000..020abda
--- /dev/null
+++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c
@@ -0,0 +1,882 @@
+#include <xen/kernel.h>
+#include <xen/types.h>
+#include <xen/init.h>
+#include <xen/bitmap.h>
+#include <xen/cpumask.h>
+#include <xen/timer.h>
+#include <asm/msr.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/div64.h>
+#include <asm/cpufreq.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+#define BYT_RATIOS       0x66a
+#define BYT_VIDS         0x66b
+#define BYT_TURBO_RATIOS 0x66c
+#define BYT_TURBO_VIDS   0x66d
+
+#define FRAC_BITS 8
+#define int_tofp(X) ((uint64_t)(X) << FRAC_BITS)
+#define fp_toint(X) ((X) >> FRAC_BITS)
+
+static inline uint32_t mul_fp(uint32_t x, uint32_t y)
+{
+    return ((uint64_t)x * (uint64_t)y) >> FRAC_BITS;
+}
+
+static inline uint32_t div_fp(uint32_t x, uint32_t y)
+{
+    return div_s64((uint64_t)x << FRAC_BITS, y);
+}
+
+static inline uint32_t ceiling_fp(uint32_t x)
+{
+    uint32_t mask, ret;
+
+    ret = fp_toint(x);
+    mask = (1 << FRAC_BITS) - 1;
+    if ( x & mask )
+        ret += 1;
+    return ret;
+}
+
+struct sample {
+    uint32_t core_pct_busy;
+    uint64_t aperf;
+    uint64_t mperf;
+    uint32_t freq;
+    s_time_t time;
+};
+
+struct pstate_data {
+    uint32_t    current_pstate;
+    uint32_t    min_pstate;
+    uint32_t    max_pstate;
+    uint32_t    scaling;
+    uint32_t    turbo_pstate;
+};
+
+struct vid_data {
+    uint32_t min;
+    uint32_t max;
+    uint32_t turbo;
+    uint32_t ratio;
+};
+
+struct _pid {
+    uint32_t setpoint;
+    uint32_t integral;
+    uint32_t p_gain;
+    uint32_t i_gain;
+    uint32_t d_gain;
+    uint32_t deadband;
+    int32_t last_err;
+};
+
+struct cpudata {
+    int cpu;
+
+    struct timer timer;
+
+    struct pstate_data pstate;
+    struct vid_data vid;
+    struct _pid pid;
+
+    s_time_t last_sample_time;
+    uint64_t prev_aperf;
+    uint64_t prev_mperf;
+    struct sample sample;
+};
+
+static struct cpudata **all_cpu_data;
+
+struct pstate_adjust_policy {
+    uint32_t sample_rate_ms;
+    uint32_t deadband;
+    uint32_t setpoint;
+    uint32_t p_gain_pct;
+    uint32_t d_gain_pct;
+    uint32_t i_gain_pct;
+};
+
+struct pstate_funcs {
+    uint32_t (*get_max)(void);
+    uint32_t (*get_min)(void);
+    uint32_t (*get_turbo)(void);
+    uint32_t (*get_scaling)(void);
+    void (*set)(struct perf_limits *, struct cpudata *, uint32_t pstate);
+    void (*get_vid)(struct cpudata *);
+};
+
+struct cpu_defaults {
+    struct pstate_adjust_policy pid_policy;
+    struct pstate_funcs funcs;
+};
+
+static struct pstate_adjust_policy pid_params;
+static struct pstate_funcs pstate_funcs;
+
+static inline void pid_reset(struct _pid *pid, uint32_t setpoint,
+                             uint32_t busy, uint32_t deadband,
+                             uint32_t integral)
+{
+    pid->setpoint = setpoint;
+    pid->deadband = deadband;
+    pid->integral = int_tofp(integral);
+    pid->last_err = int_tofp(setpoint) - int_tofp(busy);
+}
+
+static inline void pid_p_gain_set(struct _pid *pid, uint32_t percent)
+{
+    pid->p_gain = div_fp(int_tofp(percent), int_tofp(100));
+}
+
+static inline void pid_i_gain_set(struct _pid *pid, uint32_t percent)
+{
+    pid->i_gain = div_fp(int_tofp(percent), int_tofp(100));
+}
+
+static inline void pid_d_gain_set(struct _pid *pid, uint32_t percent)
+{
+    pid->d_gain = div_fp(int_tofp(percent), int_tofp(100));
+}
+
+static signed int pid_calc(struct _pid *pid, uint32_t busy)
+{
+    signed int result;
+    int32_t pterm, dterm, fp_error;
+    int32_t integral_limit;
+
+    fp_error = int_tofp(pid->setpoint) - busy;
+
+    if ( ABS(fp_error) <= int_tofp(pid->deadband) )
+        return 0;
+
+    pterm = mul_fp(pid->p_gain, fp_error);
+
+    pid->integral += fp_error;
+
+    /*
+     * We limit the integral here so that it will never
+     * get higher than 30.  This prevents it from becoming
+     * too large an input over long periods of time and allows
+     * it to get factored out sooner.
+     * The value of 30 was chosen through experimentation.
+     */
+    integral_limit = int_tofp(30);
+    if ( pid->integral > integral_limit )
+        pid->integral = integral_limit;
+    if ( pid->integral < -integral_limit )
+        pid->integral = -integral_limit;
+
+    dterm = mul_fp(pid->d_gain, fp_error - pid->last_err);
+    pid->last_err = fp_error;
+
+    result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
+    result = result + (1 << (FRAC_BITS-1));
+    return (signed int)fp_toint(result);
+}
+
+static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu)
+{
+    pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct);
+    pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct);
+    pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct);
+
+    pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0);
+}
+
+static inline void intel_pstate_reset_all_pid(void)
+{
+    uint32_t cpu;
+
+    for_each_online_cpu(cpu)
+    {
+        if ( all_cpu_data[cpu] )
+            intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
+    }
+}
+
+static inline void update_turbo_state(struct cpufreq_policy *policy)
+{
+    uint64_t misc_en;
+    struct cpudata *cpu;
+
+    cpu = all_cpu_data[policy->cpu];
+    rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
+    policy->limits.turbo_disabled =
+        (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
+            cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
+}
+
+#define BYT_TURBO_CONTROL_BIT 32
+#define BYT_MIN_PSTATE(val) (((value) >> 8) & 0x7f)
+#define BYT_MAX_PSTATE(val) (((value) >> 16) & 0x7f)
+#define BYT_TURBO_PSTATE(value) ((value) & 0x7f)
+static uint32_t byt_get_min_pstate(void)
+{
+    uint64_t value;
+
+    rdmsrl(BYT_RATIOS, value);
+    return BYT_MIN_PSTATE(val);
+}
+
+static uint32_t byt_get_max_pstate(void)
+{
+    uint64_t value;
+
+    rdmsrl(BYT_RATIOS, value);
+    return BYT_MAX_PSTATE(val);
+}
+
+static uint32_t byt_get_turbo_pstate(void)
+{
+    uint64_t value;
+
+    rdmsrl(BYT_TURBO_RATIOS, value);
+    return BYT_TURBO_PSTATE(value);
+}
+
+static void byt_set_pstate(struct perf_limits *limits,
+                struct cpudata *cpudata, uint32_t pstate)
+{
+    uint64_t val;
+    uint32_t vid_fp;
+    uint32_t vid;
+
+    val = pstate << 8;
+    if ( limits->no_turbo && !limits->turbo_disabled )
+        val |= (uint64_t)1 << BYT_TURBO_CONTROL_BIT;
+
+    vid_fp = cpudata->vid.min + mul_fp(
+        int_tofp(pstate - cpudata->pstate.min_pstate),
+        cpudata->vid.ratio);
+
+    vid_fp = clamp(vid_fp, cpudata->vid.min, cpudata->vid.max);
+    vid = ceiling_fp(vid_fp);
+
+    if ( pstate > cpudata->pstate.max_pstate )
+        vid = cpudata->vid.turbo;
+
+    val |= vid;
+
+    wrmsrl(MSR_IA32_PERF_CTL, val);
+}
+
+#define BYT_BCLK_FREQS 5
+#define TO_FREQ_TABLE_IDX_MASK 0x7
+static uint32_t byt_get_scaling(void)
+{
+    const uint32_t byt_freq_table[BYT_BCLK_FREQS] =
+                   {833, 1000, 1333, 1167, 800};
+    uint64_t value;
+    int i;
+
+    rdmsrl(MSR_FSB_FREQ, value);
+    i = value & TO_FREQ_TABLE_IDX_MASK;
+
+    BUG_ON(i > BYT_BCLK_FREQS);
+
+    return byt_freq_table[i] * 100;
+}
+
+static void byt_get_vid(struct cpudata *cpudata)
+{
+    uint64_t value;
+
+    rdmsrl(BYT_VIDS, value);
+    cpudata->vid.min = int_tofp(BYT_MIN_PSTATE(val));
+    cpudata->vid.max = int_tofp(BYT_MAX_PSTATE(val));
+    cpudata->vid.ratio = div_fp(cpudata->vid.max -
+                                cpudata->vid.min,
+                                int_tofp(cpudata->pstate.max_pstate -
+                                         cpudata->pstate.min_pstate));
+    rdmsrl(BYT_TURBO_VIDS, value);
+    cpudata->vid.turbo = BYT_TURBO_PSTATE(value);
+}
+
+#define SCALING_FACTOR 100000
+#define CORE_TURBO_CONTROL_BIT 32
+#define CORE_MIN_PSTATE(val) (((value) >> 40) & 0xff)
+#define CORE_MAX_PSTATE(val) (((value) >> 8) & 0xff)
+#define CORE_TURBO_PSTATE(value) ((value) & 0xff)
+static uint32_t core_get_min_pstate(void)
+{
+    uint64_t value;
+
+    rdmsrl(MSR_INTEL_PLATFORM_INFO, value);
+    return CORE_MIN_PSTATE(val);
+}
+
+static uint32_t core_get_max_pstate(void)
+{
+    uint64_t value;
+
+    rdmsrl(MSR_INTEL_PLATFORM_INFO, value);
+    return CORE_MAX_PSTATE(val);
+}
+
+static uint32_t core_get_turbo_pstate(void)
+{
+    uint64_t value;
+    uint32_t nont, ret;
+
+    rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value);
+    nont = core_get_max_pstate();
+    ret = CORE_TURBO_PSTATE(value);
+    if ( ret <= nont )
+        ret = nont;
+    return ret;
+}
+
+static inline uint32_t core_get_scaling(void)
+{
+    return SCALING_FACTOR;
+}
+
+static void core_set_pstate(struct perf_limits *limits,
+                            struct cpudata *cpudata, uint32_t pstate)
+{
+    uint64_t val;
+
+    val = pstate << 8;
+    if ( limits->no_turbo && !limits->turbo_disabled )
+        val |= (uint64_t)1 << CORE_TURBO_CONTROL_BIT;
+
+    wrmsrl(MSR_IA32_PERF_CTL, val);
+}
+
+static __initconst struct cpu_defaults core_params = {
+    .pid_policy = {
+        .sample_rate_ms = 10,
+        .deadband = 0,
+        .setpoint = 97,
+        .p_gain_pct = 20,
+        .d_gain_pct = 0,
+        .i_gain_pct = 0,
+    },
+    .funcs = {
+        .get_max = core_get_max_pstate,
+        .get_min = core_get_min_pstate,
+        .get_turbo = core_get_turbo_pstate,
+        .get_scaling = core_get_scaling,
+        .set = core_set_pstate,
+    },
+};
+
+static __initconst struct cpu_defaults byt_params = {
+    .pid_policy = {
+        .sample_rate_ms = 10,
+        .deadband = 0,
+        .setpoint = 97,
+        .p_gain_pct = 14,
+        .d_gain_pct = 0,
+        .i_gain_pct = 4,
+    },
+    .funcs = {
+        .get_max = byt_get_max_pstate,
+        .get_min = byt_get_min_pstate,
+        .get_turbo = byt_get_turbo_pstate,
+        .set = byt_set_pstate,
+        .get_scaling = byt_get_scaling,
+        .get_vid = byt_get_vid,
+    },
+};
+
+static void intel_pstate_get_min_max(struct perf_limits *limits,
+                                     struct cpudata *cpu, uint32_t *min,
+                                     uint32_t *max)
+{
+    uint32_t max_perf = cpu->pstate.turbo_pstate;
+    uint32_t max_perf_adj;
+    uint32_t min_perf;
+
+    if ( limits->no_turbo || limits->turbo_disabled )
+        max_perf = cpu->pstate.max_pstate;
+
+    /* performance can be limited by user through xenpm */
+    max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits->max_perf));
+    *max = clamp(max_perf_adj, cpu->pstate.min_pstate,
+                 cpu->pstate.turbo_pstate);
+    min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits->min_perf));
+    *min = clamp(min_perf, cpu->pstate.min_pstate, max_perf);
+}
+
+static void intel_pstate_set_pstate(struct cpufreq_policy *policy,
+                                    struct cpudata *cpu, uint32_t pstate)
+{
+    uint32_t max_perf, min_perf;
+    struct perf_limits *limits;
+
+    limits = &policy->limits;
+
+    update_turbo_state(policy);
+
+    if ( limits->turbo_disabled )
+        policy->turbo = CPUFREQ_TURBO_UNSUPPORTED;
+    else if ( limits->no_turbo )
+        policy->turbo = CPUFREQ_TURBO_DISABLED;
+    else
+        policy->turbo = CPUFREQ_TURBO_ENABLED;
+
+    intel_pstate_get_min_max(limits, cpu, &min_perf, &max_perf);
+
+    pstate = clamp(pstate, min_perf, max_perf);
+
+    if ( pstate == cpu->pstate.current_pstate )
+        return;
+
+    cpu->pstate.current_pstate = pstate;
+    policy->cur = pstate * SCALING_FACTOR;
+
+    pstate_funcs.set(limits, cpu, pstate);
+}
+
+static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
+{
+    struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_policy, cpu->cpu);
+
+    cpu->pstate.min_pstate = pstate_funcs.get_min();
+    cpu->pstate.max_pstate = pstate_funcs.get_max();
+    cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
+    cpu->pstate.scaling = pstate_funcs.get_scaling();
+
+    if ( pstate_funcs.get_vid )
+        pstate_funcs.get_vid(cpu);
+    intel_pstate_set_pstate(policy, cpu, cpu->pstate.min_pstate);
+}
+
+static inline void intel_pstate_calc_busy(struct cpudata *cpu)
+{
+    struct sample *sample = &cpu->sample;
+    uint64_t core_pct;
+
+    core_pct = int_tofp(sample->aperf) * int_tofp(100);
+    core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
+
+    sample->freq = fp_toint(mul_fp(int_tofp(cpu->pstate.max_pstate *
+                                   cpu->pstate.scaling / 100), core_pct));
+
+    sample->core_pct_busy = (int32_t)core_pct;
+}
+
+static inline void intel_pstate_sample(struct cpudata *cpu)
+{
+    uint64_t aperf, mperf;
+    unsigned long flags;
+
+    local_irq_save(flags);
+    rdmsrl(MSR_IA32_APERF, aperf);
+    rdmsrl(MSR_IA32_MPERF, mperf);
+    local_irq_restore(flags);
+
+    cpu->last_sample_time = cpu->sample.time;
+    cpu->sample.time = get_s_time();
+    cpu->sample.aperf = aperf;
+    cpu->sample.mperf = mperf;
+    cpu->sample.aperf -= cpu->prev_aperf;
+    cpu->sample.mperf -= cpu->prev_mperf;
+
+    intel_pstate_calc_busy(cpu);
+
+    cpu->prev_aperf = aperf;
+    cpu->prev_mperf = mperf;
+}
+
+static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
+{
+    set_timer(&cpu->timer, NOW() + MILLISECS(pid_params.sample_rate_ms));
+}
+
+static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+{
+    uint32_t core_busy, max_pstate, current_pstate, sample_ratio;
+    uint32_t duration_us;
+    uint32_t sample_time_us;
+
+    /*
+     * core_busy is the ratio of actual performance to max
+     * max_pstate is the max non turbo pstate available
+     * current_pstate was the pstate that was requested during
+     * the last sample period.
+     *
+     * We normalize core_busy, which was our actual percent
+     * performance to what we requested during the last sample
+     * period. The result will be a percentage of busy at a
+     * specified pstate.
+     */
+    core_busy = cpu->sample.core_pct_busy;
+    max_pstate = int_tofp(cpu->pstate.max_pstate);
+    current_pstate = int_tofp(cpu->pstate.current_pstate);
+    core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
+
+    /*
+     * Since we have a deferred timer, it will not fire unless
+     * we are in C0.  So, determine if the actual elapsed time
+     * is significantly greater (3x) than our sample interval. If it
+     * is, then we were idle for a long enough period of time
+     * to adjust our busyness.
+     */
+    sample_time_us = pid_params.sample_rate_ms  * 1000ULL;
+    duration_us = (uint32_t)((s_time_t)(cpu->sample.time -
+                              cpu->last_sample_time) / 1000);
+    if ( duration_us > sample_time_us * 3 )
+    {
+        sample_ratio = div_fp(int_tofp(sample_time_us),
+                              int_tofp(duration_us));
+        core_busy = mul_fp(core_busy, sample_ratio);
+    }
+
+    return core_busy;
+}
+
+static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
+{
+    int32_t busy_scaled;
+    struct _pid *pid;
+    signed int ctl;
+    struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_policy, cpu->cpu);
+
+    pid = &cpu->pid;
+    busy_scaled = intel_pstate_get_scaled_busy(cpu);
+
+    ctl = pid_calc(pid, busy_scaled);
+
+    /* Negative values of ctl increase the pstate and vice versa */
+    intel_pstate_set_pstate(policy, cpu, cpu->pstate.current_pstate - ctl);
+}
+
+static void intel_pstate_timer_func(void *data)
+{
+    struct cpudata *cpu = (struct cpudata *) data;
+
+    intel_pstate_sample(cpu);
+
+    intel_pstate_adjust_busy_pstate(cpu);
+
+    intel_pstate_set_sample_time(cpu);
+}
+
+#define ICPU(model, policy) \
+    { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
+            &policy##_params }
+
+static __initconst struct x86_cpu_id intel_pstate_cpu_ids[] __initconst = {
+    ICPU(0x2a, core),
+    ICPU(0x2d, core),
+    ICPU(0x37, byt),
+    ICPU(0x3a, core),
+    ICPU(0x3c, core),
+    ICPU(0x3d, core),
+    ICPU(0x3e, core),
+    ICPU(0x3f, core),
+    ICPU(0x45, core),
+    ICPU(0x46, core),
+    ICPU(0x47, core),
+    ICPU(0x4c, byt),
+    ICPU(0x4e, core),
+    ICPU(0x4f, core),
+    ICPU(0x56, core),
+    {}
+};
+
+static int intel_pstate_init_cpu(unsigned int cpunum)
+{
+    struct cpudata *cpu;
+    s_time_t expires;
+
+    if ( !all_cpu_data[cpunum] )
+        all_cpu_data[cpunum] = xzalloc(struct cpudata);
+    if ( !all_cpu_data[cpunum] )
+        return -ENOMEM;
+
+    cpu = all_cpu_data[cpunum];
+
+    cpu->cpu = cpunum;
+    intel_pstate_get_cpu_pstates(cpu);
+
+    init_timer(&cpu->timer, intel_pstate_timer_func, cpu, cpunum);
+    expires = NOW() + MILLISECS(10);
+
+    intel_pstate_busy_pid_reset(cpu);
+    intel_pstate_sample(cpu);
+
+    set_timer(&cpu->timer, expires);
+
+    return 0;
+}
+
+static int intel_pstate_set_policy(struct cpufreq_policy *policy)
+{
+    struct perf_limits *limits = &policy->limits;
+    uint32_t cur_gov = policy->internal_gov->cur_gov;
+
+    if ( !policy->cpuinfo.max_freq )
+        return -ENODEV;
+
+    switch ( cur_gov )
+    {
+    case INTERNAL_GOV_PERFORMANCE:
+        limits->no_turbo = 0;
+        limits->max_perf_pct = 100;
+        limits->max_perf = int_tofp(1);
+        limits->min_perf_pct = 100;
+        limits->min_perf = int_tofp(1);
+        break;
+    case INTERNAL_GOV_POWERSAVE:
+        limits->min_perf = div_fp(int_tofp(limits->min_policy_pct),
+                                  int_tofp(100));
+        limits->max_perf = limits->min_perf;
+        limits->min_perf_pct = limits->min_policy_pct;
+        limits->max_perf_pct = limits->min_perf_pct;
+        break;
+    case INTERNAL_GOV_USERSPACE:
+        limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
+                                  int_tofp(100));
+        limits->min_perf = limits->max_perf;
+        limits->min_perf_pct = limits->max_perf_pct;
+        break;
+    case INTERNAL_GOV_ONDEMAND:
+    default:
+        limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
+                                  int_tofp(100));
+        limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
+                                  int_tofp(100));
+        cur_gov = INTERNAL_GOV_ONDEMAND;
+        break;
+    }
+
+    return 0;
+}
+
+static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
+{
+    uint32_t cur_gov = policy->internal_gov->cur_gov;
+
+    cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+                                 policy->cpuinfo.max_freq);
+
+    switch( cur_gov )
+    {
+    case INTERNAL_GOV_PERFORMANCE:
+    case INTERNAL_GOV_POWERSAVE:
+    case INTERNAL_GOV_USERSPACE:
+    case INTERNAL_GOV_ONDEMAND:
+        return 0;
+    default:
+        return -EINVAL;
+    }
+}
+
+static void intel_pstate_internal_gov_release(struct internal_governor *gov)
+{
+    xfree(gov->avail_gov);
+    xfree(gov);
+}
+
+static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+{
+    int cpu_num = policy->cpu;
+    struct cpudata *cpu = all_cpu_data[cpu_num];
+
+    kill_timer(&all_cpu_data[cpu_num]->timer);
+
+    intel_pstate_set_pstate(policy, cpu, cpu->pstate.min_pstate);
+
+    intel_pstate_internal_gov_release(policy->internal_gov);
+
+    return 0;
+}
+
+static int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy *policy)
+{
+    struct cpudata *cpu = all_cpu_data[policy->cpu];
+    struct perf_limits *limits = &policy->limits;
+
+    update_turbo_state(policy);
+    if ( limits->turbo_disabled )
+    {
+        printk("Turbo disabled by BIOS or not supported on CPU\n");
+        return -EINVAL;
+    }
+    limits->no_turbo = policy->turbo == CPUFREQ_TURBO_ENABLED ? 0 : 1;
+
+    if ( limits->no_turbo )
+        policy->cpuinfo.max_freq = cpu->pstate.max_pstate *
+                                   cpu->pstate.scaling;
+    else
+        policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate *
+                                   cpu->pstate.scaling;
+
+    policy->max = clamp(policy->max, policy->cpuinfo.min_freq,
+                        policy->cpuinfo.max_freq);
+
+    return 0;
+}
+
+static uint32_t get_turbo_pct(struct cpudata *cpu)
+{
+    uint32_t total, no_turbo, turbo_pct;
+    uint32_t turbo_fp;
+
+    total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
+    no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
+    turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total));
+    turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
+    return turbo_pct;
+}
+
+#define INTEL_PSTATE_GOV_NUM 4
+static struct internal_governor* intel_pstate_internal_gov_init(void)
+{
+    unsigned int i = 0;
+    struct internal_governor *gov;
+    char *avail_gov;
+
+    gov = xzalloc(struct internal_governor);
+    if ( !gov )
+        return NULL;
+    avail_gov = xzalloc_array(char,
+            INTEL_PSTATE_GOV_NUM * CPUFREQ_NAME_LEN);
+    if ( !avail_gov )
+        return NULL;
+
+    gov->avail_gov = avail_gov;
+
+    i += scnprintf(&avail_gov[0], CPUFREQ_NAME_LEN, "%s ", "performance");
+    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "powersave");
+    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "userspace");
+    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "ondemand");
+    avail_gov[i-1] = '\0';
+    gov->gov_num = INTEL_PSTATE_GOV_NUM;
+    gov->cur_gov = INTERNAL_GOV_ONDEMAND;
+    return gov;
+}
+
+static int intel_pstate_cpu_setup(struct cpufreq_policy *policy)
+{
+    struct cpudata *cpu;
+    struct perf_limits *limits = &policy->limits;
+    int rc;
+
+    rc = intel_pstate_init_cpu(policy->cpu);
+    if ( rc )
+        return rc;
+
+    policy->internal_gov = intel_pstate_internal_gov_init();
+    if ( !policy->internal_gov )
+        return -ENOMEM;
+
+    cpu = all_cpu_data[policy->cpu];
+    policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
+    policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+
+    /* cpuinfo and default policy values */
+    policy->cpuinfo.min_freq = cpu->pstate.min_pstate *
+                               cpu->pstate.scaling;
+    policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate *
+                               cpu->pstate.scaling;
+    policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+    cpumask_set_cpu(policy->cpu, policy->cpus);
+
+    limits->no_turbo = 0;
+    limits->turbo_disabled = 0;
+    limits->turbo_pct = get_turbo_pct(cpu);
+    limits->min_policy_pct = (policy->min * 100) /
+                             policy->cpuinfo.max_freq;
+    limits->min_policy_pct = clamp_t(uint32_t,
+                                     limits->min_policy_pct, 0, 100);
+    limits->max_policy_pct = (policy->max * 100) /
+                             policy->cpuinfo.max_freq;
+    limits->max_policy_pct = clamp_t(uint32_t,
+                                     limits->max_policy_pct, 0, 100);
+    limits->max_perf_pct   = limits->max_policy_pct;
+    limits->min_perf_pct   = limits->min_policy_pct;
+
+    return 0;
+}
+
+static struct cpufreq_driver intel_pstate_driver = {
+    .verify       = intel_pstate_verify_policy,
+    .setpolicy    = intel_pstate_set_policy,
+    .init         = intel_pstate_cpu_setup,
+    .exit         = intel_pstate_cpu_exit,
+    .update       = intel_pstate_turbo_update,
+    .name         = "intel_pstate",
+};
+
+static int intel_pstate_msrs_not_valid(void)
+{
+    if ( !pstate_funcs.get_max() ||
+         !pstate_funcs.get_min() ||
+         !pstate_funcs.get_turbo() )
+        return -ENODEV;
+
+    return 0;
+}
+
+static void __init copy_pid_params(struct pstate_adjust_policy *policy)
+{
+    pid_params.sample_rate_ms = policy->sample_rate_ms;
+    pid_params.p_gain_pct = policy->p_gain_pct;
+    pid_params.i_gain_pct = policy->i_gain_pct;
+    pid_params.d_gain_pct = policy->d_gain_pct;
+    pid_params.deadband = policy->deadband;
+    pid_params.setpoint = policy->setpoint;
+}
+
+static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
+{
+    pstate_funcs.get_max   = funcs->get_max;
+    pstate_funcs.get_min   = funcs->get_min;
+    pstate_funcs.get_turbo = funcs->get_turbo;
+    pstate_funcs.get_scaling = funcs->get_scaling;
+    pstate_funcs.set       = funcs->set;
+    pstate_funcs.get_vid   = funcs->get_vid;
+}
+
+int __init intel_pstate_init(void)
+{
+    int cpu, rc = 0;
+    const struct x86_cpu_id *id;
+    struct cpu_defaults *cpu_info;
+    static bool_t load;
+    boolean_param("intel_pstate", load);
+
+    if ( !load )
+        return -ENODEV;
+
+    id = x86_match_cpu(intel_pstate_cpu_ids);
+    if ( !id )
+        return -ENODEV;
+
+    cpu_info = (struct cpu_defaults *)id->driver_data;
+
+    copy_pid_params(&cpu_info->pid_policy);
+    copy_cpu_funcs(&cpu_info->funcs);
+
+    if ( intel_pstate_msrs_not_valid() )
+        return -ENODEV;
+
+    all_cpu_data = xzalloc_array(struct cpudata *, NR_CPUS);
+    if ( !all_cpu_data )
+        return -ENOMEM;
+
+    rc = cpufreq_register_driver(&intel_pstate_driver);
+    if ( rc )
+        goto out;
+
+    return rc;
+out:
+    for_each_online_cpu(cpu)
+    {
+        if ( all_cpu_data[cpu] )
+        {
+            kill_timer(&all_cpu_data[cpu]->timer);
+            xfree(all_cpu_data[cpu]);
+        }
+    }
+    xfree(all_cpu_data);
+    return -ENODEV;
+}
diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h
index 8947368..ef6643a 100644
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -107,6 +107,12 @@ struct cpufreq_freqs {
  *                          CPUFREQ GOVERNORS                        *
  *********************************************************************/
 
+/* Please add internal governors here */
+#define INTERNAL_GOV_POWERSAVE        (1)
+#define INTERNAL_GOV_PERFORMANCE      (2)
+#define INTERNAL_GOV_USERSPACE        (3)
+#define INTERNAL_GOV_ONDEMAND         (4)
+
 #define CPUFREQ_GOV_START  1
 #define CPUFREQ_GOV_STOP   2
 #define CPUFREQ_GOV_LIMITS 3
diff --git a/xen/include/asm-x86/cpufreq.h b/xen/include/asm-x86/cpufreq.h
new file mode 100644
index 0000000..afc72df
--- /dev/null
+++ b/xen/include/asm-x86/cpufreq.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_CPUFREQ_H
+#define _ASM_X86_CPUFREQ_H
+
+/*
+ *  Copyright (C) 2015, Intel Corporation.
+ *  Wei Wang <wei.w.wang@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+/*
+ * Maximum transition latency is in nanoseconds - if it's unknown,
+ * CPUFREQ_ETERNAL shall be used.
+ */
+#define CPUFREQ_ETERNAL        (-1)
+
+#endif /* _ASM_X86_CPUFREQ_H */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 65c1d02..b9c4595 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -52,6 +52,8 @@
 #define MSR_IA32_MCG_STATUS		0x0000017a
 #define MSR_IA32_MCG_CTL		0x0000017b
 
+#define MSR_NHM_TURBO_RATIO_LIMIT	0x000001ad
+
 #define MSR_IA32_PEBS_ENABLE		0x000003f1
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
@@ -320,6 +322,7 @@
 #define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18)
 #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID  (1<<22)
 #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL<<38)
 
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
 #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
-- 
1.9.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread
- * Re: [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver
  2015-10-28  3:21 ` [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver Wei Wang
@ 2015-11-20 14:09   ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2015-11-20 14:09 UTC (permalink / raw)
  To: Wei Wang; +Cc: andrew.cooper3, wei.liu2, xen-devel
>>> On 28.10.15 at 04:21, <wei.w.wang@intel.com> wrote:
> --- /dev/null
> +++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c
> @@ -0,0 +1,882 @@
> +#include <xen/kernel.h>
> +#include <xen/types.h>
> +#include <xen/init.h>
> +#include <xen/bitmap.h>
> +#include <xen/cpumask.h>
> +#include <xen/timer.h>
> +#include <asm/msr.h>
> +#include <asm/msr-index.h>
> +#include <asm/processor.h>
> +#include <asm/div64.h>
> +#include <asm/cpufreq.h>
> +#include <acpi/cpufreq/cpufreq.h>
> +
> +#define BYT_RATIOS       0x66a
> +#define BYT_VIDS         0x66b
> +#define BYT_TURBO_RATIOS 0x66c
> +#define BYT_TURBO_VIDS   0x66d
Considering these are MSR names, the acronym MSR should appear in
them.
> +#define FRAC_BITS 8
> +#define int_tofp(X) ((uint64_t)(X) << FRAC_BITS)
> +#define fp_toint(X) ((X) >> FRAC_BITS)
> +
> +static inline uint32_t mul_fp(uint32_t x, uint32_t y)
> +{
> +    return ((uint64_t)x * (uint64_t)y) >> FRAC_BITS;
Casting just one side of the * would suffice. Also please don't
open-code fp_to_int() ...
> +static inline uint32_t div_fp(uint32_t x, uint32_t y)
> +{
> +    return div_s64((uint64_t)x << FRAC_BITS, y);
... or int_to_fp() (for this one it would be a different thing if you
removed the - apparently pointless - cast from the macro).
> +static inline uint32_t ceiling_fp(uint32_t x)
> +{
> +    uint32_t mask, ret;
> +
> +    ret = fp_toint(x);
> +    mask = (1 << FRAC_BITS) - 1;
> +    if ( x & mask )
> +        ret += 1;
> +    return ret;
> +}
Blank line before final return statement please.
Code further down suggests there may be a sign in these fixed
point numbers, yet here you converted _everything_ to uintNN_t.
Did you go too far? What sense does e.g. div_s64() on a uint64_t
make?
> +struct cpudata {
> +    int cpu;
unsigned int (but I wonder if this is really needed, see the comment
at the allocation site)
> +static signed int pid_calc(struct _pid *pid, uint32_t busy)
> +{
> +    signed int result;
> +    int32_t pterm, dterm, fp_error;
> +    int32_t integral_limit;
> +
> +    fp_error = int_tofp(pid->setpoint) - busy;
> +    if ( ABS(fp_error) <= int_tofp(pid->deadband) )
int_tofp() currently returning an unsigned value - why ABS()?
> +static inline void intel_pstate_reset_all_pid(void)
> +{
> +    uint32_t cpu;
unsigned int
> +static inline void update_turbo_state(struct cpufreq_policy *policy)
> +{
> +    uint64_t misc_en;
> +    struct cpudata *cpu;
> +
> +    cpu = all_cpu_data[policy->cpu];
> +    rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
> +    policy->limits.turbo_disabled =
> +        (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
> +            cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
At least the & needs parenthesization.
> +static void byt_set_pstate(struct perf_limits *limits,
> +                struct cpudata *cpudata, uint32_t pstate)
> +{
> +    uint64_t val;
> +    uint32_t vid_fp;
> +    uint32_t vid;
> +
> +    val = pstate << 8;
Didn't we agree not to have any unnamed literals?
> +    if ( limits->no_turbo && !limits->turbo_disabled )
> +        val |= (uint64_t)1 << BYT_TURBO_CONTROL_BIT;
Perhaps the sense of that bit isn't what one would imply at first
glance, but the combination of conditions in the if() looks certainly
odd. Please confirm this is really intended to be this way (which
should also include pointing out what meaning the bit being on
has).
> +    vid_fp = cpudata->vid.min + mul_fp(
> +        int_tofp(pstate - cpudata->pstate.min_pstate),
> +        cpudata->vid.ratio);
> +
> +    vid_fp = clamp(vid_fp, cpudata->vid.min, cpudata->vid.max);
> +    vid = ceiling_fp(vid_fp);
> +
> +    if ( pstate > cpudata->pstate.max_pstate )
> +        vid = cpudata->vid.turbo;
The prior calculation of vid (and vid_fp) is completely pointless in
this case. Also - what if limits->no_turbo or limits->turbo_disabled?
> +    val |= vid;
> +
> +    wrmsrl(MSR_IA32_PERF_CTL, val);
> +}
> +
> +#define BYT_BCLK_FREQS 5
> +#define TO_FREQ_TABLE_IDX_MASK 0x7
> +static uint32_t byt_get_scaling(void)
Missing blank line.
> +{
> +    const uint32_t byt_freq_table[BYT_BCLK_FREQS] =
static
> +                   {833, 1000, 1333, 1167, 800};
> +    uint64_t value;
> +    int i;
> +
> +    rdmsrl(MSR_FSB_FREQ, value);
> +    i = value & TO_FREQ_TABLE_IDX_MASK;
With this calculation i is hardly a signed quantity.
> +#define CORE_MIN_PSTATE(val) (((value) >> 40) & 0xff)
> +#define CORE_MAX_PSTATE(val) (((value) >> 8) & 0xff)
> +#define CORE_TURBO_PSTATE(value) ((value) & 0xff)
> +static uint32_t core_get_min_pstate(void)
> +{
> +    uint64_t value;
> +
> +    rdmsrl(MSR_INTEL_PLATFORM_INFO, value);
> +    return CORE_MIN_PSTATE(val);
The fact that (presumably) don't get a build error here is because
the mix of variable names here happens to invert the mix of names
in the macros ahead of the function.
> +static void core_set_pstate(struct perf_limits *limits,
> +                            struct cpudata *cpudata, uint32_t pstate)
> +{
> +    uint64_t val;
> +
> +    val = pstate << 8;
> +    if ( limits->no_turbo && !limits->turbo_disabled )
> +        val |= (uint64_t)1 << CORE_TURBO_CONTROL_BIT;
> +
> +    wrmsrl(MSR_IA32_PERF_CTL, val);
> +}
> +
> +static __initconst struct cpu_defaults core_params = {
static const struct cpu_defaults __initconst core_params = {
> +static inline void intel_pstate_calc_busy(struct cpudata *cpu)
> +{
> +    struct sample *sample = &cpu->sample;
> +    uint64_t core_pct;
> +
> +    core_pct = int_tofp(sample->aperf) * int_tofp(100);
> +    core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
> +
> +    sample->freq = fp_toint(mul_fp(int_tofp(cpu->pstate.max_pstate *
> +                                   cpu->pstate.scaling / 100), core_pct));
> +
> +    sample->core_pct_busy = (int32_t)core_pct;
Pointless cast.
> +static inline void intel_pstate_sample(struct cpudata *cpu)
> +{
> +    uint64_t aperf, mperf;
> +    unsigned long flags;
> +
> +    local_irq_save(flags);
> +    rdmsrl(MSR_IA32_APERF, aperf);
> +    rdmsrl(MSR_IA32_MPERF, mperf);
> +    local_irq_restore(flags);
> +
> +    cpu->last_sample_time = cpu->sample.time;
> +    cpu->sample.time = get_s_time();
> +    cpu->sample.aperf = aperf;
> +    cpu->sample.mperf = mperf;
> +    cpu->sample.aperf -= cpu->prev_aperf;
> +    cpu->sample.mperf -= cpu->prev_mperf;
Can these two please be done in a single statement each?
> +    intel_pstate_calc_busy(cpu);
> +
> +    cpu->prev_aperf = aperf;
> +    cpu->prev_mperf = mperf;
And these two be moved up (intel_pstate_calc_busy() doesn't
appear to be using the updated fields)?
> +static void intel_pstate_timer_func(void *data)
> +{
> +    struct cpudata *cpu = (struct cpudata *) data;
Pointless cast.
> +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> +{
> +    int cpu_num = policy->cpu;
unsigned int
> +    struct cpudata *cpu = all_cpu_data[cpu_num];
> +
> +    kill_timer(&all_cpu_data[cpu_num]->timer);
> +
> +    intel_pstate_set_pstate(policy, cpu, cpu->pstate.min_pstate);
Why min and not max? Or whatever the CPU was in when the
governor initialized the CPU?
> +static int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy *policy)
> +{
> +    struct cpudata *cpu = all_cpu_data[policy->cpu];
> +    struct perf_limits *limits = &policy->limits;
> +
> +    update_turbo_state(policy);
> +    if ( limits->turbo_disabled )
> +    {
> +        printk("Turbo disabled by BIOS or not supported on CPU\n");
> +        return -EINVAL;
> +    }
I'm afraid this message may get printed more often than you'd
want (it should be printed at most once during any session).
> +    limits->no_turbo = policy->turbo == CPUFREQ_TURBO_ENABLED ? 0 : 1;
limits->no_turbo = policy->turbo != CPUFREQ_TURBO_ENABLED;
> +#define INTEL_PSTATE_GOV_NUM 4
> +static struct internal_governor* intel_pstate_internal_gov_init(void)
> +{
> +    unsigned int i = 0;
> +    struct internal_governor *gov;
> +    char *avail_gov;
> +
> +    gov = xzalloc(struct internal_governor);
> +    if ( !gov )
> +        return NULL;
> +    avail_gov = xzalloc_array(char,
> +            INTEL_PSTATE_GOV_NUM * CPUFREQ_NAME_LEN);
> +    if ( !avail_gov )
> +        return NULL;
> +
> +    gov->avail_gov = avail_gov;
> +
> +    i += scnprintf(&avail_gov[0], CPUFREQ_NAME_LEN, "%s ", "performance");
i = ... (and the initializer dropped above).
> +    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "powersave");
> +    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "userspace");
> +    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "ondemand");
> +    avail_gov[i-1] = '\0';
Coding style.
But overall I wonder whether this couldn't be done in a less
convoluted way (e.g. a single strlcpy(), or a strlcpy() followed by
three strlcat()s). In no case is the indirection via %s warranted.
> +    gov->gov_num = INTEL_PSTATE_GOV_NUM;
> +    gov->cur_gov = INTERNAL_GOV_ONDEMAND;
Shouldn't this depend on the "cpufreq=" option setting (if any)?
> +static int intel_pstate_cpu_setup(struct cpufreq_policy *policy)
> +{
> +    struct cpudata *cpu;
> +    struct perf_limits *limits = &policy->limits;
> +    int rc;
> +
> +    rc = intel_pstate_init_cpu(policy->cpu);
Please use reasonably consistent naming (compare the name of the
function we're in and that of the called function).
> +    if ( rc )
> +        return rc;
> +
> +    policy->internal_gov = intel_pstate_internal_gov_init();
> +    if ( !policy->internal_gov )
> +        return -ENOMEM;
> +
> +    cpu = all_cpu_data[policy->cpu];
> +    policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
> +    policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
DYM .max_pstate here?
> +    /* cpuinfo and default policy values */
> +    policy->cpuinfo.min_freq = cpu->pstate.min_pstate *
> +                               cpu->pstate.scaling;
> +    policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate *
> +                               cpu->pstate.scaling;
Same here? Or if intentional, then I think a comment is warranted,
and policy->turbo would then also seem to need initializing to 1
(that actually might eliminate the need for a separate comment).
> +    limits->min_policy_pct = clamp_t(uint32_t,
> +                                     limits->min_policy_pct, 0, 100);
> +    limits->max_policy_pct = (policy->max * 100) /
> +                             policy->cpuinfo.max_freq;
> +    limits->max_policy_pct = clamp_t(uint32_t,
> +                                     limits->max_policy_pct, 0, 100);
These appear to be the only two uses of clamp_t(), which can be
easily made just clamp(). Hence I don't see the need for introducing
clamp_t() in the first place.
> +static void __init copy_pid_params(struct pstate_adjust_policy *policy)
const
> +{
> +    pid_params.sample_rate_ms = policy->sample_rate_ms;
> +    pid_params.p_gain_pct = policy->p_gain_pct;
> +    pid_params.i_gain_pct = policy->i_gain_pct;
> +    pid_params.d_gain_pct = policy->d_gain_pct;
> +    pid_params.deadband = policy->deadband;
> +    pid_params.setpoint = policy->setpoint;
Why not just
    pid_params = *policy;
?
> +static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
const
> +{
> +    pstate_funcs.get_max   = funcs->get_max;
> +    pstate_funcs.get_min   = funcs->get_min;
> +    pstate_funcs.get_turbo = funcs->get_turbo;
> +    pstate_funcs.get_scaling = funcs->get_scaling;
> +    pstate_funcs.set       = funcs->set;
> +    pstate_funcs.get_vid   = funcs->get_vid;
And again
    pstate_funcs = *funcs;
?
> +int __init intel_pstate_init(void)
> +{
> +    int cpu, rc = 0;
unsigned int cpu;
int rc;
> +    const struct x86_cpu_id *id;
> +    struct cpu_defaults *cpu_info;
> +    static bool_t load;
__initdata (and I'm sure I said so before).
> +    boolean_param("intel_pstate", load);
> +
> +    if ( !load )
> +        return -ENODEV;
> +
> +    id = x86_match_cpu(intel_pstate_cpu_ids);
> +    if ( !id )
> +        return -ENODEV;
> +
> +    cpu_info = (struct cpu_defaults *)id->driver_data;
This casts away const-ness, which is why casts should be avoided
wherever possible (and the more when they're pointless like this one).
> +    copy_pid_params(&cpu_info->pid_policy);
> +    copy_cpu_funcs(&cpu_info->funcs);
> +
> +    if ( intel_pstate_msrs_not_valid() )
> +        return -ENODEV;
This reads as if the function returned a boolean. Either make it so
(and the perhaps invert the sense and drop the "not" from the
name) or use the function's return value (in which the function
should perhaps be named ..._validate or some such).
> +    all_cpu_data = xzalloc_array(struct cpudata *, NR_CPUS);
Why NR_CPUS? Can't this be a per-CPU item anyway?
> +    if ( !all_cpu_data )
> +        return -ENOMEM;
> +
> +    rc = cpufreq_register_driver(&intel_pstate_driver);
> +    if ( rc )
> +        goto out;
> +
> +    return rc;
Looks like this could be "return 0", albeit ...
> +out:
... just a single path leading here (and the comment below taken
into account) just doing the cleanup in the if() body above would
seem more reasonable. Otherwise - labels indented by at least one
space please.
> +    for_each_online_cpu(cpu)
> +    {
> +        if ( all_cpu_data[cpu] )
> +        {
> +            kill_timer(&all_cpu_data[cpu]->timer);
> +            xfree(all_cpu_data[cpu]);
> +        }
> +    }
Why is all of this needed? If cpufreq_register_driver() (or one of its
descendants) fails, it should clean up after itself.
> --- /dev/null
> +++ b/xen/include/asm-x86/cpufreq.h
> @@ -0,0 +1,31 @@
> +#ifndef _ASM_X86_CPUFREQ_H
> +#define _ASM_X86_CPUFREQ_H
> +
> +/*
> + *  Copyright (C) 2015, Intel Corporation.
> + *  Wei Wang <wei.w.wang@intel.com>
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; If not, see <http://www.gnu.org/licenses/>.
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + */
> +
> +/*
> + * Maximum transition latency is in nanoseconds - if it's unknown,
> + * CPUFREQ_ETERNAL shall be used.
> + */
> +#define CPUFREQ_ETERNAL        (-1)
So what is x86-specific about this #define? I.e. why can't this go
into an existing header?
Jan
^ permalink raw reply	[flat|nested] 14+ messages in thread
 
- * [PATCH v6 4/6] x86/intel_pstate: add a booting param to select the driver to load
  2015-10-28  3:21 [PATCH v6 0/6] Porting the intel_pstate driver to Xen Wei Wang
                   ` (2 preceding siblings ...)
  2015-10-28  3:21 ` [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver Wei Wang
@ 2015-10-28  3:21 ` Wei Wang
  2015-11-20 14:22   ` Jan Beulich
  2015-10-28  3:21 ` [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c Wei Wang
  2015-10-28  3:21 ` [PATCH v6 6/6] tools: enable xenpm to control the intel_pstate driver Wei Wang
  5 siblings, 1 reply; 14+ messages in thread
From: Wei Wang @ 2015-10-28  3:21 UTC (permalink / raw)
  To: jbeulich, andrew.cooper3, xen-devel, wei.liu2; +Cc: Wei Wang
By default, the old P-state driver (acpi-freq) is used. Adding
"intel_pstate" to the Xen booting param list to enable the
use of intel_pstate. However, if intel_pstate is enabled on a
machine which does not support the driver (e.g. Nehalem), the
old P-state driver will be loaded due to the failure loading of
intel_pstate.
Also, adding the intel_pstate booting parameter to
xen-command-line.markdown.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 changes in v6:
 1) move the declaration of intel_pstate_init() to this patch.
 docs/misc/xen-command-line.markdown |  7 +++++++
 xen/arch/x86/acpi/cpufreq/cpufreq.c | 15 ++++++++++-----
 xen/include/asm-x86/cpufreq.h       |  2 ++
 3 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 416e559..e57a23a 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -858,6 +858,13 @@ debug hypervisor only).
 ### idle\_latency\_factor
 > `= <integer>`
 
+### intel\_pstate
+> `= <boolean>`
+
+> Default: `false`
+
+Enable the loading of the intel pstate driver.
+
 ### ioapic\_ack
 > `= old | new`
 
diff --git a/xen/arch/x86/acpi/cpufreq/cpufreq.c b/xen/arch/x86/acpi/cpufreq/cpufreq.c
index a2ba0db..d59f251 100644
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c
@@ -40,6 +40,7 @@
 #include <asm/processor.h>
 #include <asm/percpu.h>
 #include <asm/cpufeature.h>
+#include <asm/cpufreq.h>
 #include <acpi/acpi.h>
 #include <acpi/cpufreq/cpufreq.h>
 
@@ -647,11 +648,15 @@ static int __init cpufreq_driver_init(void)
 {
     int ret = 0;
 
-    if ((cpufreq_controller == FREQCTL_xen) &&
-        (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL))
-        ret = cpufreq_register_driver(&acpi_cpufreq_driver);
-    else if ((cpufreq_controller == FREQCTL_xen) &&
-        (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
+    if ( (cpufreq_controller == FREQCTL_xen) &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+    {
+        ret = intel_pstate_init();
+        if ( ret )
+            ret = cpufreq_register_driver(&acpi_cpufreq_driver);
+    }
+    else if ( (cpufreq_controller == FREQCTL_xen) &&
+              (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
         ret = powernow_register_driver();
 
     return ret;
diff --git a/xen/include/asm-x86/cpufreq.h b/xen/include/asm-x86/cpufreq.h
index afc72df..3ff516d 100644
--- a/xen/include/asm-x86/cpufreq.h
+++ b/xen/include/asm-x86/cpufreq.h
@@ -22,6 +22,8 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
+extern int intel_pstate_init(void);
+
 /*
  * Maximum transition latency is in nanoseconds - if it's unknown,
  * CPUFREQ_ETERNAL shall be used.
-- 
1.9.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread
- * Re: [PATCH v6 4/6] x86/intel_pstate: add a booting param to select the driver to load
  2015-10-28  3:21 ` [PATCH v6 4/6] x86/intel_pstate: add a booting param to select the driver to load Wei Wang
@ 2015-11-20 14:22   ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2015-11-20 14:22 UTC (permalink / raw)
  To: Wei Wang; +Cc: andrew.cooper3, wei.liu2, xen-devel
>>> On 28.10.15 at 04:21, <wei.w.wang@intel.com> wrote:
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
with two remarks:
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -858,6 +858,13 @@ debug hypervisor only).
>  ### idle\_latency\_factor
>  > `= <integer>`
>  
> +### intel\_pstate
> +> `= <boolean>`
> +
> +> Default: `false`
> +
> +Enable the loading of the intel pstate driver.
"Loading" is probably not the right term, as it gets always loaded.
Perhaps just drop "the loading of"?
> --- a/xen/include/asm-x86/cpufreq.h
> +++ b/xen/include/asm-x86/cpufreq.h
> @@ -22,6 +22,8 @@
>   * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>   */
>  
> +extern int intel_pstate_init(void);
Considering that I questioned the creation of this header in the
previous patch, the addition of this declaration should not result
in it now being created here. In fact if any new header was
needed for this declaration, it would probably better live in
xen/arch/x86/acpi/cpufreq/.
Moving this to a _suitable_ existing header does not void the ack
given.
Jan
^ permalink raw reply	[flat|nested] 14+ messages in thread 
 
- * [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c
  2015-10-28  3:21 [PATCH v6 0/6] Porting the intel_pstate driver to Xen Wei Wang
                   ` (3 preceding siblings ...)
  2015-10-28  3:21 ` [PATCH v6 4/6] x86/intel_pstate: add a booting param to select the driver to load Wei Wang
@ 2015-10-28  3:21 ` Wei Wang
  2015-10-28 11:18   ` Wei Liu
  2015-11-20 14:47   ` Jan Beulich
  2015-10-28  3:21 ` [PATCH v6 6/6] tools: enable xenpm to control the intel_pstate driver Wei Wang
  5 siblings, 2 replies; 14+ messages in thread
From: Wei Wang @ 2015-10-28  3:21 UTC (permalink / raw)
  To: jbeulich, andrew.cooper3, xen-devel, wei.liu2; +Cc: Wei Wang
Add support in the pmstat.c so that the xenpm tool can request to
access the intel_pstate driver.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 changes in v6:
 1) add the NON_INTERNAL_GOV macro to replace literal 0;
 2) code consolidation (e.g. merging some code into if/else, as required in v5);
 3) somewhere, change to use clamp, instead of clamp_t;
 4) xen_perf_alias, instead of perf_alias.
 tools/libxc/include/xenctrl.h      |  20 ++--
 tools/libxc/xc_pm.c                |  16 ++--
 tools/misc/xenpm.c                 |   4 +-
 xen/drivers/acpi/pmstat.c          | 183 +++++++++++++++++++++++++++++++------
 xen/include/acpi/cpufreq/cpufreq.h |   2 +
 xen/include/public/sysctl.h        |  29 ++++--
 6 files changed, 198 insertions(+), 56 deletions(-)
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 3bfa00b..590eb72 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2270,6 +2270,17 @@ struct xc_get_cpufreq_para {
     uint32_t cpu_num;
     uint32_t freq_num;
     uint32_t gov_num;
+    int32_t turbo_enabled;
+
+    uint32_t cpuinfo_cur_freq;
+    uint32_t cpuinfo_max_freq;
+    uint32_t cpuinfo_min_freq;
+    uint32_t scaling_cur_freq;
+
+    uint32_t scaling_turbo_pct;
+    uint32_t scaling_max_perf;
+    uint32_t scaling_min_perf;
+    enum xen_perf_alias perf_alias;
 
     /* for all governors */
     /* OUT variable */
@@ -2278,22 +2289,13 @@ struct xc_get_cpufreq_para {
     char     *scaling_available_governors;
     char scaling_driver[CPUFREQ_NAME_LEN];
 
-    uint32_t cpuinfo_cur_freq;
-    uint32_t cpuinfo_max_freq;
-    uint32_t cpuinfo_min_freq;
-    uint32_t scaling_cur_freq;
-
     char scaling_governor[CPUFREQ_NAME_LEN];
-    uint32_t scaling_max_freq;
-    uint32_t scaling_min_freq;
 
     /* for specific governor */
     union {
         xc_userspace_t userspace;
         xc_ondemand_t ondemand;
     } u;
-
-    int32_t turbo_enabled;
 };
 
 int xc_get_cpufreq_para(xc_interface *xch, int cpuid,
diff --git a/tools/libxc/xc_pm.c b/tools/libxc/xc_pm.c
index 5b38cf1..6a16e8a 100644
--- a/tools/libxc/xc_pm.c
+++ b/tools/libxc/xc_pm.c
@@ -260,13 +260,15 @@ int xc_get_cpufreq_para(xc_interface *xch, int cpuid,
     }
     else
     {
-        user_para->cpuinfo_cur_freq = sys_para->cpuinfo_cur_freq;
-        user_para->cpuinfo_max_freq = sys_para->cpuinfo_max_freq;
-        user_para->cpuinfo_min_freq = sys_para->cpuinfo_min_freq;
-        user_para->scaling_cur_freq = sys_para->scaling_cur_freq;
-        user_para->scaling_max_freq = sys_para->scaling_max_freq;
-        user_para->scaling_min_freq = sys_para->scaling_min_freq;
-        user_para->turbo_enabled    = sys_para->turbo_enabled;
+        user_para->cpuinfo_cur_freq  = sys_para->cpuinfo_cur_freq;
+        user_para->cpuinfo_max_freq  = sys_para->cpuinfo_max_freq;
+        user_para->cpuinfo_min_freq  = sys_para->cpuinfo_min_freq;
+        user_para->scaling_cur_freq  = sys_para->scaling_cur_freq;
+        user_para->scaling_max_perf  = sys_para->scaling_max_perf;
+        user_para->scaling_min_perf  = sys_para->scaling_min_perf;
+        user_para->scaling_turbo_pct = sys_para->scaling_turbo_pct;
+        user_para->perf_alias        = sys_para->perf_alias;
+        user_para->turbo_enabled     = sys_para->turbo_enabled;
 
         memcpy(user_para->scaling_driver,
                 sys_para->scaling_driver, CPUFREQ_NAME_LEN);
diff --git a/tools/misc/xenpm.c b/tools/misc/xenpm.c
index 08f2242..5944fdb 100644
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -705,8 +705,8 @@ static void print_cpufreq_para(int cpuid, struct xc_get_cpufreq_para *p_cpufreq)
     printf("\n");
 
     printf("scaling frequency    : max [%u] min [%u] cur [%u]\n",
-           p_cpufreq->scaling_max_freq,
-           p_cpufreq->scaling_min_freq,
+           p_cpufreq->scaling_max_perf,
+           p_cpufreq->scaling_min_perf,
            p_cpufreq->scaling_cur_freq);
 
     printf("turbo mode           : %s\n",
diff --git a/xen/drivers/acpi/pmstat.c b/xen/drivers/acpi/pmstat.c
index 892260d..7825f91 100644
--- a/xen/drivers/acpi/pmstat.c
+++ b/xen/drivers/acpi/pmstat.c
@@ -191,7 +191,9 @@ static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
     uint32_t ret = 0;
     const struct processor_pminfo *pmpt;
     struct cpufreq_policy *policy;
-    uint32_t gov_num = 0;
+    struct perf_limits *limits;
+    struct internal_governor *internal_gov;
+    uint32_t cur_gov, gov_num = 0;
     uint32_t *affected_cpus;
     uint32_t *scaling_available_frequencies;
     char     *scaling_available_governors;
@@ -200,13 +202,24 @@ static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
 
     pmpt = processor_pminfo[op->cpuid];
     policy = per_cpu(cpufreq_cpu_policy, op->cpuid);
+    limits = &policy->limits;
+    internal_gov = policy->internal_gov;
 
     if ( !pmpt || !pmpt->perf.states ||
-         !policy || !policy->governor )
+         !policy || (!policy->governor && !internal_gov) )
         return -EINVAL;
 
-    list_for_each(pos, &cpufreq_governor_list)
-        gov_num++;
+    if ( internal_gov )
+    {
+        cur_gov = internal_gov->cur_gov;
+        gov_num = internal_gov->gov_num;
+    }
+    else
+    {
+        cur_gov = NON_INTERNAL_GOV;
+        list_for_each(pos, &cpufreq_governor_list)
+            gov_num++;
+    }
 
     if ( (op->u.get_para.cpu_num  != cpumask_weight(policy->cpus)) ||
          (op->u.get_para.freq_num != pmpt->perf.state_count)    ||
@@ -240,40 +253,85 @@ static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
     if ( ret )
         return ret;
 
-    if ( !(scaling_available_governors =
-           xzalloc_array(char, gov_num * CPUFREQ_NAME_LEN)) )
-        return -ENOMEM;
-    if ( (ret = read_scaling_available_governors(scaling_available_governors,
-                gov_num * CPUFREQ_NAME_LEN * sizeof(char))) )
+    if ( internal_gov )
     {
+        ret = copy_to_guest(op->u.get_para.scaling_available_governors,
+                    internal_gov->avail_gov, gov_num * CPUFREQ_NAME_LEN);
+        if ( ret )
+            return ret;
+    }
+    else
+    {
+        if ( !(scaling_available_governors =
+               xzalloc_array(char, gov_num * CPUFREQ_NAME_LEN)) )
+            return -ENOMEM;
+        if ( (ret = read_scaling_available_governors(scaling_available_governors,
+                gov_num * CPUFREQ_NAME_LEN * sizeof(char))) )
+        {
+            xfree(scaling_available_governors);
+            return ret;
+        }
+        ret = copy_to_guest(op->u.get_para.scaling_available_governors,
+                    scaling_available_governors, gov_num * CPUFREQ_NAME_LEN);
         xfree(scaling_available_governors);
-        return ret;
+        if ( ret )
+            return ret;
     }
-    ret = copy_to_guest(op->u.get_para.scaling_available_governors,
-                scaling_available_governors, gov_num * CPUFREQ_NAME_LEN);
-    xfree(scaling_available_governors);
-    if ( ret )
-        return ret;
-
     op->u.get_para.cpuinfo_cur_freq =
         cpufreq_driver->get ? cpufreq_driver->get(op->cpuid) : policy->cur;
     op->u.get_para.cpuinfo_max_freq = policy->cpuinfo.max_freq;
     op->u.get_para.cpuinfo_min_freq = policy->cpuinfo.min_freq;
     op->u.get_para.scaling_cur_freq = policy->cur;
-    op->u.get_para.scaling_max_freq = policy->max;
-    op->u.get_para.scaling_min_freq = policy->min;
+    if ( internal_gov )
+    {
+        op->u.get_para.scaling_max_perf = limits->max_perf_pct;
+        op->u.get_para.scaling_min_perf = limits->min_perf_pct;
+        op->u.get_para.scaling_turbo_pct = limits->turbo_pct;
+        if ( !strncmp(cpufreq_driver->name,
+                     "intel_pstate", CPUFREQ_NAME_LEN) )
+            op->u.get_para.perf_alias = PERCENTAGE;
+        else
+            op->u.get_para.perf_alias = FREQUENCY;
+    }
+    else
+    {
+        op->u.get_para.scaling_max_perf = policy->max;
+        op->u.get_para.scaling_min_perf = policy->min;
+        op->u.get_para.perf_alias = FREQUENCY;
+    }
 
     if ( cpufreq_driver->name[0] )
-        strlcpy(op->u.get_para.scaling_driver, 
+        strlcpy(op->u.get_para.scaling_driver,
             cpufreq_driver->name, CPUFREQ_NAME_LEN);
     else
         strlcpy(op->u.get_para.scaling_driver, "Unknown", CPUFREQ_NAME_LEN);
 
-    if ( policy->governor->name[0] )
-        strlcpy(op->u.get_para.scaling_governor, 
-            policy->governor->name, CPUFREQ_NAME_LEN);
-    else
-        strlcpy(op->u.get_para.scaling_governor, "Unknown", CPUFREQ_NAME_LEN);
+    switch ( cur_gov )
+    {
+    case INTERNAL_GOV_PERFORMANCE:
+        strlcpy(op->u.get_para.scaling_governor,
+                "performance", CPUFREQ_NAME_LEN);
+        break;
+    case INTERNAL_GOV_POWERSAVE:
+        strlcpy(op->u.get_para.scaling_governor,
+                "powersave", CPUFREQ_NAME_LEN);
+        break;
+    case INTERNAL_GOV_USERSPACE:
+        strlcpy(op->u.get_para.scaling_governor,
+                "userspace", CPUFREQ_NAME_LEN);
+        break;
+    case INTERNAL_GOV_ONDEMAND:
+        strlcpy(op->u.get_para.scaling_governor,
+                "ondemand", CPUFREQ_NAME_LEN);
+        break;
+    case NON_INTERNAL_GOV:
+    default:
+        if ( policy->governor->name[0] )
+            strlcpy(op->u.get_para.scaling_governor,
+                policy->governor->name, CPUFREQ_NAME_LEN);
+        else
+            strlcpy(op->u.get_para.scaling_governor, "Unknown", CPUFREQ_NAME_LEN);
+    }
 
     /* governor specific para */
     if ( !strnicmp(op->u.get_para.scaling_governor, 
@@ -299,16 +357,36 @@ static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
 static int set_cpufreq_gov(struct xen_sysctl_pm_op *op)
 {
     struct cpufreq_policy new_policy, *old_policy;
+    struct internal_governor *internal_gov;
 
     old_policy = per_cpu(cpufreq_cpu_policy, op->cpuid);
     if ( !old_policy )
         return -EINVAL;
+    internal_gov = old_policy->internal_gov;
 
     memcpy(&new_policy, old_policy, sizeof(struct cpufreq_policy));
 
-    new_policy.governor = __find_governor(op->u.set_gov.scaling_governor);
-    if (new_policy.governor == NULL)
-        return -EINVAL;
+    if ( internal_gov && internal_gov->cur_gov )
+    {
+        if ( !strnicmp(op->u.set_gov.scaling_governor,
+                       "performance", CPUFREQ_NAME_LEN) )
+            internal_gov->cur_gov = INTERNAL_GOV_PERFORMANCE;
+        else if ( !strnicmp(op->u.set_gov.scaling_governor,
+                       "powersave", CPUFREQ_NAME_LEN) )
+            internal_gov->cur_gov = INTERNAL_GOV_POWERSAVE;
+        else if ( !strnicmp(op->u.set_gov.scaling_governor,
+                       "userspace", CPUFREQ_NAME_LEN) )
+            internal_gov->cur_gov = INTERNAL_GOV_USERSPACE;
+        else if ( !strnicmp(op->u.set_gov.scaling_governor,
+                       "ondemand", CPUFREQ_NAME_LEN) )
+            internal_gov->cur_gov = INTERNAL_GOV_ONDEMAND;
+    }
+    else
+    {
+        new_policy.governor = __find_governor(op->u.set_gov.scaling_governor);
+        if ( new_policy.governor == NULL )
+            return -EINVAL;
+    }
 
     return __cpufreq_set_policy(old_policy, &new_policy);
 }
@@ -317,10 +395,12 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
 {
     int ret = 0;
     struct cpufreq_policy *policy;
+    struct internal_governor *internal_gov;
 
     policy = per_cpu(cpufreq_cpu_policy, op->cpuid);
+    internal_gov = policy->internal_gov;
 
-    if ( !policy || !policy->governor )
+    if ( !policy || (!policy->governor && !internal_gov) )
         return -EINVAL;
 
     switch(op->u.set_para.ctrl_type)
@@ -329,6 +409,9 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
     {
         struct cpufreq_policy new_policy;
 
+        if ( !policy->governor || internal_gov )
+            return -EINVAL;
+
         memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
         new_policy.max = op->u.set_para.ctrl_value;
         ret = __cpufreq_set_policy(policy, &new_policy);
@@ -340,6 +423,9 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
     {
         struct cpufreq_policy new_policy;
 
+        if ( !policy->governor || internal_gov )
+            return -EINVAL;
+
         memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
         new_policy.min = op->u.set_para.ctrl_value;
         ret = __cpufreq_set_policy(policy, &new_policy);
@@ -347,10 +433,45 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
         break;
     }
 
+    case SCALING_MAX_PCT:
+    {
+        struct cpufreq_policy new_policy;
+        struct perf_limits *limits = &new_policy.limits;
+
+        if ( policy->governor || !internal_gov )
+            return -EINVAL;
+
+        new_policy = *policy;
+        limits->max_perf_pct = clamp(op->u.set_para.ctrl_value,
+                                     limits->min_policy_pct,
+                                     limits->max_policy_pct);
+        ret = __cpufreq_set_policy(policy, &new_policy);
+        break;
+    }
+
+    case SCALING_MIN_PCT:
+    {
+        struct cpufreq_policy new_policy;
+        struct perf_limits *limits = &new_policy.limits;
+
+        if ( policy->governor || !internal_gov )
+            return -EINVAL;
+
+        new_policy = *policy;
+        limits->min_perf_pct = clamp(op->u.set_para.ctrl_value,
+                                     limits->min_policy_pct,
+                                     limits->max_policy_pct);
+        ret = __cpufreq_set_policy(policy, &new_policy);
+        break;
+    }
+
     case SCALING_SETSPEED:
     {
         unsigned int freq =op->u.set_para.ctrl_value;
 
+        if ( !policy->governor || internal_gov )
+            return -EINVAL;
+
         if ( !strnicmp(policy->governor->name,
                        "userspace", CPUFREQ_NAME_LEN) )
             ret = write_userspace_scaling_setspeed(op->cpuid, freq);
@@ -364,6 +485,9 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
     {
         unsigned int sampling_rate = op->u.set_para.ctrl_value;
 
+        if ( !policy->governor || internal_gov )
+            return -EINVAL;
+
         if ( !strnicmp(policy->governor->name,
                        "ondemand", CPUFREQ_NAME_LEN) )
             ret = write_ondemand_sampling_rate(sampling_rate);
@@ -377,6 +501,9 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
     {
         unsigned int up_threshold = op->u.set_para.ctrl_value;
 
+        if ( !policy->governor || internal_gov )
+            return -EINVAL;
+
         if ( !strnicmp(policy->governor->name,
                        "ondemand", CPUFREQ_NAME_LEN) )
             ret = write_ondemand_up_threshold(up_threshold);
diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h
index ef6643a..5cbe71e 100644
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -107,7 +107,9 @@ struct cpufreq_freqs {
  *                          CPUFREQ GOVERNORS                        *
  *********************************************************************/
 
+
 /* Please add internal governors here */
+#define NON_INTERNAL_GOV              (0)
 #define INTERNAL_GOV_POWERSAVE        (1)
 #define INTERNAL_GOV_PERFORMANCE      (2)
 #define INTERNAL_GOV_USERSPACE        (3)
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
index 96680eb..dcfe541 100644
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -297,11 +297,28 @@ typedef struct xen_ondemand xen_ondemand_t;
  * same as sysfs file name of native linux
  */
 #define CPUFREQ_NAME_LEN 16
+
+enum xen_perf_alias {
+    FREQUENCY  = 0,
+    PERCENTAGE = 1
+};
+
 struct xen_get_cpufreq_para {
     /* IN/OUT variable */
     uint32_t cpu_num;
     uint32_t freq_num;
     uint32_t gov_num;
+    int32_t turbo_enabled;
+
+    uint32_t cpuinfo_cur_freq;
+    uint32_t cpuinfo_max_freq;
+    uint32_t cpuinfo_min_freq;
+    uint32_t scaling_cur_freq;
+
+    uint32_t scaling_turbo_pct;
+    uint32_t scaling_max_perf;
+    uint32_t scaling_min_perf;
+    enum xen_perf_alias perf_alias;
 
     /* for all governors */
     /* OUT variable */
@@ -309,23 +326,13 @@ struct xen_get_cpufreq_para {
     XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies;
     XEN_GUEST_HANDLE_64(char)   scaling_available_governors;
     char scaling_driver[CPUFREQ_NAME_LEN];
-
-    uint32_t cpuinfo_cur_freq;
-    uint32_t cpuinfo_max_freq;
-    uint32_t cpuinfo_min_freq;
-    uint32_t scaling_cur_freq;
-
     char scaling_governor[CPUFREQ_NAME_LEN];
-    uint32_t scaling_max_freq;
-    uint32_t scaling_min_freq;
 
     /* for specific governor */
     union {
         struct  xen_userspace userspace;
         struct  xen_ondemand ondemand;
     } u;
-
-    int32_t turbo_enabled;
 };
 
 struct xen_set_cpufreq_gov {
@@ -338,6 +345,8 @@ struct xen_set_cpufreq_para {
     #define SCALING_SETSPEED           3
     #define SAMPLING_RATE              4
     #define UP_THRESHOLD               5
+    #define SCALING_MAX_PCT            6
+    #define SCALING_MIN_PCT            7
 
     uint32_t ctrl_type;
     uint32_t ctrl_value;
-- 
1.9.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread
- * Re: [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c
  2015-10-28  3:21 ` [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c Wei Wang
@ 2015-10-28 11:18   ` Wei Liu
  2015-11-20 14:47   ` Jan Beulich
  1 sibling, 0 replies; 14+ messages in thread
From: Wei Liu @ 2015-10-28 11:18 UTC (permalink / raw)
  To: Wei Wang; +Cc: andrew.cooper3, wei.liu2, jbeulich, xen-devel
On Wed, Oct 28, 2015 at 11:21:17AM +0800, Wei Wang wrote:
> Add support in the pmstat.c so that the xenpm tool can request to
> access the intel_pstate driver.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> ---
>  changes in v6:
>  1) add the NON_INTERNAL_GOV macro to replace literal 0;
>  2) code consolidation (e.g. merging some code into if/else, as required in v5);
>  3) somewhere, change to use clamp, instead of clamp_t;
>  4) xen_perf_alias, instead of perf_alias.
> 
>  tools/libxc/include/xenctrl.h      |  20 ++--
>  tools/libxc/xc_pm.c                |  16 ++--
>  tools/misc/xenpm.c                 |   4 +-
I think all changes to toolstack code are merely trying to mirror
hypervisor side changes and there are no functional changes.
With that understanding and subject to an ack from hypervisor side
maintainer:
Acked-by: Wei Liu <wei.liu2@citrix.com>
^ permalink raw reply	[flat|nested] 14+ messages in thread 
- * Re: [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c
  2015-10-28  3:21 ` [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c Wei Wang
  2015-10-28 11:18   ` Wei Liu
@ 2015-11-20 14:47   ` Jan Beulich
  1 sibling, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2015-11-20 14:47 UTC (permalink / raw)
  To: Wei Wang; +Cc: andrew.cooper3, wei.liu2, xen-devel
>>> On 28.10.15 at 04:21, <wei.w.wang@intel.com> wrote:
>  tools/libxc/include/xenctrl.h      |  20 ++--
>  tools/libxc/xc_pm.c                |  16 ++--
I won't (again) comment on the changes of these files (I'll leave it
to the tools maintainers), but I can't help thinking that there's more
shuffling there than necessary.
> --- a/tools/misc/xenpm.c
> +++ b/tools/misc/xenpm.c
> @@ -705,8 +705,8 @@ static void print_cpufreq_para(int cpuid, struct xc_get_cpufreq_para *p_cpufreq)
>      printf("\n");
>  
>      printf("scaling frequency    : max [%u] min [%u] cur [%u]\n",
> -           p_cpufreq->scaling_max_freq,
> -           p_cpufreq->scaling_min_freq,
> +           p_cpufreq->scaling_max_perf,
> +           p_cpufreq->scaling_min_perf,
>             p_cpufreq->scaling_cur_freq);
So where did your percentages go?
> @@ -240,40 +253,85 @@ static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
>      if ( ret )
>          return ret;
>  
> -    if ( !(scaling_available_governors =
> -           xzalloc_array(char, gov_num * CPUFREQ_NAME_LEN)) )
> -        return -ENOMEM;
> -    if ( (ret = read_scaling_available_governors(scaling_available_governors,
> -                gov_num * CPUFREQ_NAME_LEN * sizeof(char))) )
> +    if ( internal_gov )
>      {
> +        ret = copy_to_guest(op->u.get_para.scaling_available_governors,
> +                    internal_gov->avail_gov, gov_num * CPUFREQ_NAME_LEN);
> +        if ( ret )
> +            return ret;
> +    }
> +    else
> +    {
> +        if ( !(scaling_available_governors =
> +               xzalloc_array(char, gov_num * CPUFREQ_NAME_LEN)) )
> +            return -ENOMEM;
> +        if ( (ret = read_scaling_available_governors(scaling_available_governors,
> +                gov_num * CPUFREQ_NAME_LEN * sizeof(char))) )
> +        {
> +            xfree(scaling_available_governors);
> +            return ret;
> +        }
> +        ret = copy_to_guest(op->u.get_para.scaling_available_governors,
> +                    scaling_available_governors, gov_num * CPUFREQ_NAME_LEN);
>          xfree(scaling_available_governors);
> -        return ret;
> +        if ( ret )
> +            return ret;
>      }
Even if the benefit may look tiny, code that can be shared should be
shared. In the case here, the conditional return should be pulled out
of the if and else branches. (It in fact seems possible to also pull out
the copy_to_guest(), which would result in quite a bit less churn on
the code.)
> -    op->u.get_para.scaling_max_freq = policy->max;
> -    op->u.get_para.scaling_min_freq = policy->min;
> +    if ( internal_gov )
> +    {
> +        op->u.get_para.scaling_max_perf = limits->max_perf_pct;
> +        op->u.get_para.scaling_min_perf = limits->min_perf_pct;
> +        op->u.get_para.scaling_turbo_pct = limits->turbo_pct;
> +        if ( !strncmp(cpufreq_driver->name,
> +                     "intel_pstate", CPUFREQ_NAME_LEN) )
> +            op->u.get_para.perf_alias = PERCENTAGE;
> +        else
> +            op->u.get_para.perf_alias = FREQUENCY;
Shouldn't the internal governor tell you, rather than keying this
on its name?
> @@ -299,16 +357,36 @@ static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
>  static int set_cpufreq_gov(struct xen_sysctl_pm_op *op)
>  {
>      struct cpufreq_policy new_policy, *old_policy;
> +    struct internal_governor *internal_gov;
>  
>      old_policy = per_cpu(cpufreq_cpu_policy, op->cpuid);
>      if ( !old_policy )
>          return -EINVAL;
> +    internal_gov = old_policy->internal_gov;
>  
>      memcpy(&new_policy, old_policy, sizeof(struct cpufreq_policy));
>  
> -    new_policy.governor = __find_governor(op->u.set_gov.scaling_governor);
> -    if (new_policy.governor == NULL)
> -        return -EINVAL;
> +    if ( internal_gov && internal_gov->cur_gov )
Either the right side of the && has to go away (preferred), or it has
to become != NON_INTERNAL_GOV.
> +    {
> +        if ( !strnicmp(op->u.set_gov.scaling_governor,
> +                       "performance", CPUFREQ_NAME_LEN) )
> +            internal_gov->cur_gov = INTERNAL_GOV_PERFORMANCE;
> +        else if ( !strnicmp(op->u.set_gov.scaling_governor,
> +                       "powersave", CPUFREQ_NAME_LEN) )
> +            internal_gov->cur_gov = INTERNAL_GOV_POWERSAVE;
> +        else if ( !strnicmp(op->u.set_gov.scaling_governor,
> +                       "userspace", CPUFREQ_NAME_LEN) )
> +            internal_gov->cur_gov = INTERNAL_GOV_USERSPACE;
> +        else if ( !strnicmp(op->u.set_gov.scaling_governor,
> +                       "ondemand", CPUFREQ_NAME_LEN) )
> +            internal_gov->cur_gov = INTERNAL_GOV_ONDEMAND;
else ...
> @@ -317,10 +395,12 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
>  {
>      int ret = 0;
>      struct cpufreq_policy *policy;
> +    struct internal_governor *internal_gov;
>  
>      policy = per_cpu(cpufreq_cpu_policy, op->cpuid);
> +    internal_gov = policy->internal_gov;
>  
> -    if ( !policy || !policy->governor )
> +    if ( !policy || (!policy->governor && !internal_gov) )
>          return -EINVAL;
>  
>      switch(op->u.set_para.ctrl_type)
> @@ -329,6 +409,9 @@ static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
>      {
>          struct cpufreq_policy new_policy;
>  
> +        if ( !policy->governor || internal_gov )
> +            return -EINVAL;
The "!policy->governor" part looks to be dead code, considering the
check done in the previous hunk.
> --- a/xen/include/public/sysctl.h
> +++ b/xen/include/public/sysctl.h
> @@ -297,11 +297,28 @@ typedef struct xen_ondemand xen_ondemand_t;
>   * same as sysfs file name of native linux
>   */
>  #define CPUFREQ_NAME_LEN 16
> +
> +enum xen_perf_alias {
> +    FREQUENCY  = 0,
> +    PERCENTAGE = 1
> +};
The need proper prefixes to avoid name space collisions, e.g.
XEN_CPUFREQ_MODE_ (and the enum name being "alias" is
also odd, "mode" or some such would seem better there too).
>  struct xen_get_cpufreq_para {
>      /* IN/OUT variable */
>      uint32_t cpu_num;
>      uint32_t freq_num;
>      uint32_t gov_num;
> +    int32_t turbo_enabled;
> +
> +    uint32_t cpuinfo_cur_freq;
> +    uint32_t cpuinfo_max_freq;
> +    uint32_t cpuinfo_min_freq;
> +    uint32_t scaling_cur_freq;
> +
> +    uint32_t scaling_turbo_pct;
> +    uint32_t scaling_max_perf;
> +    uint32_t scaling_min_perf;
> +    enum xen_perf_alias perf_alias;
You shouldn't use fields of non-fixed type in the public interface.
> @@ -309,23 +326,13 @@ struct xen_get_cpufreq_para {
>      XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies;
>      XEN_GUEST_HANDLE_64(char)   scaling_available_governors;
>      char scaling_driver[CPUFREQ_NAME_LEN];
> -
> -    uint32_t cpuinfo_cur_freq;
> -    uint32_t cpuinfo_max_freq;
> -    uint32_t cpuinfo_min_freq;
> -    uint32_t scaling_cur_freq;
> -
>      char scaling_governor[CPUFREQ_NAME_LEN];
> -    uint32_t scaling_max_freq;
> -    uint32_t scaling_min_freq;
>  
>      /* for specific governor */
>      union {
>          struct  xen_userspace userspace;
>          struct  xen_ondemand ondemand;
>      } u;
> -
> -    int32_t turbo_enabled;
>  };
Just to repeat myself: Too much shuffling for no reason. I can see
why you want turbo_enabled go up (to reduce holes), but I don't
see the point of any of the other moving around of members.
Jan
^ permalink raw reply	[flat|nested] 14+ messages in thread
 
- * [PATCH v6 6/6] tools: enable xenpm to control the intel_pstate driver
  2015-10-28  3:21 [PATCH v6 0/6] Porting the intel_pstate driver to Xen Wei Wang
                   ` (4 preceding siblings ...)
  2015-10-28  3:21 ` [PATCH v6 5/6] x86/intel_pstate: support the use of intel_pstate in pmstat.c Wei Wang
@ 2015-10-28  3:21 ` Wei Wang
  2015-10-28 11:36   ` Wei Liu
  5 siblings, 1 reply; 14+ messages in thread
From: Wei Wang @ 2015-10-28  3:21 UTC (permalink / raw)
  To: jbeulich, andrew.cooper3, xen-devel, wei.liu2; +Cc: Wei Wang
The intel_pstate driver receives percentage values to set the
performance limits. This patch adds interfaces to support the
input of percentage values to control the intel_pstate driver.
The "get-cpufreq-para" is modified to show percentage
based feedback info.
Also, some changes in identation are made to make the printed
info looks tidy.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 changes in v6:
 No big change in this version, since we did not get any comments from the maintainers
 in the previous version.
 tools/misc/xenpm.c | 116 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 94 insertions(+), 22 deletions(-)
diff --git a/tools/misc/xenpm.c b/tools/misc/xenpm.c
index 5944fdb..5644817 100644
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -32,6 +32,11 @@
 #define MAX_CORE_RESIDENCIES 8
 
 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+#define min_t(type,x,y) \
+        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
+#define max_t(type,x,y) \
+        ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
+#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
 
 static xc_interface *xc_handle;
 static unsigned int max_cpu_nr;
@@ -46,6 +51,9 @@ void show_help(void)
             " get-cpuidle-states    [cpuid]       list cpu idle info of CPU <cpuid> or all\n"
             " get-cpufreq-states    [cpuid]       list cpu freq info of CPU <cpuid> or all\n"
             " get-cpufreq-para      [cpuid]       list cpu freq parameter of CPU <cpuid> or all\n"
+            " set-scaling-max-pct   [cpuid] <num> set max performance limit in percentage\n"
+            "                                     or as scaling speed in percentage in userspace governor\n"
+            " set-scaling-min-pct   [cpuid] <num> set min performance limit in percentage\n"
             " set-scaling-maxfreq   [cpuid] <HZ>  set max cpu frequency <HZ> on CPU <cpuid>\n"
             "                                     or all CPUs\n"
             " set-scaling-minfreq   [cpuid] <HZ>  set min cpu frequency <HZ> on CPU <cpuid>\n"
@@ -59,10 +67,10 @@ void show_help(void)
             " set-up-threshold      [cpuid] <num> set up threshold on CPU <cpuid> or all\n"
             "                                     it is used in ondemand governor.\n"
             " get-cpu-topology                    get thread/core/socket topology info\n"
-            " set-sched-smt           enable|disable enable/disable scheduler smt power saving\n"
+            " set-sched-smt                       enable|disable enable/disable scheduler smt power saving\n"
             " set-vcpu-migration-delay      <num> set scheduler vcpu migration delay in us\n"
             " get-vcpu-migration-delay            get scheduler vcpu migration delay\n"
-            " set-max-cstate        <num>         set the C-State limitation (<num> >= 0)\n"
+            " set-max-cstate                <num> set the C-State limitation (<num> >= 0)\n"
             " start [seconds]                     start collect Cx/Px statistics,\n"
             "                                     output after CTRL-C or SIGINT or several seconds.\n"
             " enable-turbo-mode     [cpuid]       enable Turbo Mode for processors that support it.\n"
@@ -677,37 +685,51 @@ static void print_cpufreq_para(int cpuid, struct xc_get_cpufreq_para *p_cpufreq)
 
     printf("current_governor     : %s\n", p_cpufreq->scaling_governor);
     if ( !strncmp(p_cpufreq->scaling_governor,
-                  "userspace", CPUFREQ_NAME_LEN) )
+                  "userspace", CPUFREQ_NAME_LEN) &&
+         strncmp(p_cpufreq->scaling_driver,
+                 "intel_pstate", CPUFREQ_NAME_LEN) )
     {
-        printf("  userspace specific :\n");
-        printf("    scaling_setspeed : %u\n",
+        printf("userspace specific   :\n");
+        printf("scaling_setspeed     : %u\n",
                p_cpufreq->u.userspace.scaling_setspeed);
     }
     else if ( !strncmp(p_cpufreq->scaling_governor,
-                       "ondemand", CPUFREQ_NAME_LEN) )
+                       "ondemand", CPUFREQ_NAME_LEN) &&
+              strncmp(p_cpufreq->scaling_driver,
+                      "intel_pstate", CPUFREQ_NAME_LEN) )
     {
-        printf("  ondemand specific  :\n");
-        printf("    sampling_rate    : max [%u] min [%u] cur [%u]\n",
+        printf("ondemand specific    :\n");
+        printf("sampling_rate        : max [%u] min [%u] cur [%u]\n",
                p_cpufreq->u.ondemand.sampling_rate_max,
                p_cpufreq->u.ondemand.sampling_rate_min,
                p_cpufreq->u.ondemand.sampling_rate);
-        printf("    up_threshold     : %u\n",
+        printf("up_threshold         : %u\n",
                p_cpufreq->u.ondemand.up_threshold);
     }
 
-    printf("scaling_avail_freq   :");
-    for ( i = 0; i < p_cpufreq->freq_num; i++ )
-        if ( p_cpufreq->scaling_available_frequencies[i] ==
-             p_cpufreq->scaling_cur_freq )
-            printf(" *%d", p_cpufreq->scaling_available_frequencies[i]);
-        else
-            printf(" %d", p_cpufreq->scaling_available_frequencies[i]);
-    printf("\n");
-
-    printf("scaling frequency    : max [%u] min [%u] cur [%u]\n",
-           p_cpufreq->scaling_max_perf,
-           p_cpufreq->scaling_min_perf,
-           p_cpufreq->scaling_cur_freq);
+    switch ( p_cpufreq->perf_alias )
+    {
+    case PERCENTAGE:
+        printf("max_perf_pct         : %d\n", p_cpufreq->scaling_max_perf);
+        printf("min_perf_pct         : %d\n", p_cpufreq->scaling_min_perf);
+        printf("turbo_pct            : %d\n", p_cpufreq->scaling_turbo_pct);
+        break;
+    case FREQUENCY:
+    default:
+        printf("scaling_avail_freq   :");
+        for ( i = 0; i < p_cpufreq->freq_num; i++ )
+            if ( p_cpufreq->scaling_available_frequencies[i] ==
+                 p_cpufreq->scaling_cur_freq )
+                printf(" *%d", p_cpufreq->scaling_available_frequencies[i]);
+            else
+                printf(" %d", p_cpufreq->scaling_available_frequencies[i]);
+        printf("\n");
+        printf("scaling frequency    : max [%u] min [%u] cur [%u]\n",
+               p_cpufreq->scaling_max_perf,
+               p_cpufreq->scaling_min_perf,
+               p_cpufreq->scaling_cur_freq);
+        break;
+    }
 
     printf("turbo mode           : %s\n",
            p_cpufreq->turbo_enabled ? "enabled" : "disabled or n/a");
@@ -854,6 +876,54 @@ void scaling_min_freq_func(int argc, char *argv[])
     }
 }
 
+void scaling_max_pct_func(int argc, char *argv[])
+{
+    int cpuid = -1, pct = -1;
+
+    parse_cpuid_and_int(argc, argv, &cpuid, &pct, "percentage");
+    pct = clamp_t(int, pct, 0, 100);
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_handle, i, SCALING_MAX_PCT, pct) )
+                fprintf(stderr,
+                        "[CPU%d] failed to set scaling max freq (%d - %s)\n",
+                        i, errno, strerror(errno));
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_handle, cpuid, SCALING_MAX_PCT, pct) )
+            fprintf(stderr, "failed to set scaling max freq (%d - %s)\n",
+                    errno, strerror(errno));
+    }
+}
+
+void scaling_min_pct_func(int argc, char *argv[])
+{
+    int cpuid = -1, pct = -1;
+
+    parse_cpuid_and_int(argc, argv, &cpuid, &pct, "percentage");
+    pct = clamp_t(int, pct, 0, 100);
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_handle, i, SCALING_MIN_PCT, pct) )
+                fprintf(stderr,
+                        "[CPU%d] failed to set scaling max pct (%d - %s)\n",
+                        i, errno, strerror(errno));
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_handle, cpuid, SCALING_MIN_PCT, pct) )
+            fprintf(stderr, "failed to set scaling min pct (%d - %s)\n",
+                    errno, strerror(errno));
+    }
+}
+
 void scaling_speed_func(int argc, char *argv[])
 {
     int cpuid = -1, speed = -1;
@@ -1133,6 +1203,8 @@ struct {
     { "get-cpufreq-para", cpufreq_para_func },
     { "set-scaling-maxfreq", scaling_max_freq_func },
     { "set-scaling-minfreq", scaling_min_freq_func },
+    { "set-scaling-max-pct", scaling_max_pct_func},
+    { "set-scaling-min-pct", scaling_min_pct_func},
     { "set-scaling-governor", scaling_governor_func },
     { "set-scaling-speed", scaling_speed_func },
     { "set-sampling-rate", scaling_sampling_rate_func },
-- 
1.9.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread
- * Re: [PATCH v6 6/6] tools: enable xenpm to control the intel_pstate driver
  2015-10-28  3:21 ` [PATCH v6 6/6] tools: enable xenpm to control the intel_pstate driver Wei Wang
@ 2015-10-28 11:36   ` Wei Liu
  0 siblings, 0 replies; 14+ messages in thread
From: Wei Liu @ 2015-10-28 11:36 UTC (permalink / raw)
  To: Wei Wang; +Cc: andrew.cooper3, wei.liu2, jbeulich, xen-devel
On Wed, Oct 28, 2015 at 11:21:18AM +0800, Wei Wang wrote:
> The intel_pstate driver receives percentage values to set the
> performance limits. This patch adds interfaces to support the
> input of percentage values to control the intel_pstate driver.
> The "get-cpufreq-para" is modified to show percentage
> based feedback info.
> Also, some changes in identation are made to make the printed
> info looks tidy.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Acked-by: Wei Liu <wei.liu2@citrix.com>
^ permalink raw reply	[flat|nested] 14+ messages in thread