* [PATCH 1/9] Add cpu idle pwr mgmt to xen
@ 2008-04-25 5:07 Wei, Gang
2008-04-25 13:00 ` Keir Fraser
0 siblings, 1 reply; 20+ messages in thread
From: Wei, Gang @ 2008-04-25 5:07 UTC (permalink / raw)
To: xen-devel
[-- Attachment #1: Type: text/plain, Size: 326 bytes --]
Add basic acpi C-states based cpu idle power mgmt in xen for x86.
It includes:
1. hypercall definition for passing ACPI info.
2. C1/C2 support.
3. Mwait support, as well as legacy ioport.
4. Ladder policy from Linux kernel.
A lot of code & ideas came from Linux.
Signed-off-by: Wei Gang <gang.wei@intel.com>
[-- Attachment #2: xen-1-cx_base-0425.patch --]
[-- Type: application/octet-stream, Size: 28276 bytes --]
Add basic acpi C-states based cpu idle power mgmt in xen for x86.
It includes:
1. hypercall definition for passing ACPI info.
2. C1/C2 support.
3. Mwait support, as well as legacy ioport.
4. Ladder policy from Linux kernel.
A lot of code & ideas came from Linux.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 2ebb7f79e3bb xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile Tue Apr 22 19:07:48 2008 +0100
+++ b/xen/arch/x86/acpi/Makefile Thu Apr 24 11:37:34 2008 +0800
@@ -1,2 +1,2 @@ obj-y += boot.o
obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
diff -r 2ebb7f79e3bb xen/arch/x86/acpi/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpu_idle.c Thu Apr 24 11:37:34 2008 +0800
@@ -0,0 +1,691 @@
+/*
+ * cpu_idle - xen idle state module derived from Linux
+ * drivers/acpi/processor_idle.c &
+ * arch/x86/kernel/acpi/cstate.c
+ *
+ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * - Added processor hotplug support
+ * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added support for C3 on SMP
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/smp.h>
+#include <asm/cache.h>
+#include <asm/io.h>
+#include <xen/guest_access.h>
+#include <public/platform.h>
+#include <asm/processor.h>
+
+#define DEBUG_PM_CX
+
+#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+
+#define ACPI_PROCESSOR_MAX_POWER 8
+#define ACPI_PROCESSOR_MAX_C2_LATENCY 100
+#define ACPI_PROCESSOR_MAX_C3_LATENCY 1000
+
+extern u32 pmtmr_ioport;
+
+extern void (*pm_idle) (void);
+
+#ifdef COMPAT
+extern void (*pm_idle_save) (void);
+extern unsigned int max_cstate;
+#else
+#undef guest_from_compat_handle
+#define guest_from_compat_handle(x,y) ((x)=(y))
+void (*pm_idle_save) (void) __read_mostly;
+unsigned int max_cstate __read_mostly = 2;
+integer_param("max_cstate", max_cstate);
+#endif
+
+struct acpi_processor_cx;
+
+struct acpi_processor_cx_policy
+{
+ u32 count;
+ struct acpi_processor_cx *state;
+ struct
+ {
+ u32 time;
+ u32 ticks;
+ u32 count;
+ u32 bm;
+ } threshold;
+};
+
+struct acpi_processor_cx
+{
+ u8 valid;
+ u8 type;
+ u32 address;
+ u8 space_id;
+ u32 latency;
+ u32 latency_ticks;
+ u32 power;
+ u32 usage;
+ u64 time;
+ struct acpi_processor_cx_policy promotion;
+ struct acpi_processor_cx_policy demotion;
+};
+
+struct acpi_processor_power
+{
+ struct acpi_processor_cx *state;
+ unsigned long bm_check_timestamp;
+ u32 default_state;
+ u32 bm_activity;
+ int count;
+ struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+#ifdef COMPAT
+extern struct acpi_processor_power processor_powers[NR_CPUS];
+#else
+struct acpi_processor_power processor_powers[NR_CPUS];
+#endif
+
+#ifdef COMPAT
+extern void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power);
+extern int init_cx_pminfo(struct acpi_processor_power *acpi_power);
+extern int acpi_processor_set_power_policy(struct acpi_processor_power *power);
+extern void acpi_processor_idle(void);
+#else
+void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
+{
+ printk("saved cpu%d cx acpi info:\n", cpu);
+ printk("\tcurrent state is C%d\n", (power->state)?power->state->type:-1);
+ printk("\tbm_check_timestamp = %"PRId64"\n", power->bm_check_timestamp);
+ printk("\tdefault_state = %d\n", power->default_state);
+ printk("\tbm_activity = 0x%08x\n", power->bm_activity);
+ printk("\tcount = %d\n", power->count);
+
+ for ( uint32_t i = 0; i < power->count; i++ )
+ {
+ printk("\tstates[%d]:\n", i);
+ printk("\t\tvalid = %d\n", power->states[i].valid);
+ printk("\t\ttype = %d\n", power->states[i].type);
+ printk("\t\taddress = 0x%x\n", power->states[i].address);
+ printk("\t\tspace_id = 0x%x\n", power->states[i].space_id);
+ printk("\t\tlatency = %d\n", power->states[i].latency);
+ printk("\t\tpower = %d\n", power->states[i].power);
+ printk("\t\tlatency_ticks = %d\n", power->states[i].latency_ticks);
+ printk("\t\tusage = %d\n", power->states[i].usage);
+ printk("\t\ttime = %"PRId64"\n", power->states[i].time);
+
+ printk("\t\tpromotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].promotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].promotion.state) ?
+ power->states[i].promotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].promotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].promotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].promotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].promotion.threshold.bm);
+
+ printk("\t\tdemotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].demotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].demotion.state) ?
+ power->states[i].demotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].demotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].demotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].demotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].demotion.threshold.bm);
+ }
+}
+
+void dump_cx(unsigned char key)
+{
+ for( int i = 0; i < num_online_cpus(); i++ )
+ print_acpi_power(i, &processor_powers[i]);
+}
+
+static inline u32 ticks_elapsed(u32 t1, u32 t2)
+{
+ if ( t2 >= t1 )
+ return (t2 - t1);
+ else
+ return ((0xFFFFFFFF - t1) + t2);
+}
+
+static void acpi_processor_power_activate(struct acpi_processor_power *power,
+ struct acpi_processor_cx *new)
+{
+ struct acpi_processor_cx *old;
+
+ if ( !power || !new )
+ return;
+
+ old = power->state;
+
+ if ( old )
+ old->promotion.count = 0;
+ new->demotion.count = 0;
+
+ power->state = new;
+
+ return;
+}
+
+static void acpi_safe_halt(void)
+{
+ smp_mb__after_clear_bit();
+ safe_halt();
+}
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ __monitor((void *)current, 0, 0);
+ smp_mb();
+ __mwait(eax, ecx);
+}
+
+static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
+}
+
+static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+{
+ if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+ {
+ /* Call into architectural FFH based C-state */
+ acpi_processor_ffh_cstate_enter(cx);
+ }
+ else
+ {
+ int unused;
+ /* IO port based C-state */
+ inb(cx->address);
+ /* Dummy wait op - must do something useless after P_LVL2 read
+ because chipsets cannot guarantee that STPCLK# signal
+ gets asserted in time to freeze execution properly. */
+ unused = inl(pmtmr_ioport);
+ }
+}
+
+void acpi_processor_idle(void)
+{
+ struct acpi_processor_power *power = NULL;
+ struct acpi_processor_cx *cx = NULL;
+ struct acpi_processor_cx *next_state = NULL;
+ int sleep_ticks = 0;
+ u32 t1, t2 = 0;
+
+ power = &processor_powers[smp_processor_id()];
+
+ /*
+ * Interrupts must be disabled during bus mastering calculations and
+ * for C2/C3 transitions.
+ */
+ local_irq_disable();
+ cx = power->state;
+ if ( !cx )
+ {
+ if ( pm_idle_save )
+ {
+ printk(XENLOG_DEBUG "call pm_idle_save()\n");
+ pm_idle_save();
+ }
+ else
+ {
+ printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
+ acpi_safe_halt();
+ }
+ return;
+ }
+
+ /*
+ * Sleep:
+ * ------
+ * Invoke the current Cx state to put the processor to sleep.
+ */
+ if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
+ smp_mb__after_clear_bit();
+
+ switch ( cx->type )
+ {
+ case ACPI_STATE_C1:
+ /*
+ * Invoke C1.
+ * Use the appropriate idle routine, the one that would
+ * be used without acpi C-states.
+ */
+ if ( pm_idle_save )
+ pm_idle_save();
+ else
+ acpi_safe_halt();
+
+ /*
+ * TBD: Can't get time duration while in C1, as resumes
+ * go to an ISR rather than here. Need to instrument
+ * base interrupt handler.
+ */
+ sleep_ticks = 0xFFFFFFFF;
+ break;
+
+ case ACPI_STATE_C2:
+ /* Get start time (ticks) */
+ t1 = inl(pmtmr_ioport);
+ /* Invoke C2 */
+ acpi_idle_do_entry(cx);
+ /* Get end time (ticks) */
+ t2 = inl(pmtmr_ioport);
+
+ /* Re-enable interrupts */
+ local_irq_enable();
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks =
+ ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+ break;
+ default:
+ local_irq_enable();
+ return;
+ }
+
+ cx->usage++;
+ if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+ cx->time += sleep_ticks;
+
+ next_state = power->state;
+
+ /*
+ * Promotion?
+ * ----------
+ * Track the number of longs (time asleep is greater than threshold)
+ * and promote when the count threshold is reached. Note that bus
+ * mastering activity may prevent promotions.
+ * Do not promote above max_cstate.
+ */
+ if ( cx->promotion.state &&
+ ((cx->promotion.state - power->states) <= max_cstate) )
+ {
+ if ( sleep_ticks > cx->promotion.threshold.ticks )
+ {
+ cx->promotion.count++;
+ cx->demotion.count = 0;
+ if ( cx->promotion.count >= cx->promotion.threshold.count )
+ {
+ next_state = cx->promotion.state;
+ goto end;
+ }
+ }
+ }
+
+ /*
+ * Demotion?
+ * ---------
+ * Track the number of shorts (time asleep is less than time threshold)
+ * and demote when the usage threshold is reached.
+ */
+ if ( cx->demotion.state )
+ {
+ if ( sleep_ticks < cx->demotion.threshold.ticks )
+ {
+ cx->demotion.count++;
+ cx->promotion.count = 0;
+ if ( cx->demotion.count >= cx->demotion.threshold.count )
+ {
+ next_state = cx->demotion.state;
+ goto end;
+ }
+ }
+ }
+
+end:
+ /*
+ * Demote if current state exceeds max_cstate
+ */
+ if ( (power->state - power->states) > max_cstate )
+ {
+ if ( cx->demotion.state )
+ next_state = cx->demotion.state;
+ }
+
+ /*
+ * New Cx State?
+ * -------------
+ * If we're going to start using a new Cx state we must clean up
+ * from the previous and prepare to use the new.
+ */
+ if ( next_state != power->state )
+ acpi_processor_power_activate(power, next_state);
+}
+
+int acpi_processor_set_power_policy(struct acpi_processor_power *power)
+{
+ unsigned int i;
+ unsigned int state_is_set = 0;
+ struct acpi_processor_cx *lower = NULL;
+ struct acpi_processor_cx *higher = NULL;
+ struct acpi_processor_cx *cx;
+
+ if ( !power )
+ return -EINVAL;
+
+ /*
+ * This function sets the default Cx state policy (OS idle handler).
+ * Our scheme is to promote quickly to C2 but more conservatively
+ * to C3. We're favoring C2 for its characteristics of low latency
+ * (quick response), good power savings, and ability to allow bus
+ * mastering activity. Note that the Cx state policy is completely
+ * customizable and can be altered dynamically.
+ */
+
+ /* startup state */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( !state_is_set )
+ power->state = cx;
+ state_is_set++;
+ break;
+ }
+
+ if ( !state_is_set )
+ return -ENODEV;
+
+ /* demotion */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( lower )
+ {
+ cx->demotion.state = lower;
+ cx->demotion.threshold.ticks = cx->latency_ticks;
+ cx->demotion.threshold.count = 1;
+ }
+
+ lower = cx;
+ }
+
+ /* promotion */
+ for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( higher )
+ {
+ cx->promotion.state = higher;
+ cx->promotion.threshold.ticks = cx->latency_ticks;
+ if ( cx->type >= ACPI_STATE_C2 )
+ cx->promotion.threshold.count = 4;
+ else
+ cx->promotion.threshold.count = 10;
+ }
+
+ higher = cx;
+ }
+
+ return 0;
+}
+
+int init_cx_pminfo(struct acpi_processor_power *acpi_power)
+{
+ memset(acpi_power, 0, sizeof(*acpi_power));
+
+ acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+
+ acpi_power->states[ACPI_STATE_C0].valid = 1;
+ acpi_power->states[ACPI_STATE_C1].valid = 1;
+
+ acpi_power->count = 2;
+
+ return 0;
+}
+#endif
+
+#define CPUID_MWAIT_LEAF (5)
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
+#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_SUBSTATE_SIZE (4)
+
+static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
+{
+ struct cpuinfo_x86 *c = ¤t_cpu_data;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
+ if ( c->cpuid_level < CPUID_MWAIT_LEAF )
+ {
+ printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
+ return -1;
+ }
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ printk(XENLOG_DEBUG "cpuid.MWAIT[.eax=%x, .ebx=%x, .ecx=%x, .edx=%x]\n",
+ eax, ebx, ecx, edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
+ return -1;
+
+ /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+ if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
+ return -1;
+
+ printk(XENLOG_INFO "Monitor-Mwait will be used to enter C-%d state\n", cx->type);
+ return 0;
+}
+
+#define VENDOR_INTEL (1)
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static int check_cx(xen_processor_cx_t *cx)
+{
+ if ( cx == NULL )
+ return -1;
+
+ switch ( cx->reg.space_id )
+ {
+ case ACPI_ADR_SPACE_SYSTEM_IO:
+ if ( cx->reg.address == 0 )
+ return -1;
+ break;
+
+ case ACPI_ADR_SPACE_FIXED_HARDWARE:
+ if ( cx->type > ACPI_STATE_C1 )
+ {
+ if ( cx->reg.bit_width != VENDOR_INTEL ||
+ cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+ return -1;
+
+ /* assume all logical cpu has the same support for mwait */
+ if ( acpi_processor_ffh_cstate_probe(cx) )
+ return -1;
+ }
+ break;
+
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+static int set_cx(struct acpi_processor_power *acpi_power,
+ xen_processor_cx_t *xen_cx)
+{
+ struct acpi_processor_cx *cx;
+
+ /* skip unsupported acpi cstate */
+ if ( check_cx(xen_cx) )
+ return -1;
+
+ cx = &acpi_power->states[xen_cx->type];
+ if ( !cx->valid )
+ acpi_power->count++;
+
+ cx->valid = 1;
+ cx->type = xen_cx->type;
+ cx->address = xen_cx->reg.address;
+ cx->space_id = xen_cx->reg.space_id;
+ cx->latency = xen_cx->latency;
+ cx->power = xen_cx->power;
+
+ cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+
+ return 0;
+}
+
+static int get_cpu_id(u8 acpi_id)
+{
+ int i;
+ u8 apic_id;
+
+ apic_id = x86_acpiid_to_apicid[acpi_id];
+ if ( apic_id == 0xff )
+ return -1;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ if ( apic_id == x86_cpu_to_apicid[i] )
+ return i;
+ }
+
+ return -1;
+}
+
+#ifdef DEBUG_PM_CX
+static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t state;
+
+ XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
+ xen_processor_csd_t dp;
+
+ printk("cpu%d cx acpi info:\n", cpu);
+ printk("\tcount = %d\n", power->count);
+ printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
+ "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
+ power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
+ power->flags.power_setup_done, power->flags.bm_rld_set);
+
+ guest_from_compat_handle(states, power->states);
+
+ for ( uint32_t i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&state, states, i, 1);
+
+ printk("\tstates[%d]:\n", i);
+ printk("\t\treg.space_id = 0x%x\n", state.reg.space_id);
+ printk("\t\treg.bit_width = 0x%x\n", state.reg.bit_width);
+ printk("\t\treg.bit_offset = 0x%x\n", state.reg.bit_offset);
+ printk("\t\treg.access_size = 0x%x\n", state.reg.access_size);
+ printk("\t\treg.address = 0x%"PRIx64"\n", state.reg.address);
+ printk("\t\ttype = %d\n", state.type);
+ printk("\t\tlatency = %d\n", state.latency);
+ printk("\t\tpower = %d\n", state.power);
+
+ guest_from_compat_handle(csd, state.dp);
+ printk("\t\tdp(@0x%p)\n", csd.p);
+
+ if ( csd.p != NULL )
+ {
+ copy_from_guest(&dp, csd, 1);
+ printk("\t\t\tdomain = %d\n", dp.domain);
+ printk("\t\t\tcoord_type = %d\n", dp.coord_type);
+ printk("\t\t\tnum = %d\n", dp.num);
+ }
+ }
+}
+#else
+#define print_cx_pminfo(c, p)
+#endif
+
+long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t xen_cx;
+ struct acpi_processor_power *acpi_power;
+ int cpu_id;
+
+ print_cx_pminfo(cpu, power);
+
+ /* map from acpi_id to cpu_id */
+ cpu_id = get_cpu_id((u8)cpu);
+ if ( cpu_id == -1 )
+ {
+ printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+ return -1;
+ }
+
+ acpi_power = &processor_powers[cpu_id];
+
+ init_cx_pminfo(acpi_power);
+
+ guest_from_compat_handle(states, power->states);
+
+ for ( int i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&xen_cx, states, i, 1);
+ set_cx(acpi_power, &xen_cx);
+ }
+
+ /* FIXME: C-state dependency is not supported by far */
+
+ /* initialize default policy */
+ acpi_processor_set_power_policy(acpi_power);
+
+ print_acpi_power(cpu_id, acpi_power);
+
+ if ( cpu_id == 0 && pm_idle_save == NULL )
+ {
+ pm_idle_save = pm_idle;
+ pm_idle = acpi_processor_idle;
+ }
+
+ return 0;
+}
diff -r 2ebb7f79e3bb xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Tue Apr 22 19:07:48 2008 +0100
+++ b/xen/arch/x86/domain.c Thu Apr 24 11:37:34 2008 +0800
@@ -56,6 +56,9 @@ DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(unsigned long, cr4);
+static void default_idle(void);
+void (*pm_idle) (void) = default_idle;
+
static void unmap_vcpu_info(struct vcpu *v);
static void paravirt_ctxt_switch_from(struct vcpu *v);
@@ -100,12 +103,17 @@ static void play_dead(void)
void idle_loop(void)
{
+ void (*idle) (void);
+
for ( ; ; )
{
if ( cpu_is_offline(smp_processor_id()) )
play_dead();
page_scrub_schedule_work();
- default_idle();
+ idle = pm_idle;
+ if ( idle == NULL )
+ idle = default_idle;
+ idle();
do_softirq();
}
}
diff -r 2ebb7f79e3bb xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Tue Apr 22 19:07:48 2008 +0100
+++ b/xen/arch/x86/platform_hypercall.c Thu Apr 24 11:37:34 2008 +0800
@@ -43,6 +43,8 @@ extern spinlock_t xenpf_lock;
#endif
static DEFINE_PER_CPU(uint64_t, freq);
+
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
static long cpu_frequency_change_helper(void *data)
{
@@ -340,6 +342,27 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
}
break;
+ case XENPF_set_processor_pminfo:
+ switch ( op->u.set_pminfo.type )
+ {
+ case XEN_PM_PX:
+ ret = -EINVAL;
+ break;
+
+ case XEN_PM_CX:
+ ret = set_cx_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.power);
+ break;
+
+ case XEN_PM_TX:
+ ret = -EINVAL;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r 2ebb7f79e3bb xen/arch/x86/x86_64/platform_hypercall.c
--- a/xen/arch/x86/x86_64/platform_hypercall.c Tue Apr 22 19:07:48 2008 +0100
+++ b/xen/arch/x86/x86_64/platform_hypercall.c Thu Apr 24 11:37:34 2008 +0800
@@ -11,12 +11,21 @@ DEFINE_XEN_GUEST_HANDLE(compat_platform_
#define xen_platform_op_t compat_platform_op_t
#define do_platform_op(x) compat_platform_op(_##x)
+DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t);
+DEFINE_XEN_GUEST_HANDLE(compat_processor_csd_t);
+#define xen_processor_power compat_processor_power
+#define xen_processor_power_t compat_processor_power_t
+#define xen_processor_cx_t compat_processor_cx_t
+#define xen_processor_csd_t compat_processor_csd_t
+#define set_cx_pminfo comapt_set_cx_pminfo
+
#define xenpf_enter_acpi_sleep compat_pf_enter_acpi_sleep
#define COMPAT
#define _XEN_GUEST_HANDLE(t) XEN_GUEST_HANDLE(t)
typedef int ret_t;
+#include "../acpi/cpu_idle.c"
#include "../platform_hypercall.c"
/*
diff -r 2ebb7f79e3bb xen/common/keyhandler.c
--- a/xen/common/keyhandler.c Tue Apr 22 19:07:48 2008 +0100
+++ b/xen/common/keyhandler.c Thu Apr 24 11:37:34 2008 +0800
@@ -275,6 +275,8 @@ extern void perfc_reset(unsigned char ke
extern void perfc_reset(unsigned char key);
#endif
+extern void dump_cx(unsigned char key);
+
static void do_debug_key(unsigned char key, struct cpu_user_regs *regs)
{
printk("'%c' pressed -> trapping into debugger\n", key);
@@ -306,6 +308,8 @@ void __init initialize_keytable(void)
register_keyhandler(
'P', perfc_reset, "reset performance counters");
#endif
+ register_keyhandler(
+ 'c', dump_cx, "dump cx structures");
register_irq_keyhandler('%', do_debug_key, "Trap to xendbg");
}
diff -r 2ebb7f79e3bb xen/include/public/platform.h
--- a/xen/include/public/platform.h Tue Apr 22 19:07:48 2008 +0100
+++ b/xen/include/public/platform.h Thu Apr 24 11:37:34 2008 +0800
@@ -199,6 +199,78 @@ typedef struct xenpf_getidletime xenpf_g
typedef struct xenpf_getidletime xenpf_getidletime_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+#define XENPF_set_processor_pminfo 54
+
+/* ability bits */
+#define XEN_PROCESSOR_PM_CX 1
+#define XEN_PROCESSOR_PM_PX 2
+#define XEN_PROCESSOR_PM_TX 4
+
+/* cmd type */
+#define XEN_PM_CX 0
+#define XEN_PM_PX 1
+#define XEN_PM_TX 2
+
+struct xen_power_register {
+ uint32_t space_id;
+ uint32_t bit_width;
+ uint32_t bit_offset;
+ uint32_t access_size;
+ uint64_t address;
+};
+
+struct xen_processor_csd {
+ uint32_t domain; /* domain number of one dependent group */
+ uint32_t coord_type; /* coordination type */
+ uint32_t num; /* number of processors in same domain */
+};
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
+
+struct xen_processor_cx {
+ struct xen_power_register reg; /* GAS for Cx trigger register */
+ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
+ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
+ uint32_t power; /* average power consumption(mW) */
+ uint32_t dpcnt; /* number of dependency entries */
+ XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
+};
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
+
+struct xen_processor_flags {
+ uint8_t bm_control:1;
+ uint8_t bm_check:1;
+ uint8_t has_cst:1;
+ uint8_t power_setup_done:1;
+ uint8_t bm_rld_set:1;
+};
+
+struct xen_processor_power {
+ uint32_t count; /* number of C state entries in array below */
+ struct xen_processor_flags flags; /* global flags of this processor */
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
+};
+
+struct xen_processor_performance {
+};
+
+struct xen_processor_throttling {
+};
+
+struct xenpf_set_processor_pminfo {
+ /* IN variables */
+ uint32_t id; /* ACPI CPU ID */
+ uint32_t type; /* {XEN_PM_CX, XEN_PM_PX, XEN_PM_TX} */
+ union {
+ struct xen_processor_power power;/* Cx: _CST/_CSD */
+ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */
+ struct xen_processor_throttling throt;/* Tx: _TPC/_PTC/_TSS/_TSD */
+ };
+};
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
+
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -213,6 +285,7 @@ struct xen_platform_op {
struct xenpf_enter_acpi_sleep enter_acpi_sleep;
struct xenpf_change_freq change_freq;
struct xenpf_getidletime getidletime;
+ struct xenpf_set_processor_pminfo set_pminfo;
uint8_t pad[128];
} u;
};
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-25 5:07 [PATCH 1/9] Add cpu idle pwr mgmt to xen Wei, Gang
@ 2008-04-25 13:00 ` Keir Fraser
2008-04-25 13:29 ` Wei, Gang
2008-04-26 9:55 ` Wei, Gang
0 siblings, 2 replies; 20+ messages in thread
From: Keir Fraser @ 2008-04-25 13:00 UTC (permalink / raw)
To: Wei, Gang, xen-devel
On 25/4/08 06:07, "Wei, Gang" <gang.wei@intel.com> wrote:
> Add basic acpi C-states based cpu idle power mgmt in xen for x86.
>
> It includes:
> 1. hypercall definition for passing ACPI info.
> 2. C1/C2 support.
> 3. Mwait support, as well as legacy ioport.
> 4. Ladder policy from Linux kernel.
>
> A lot of code & ideas came from Linux.
Comments:
1. In the idle loop you can just replace default_idle() with (*pm_idle)()
directly.
2. Do not modify common/keyhandler.c. Instead add an __initcall() in
cpu_idle.c to register your keyhandler. The initcall function and your
keyhandler can both be 'static' functions.
3. I don't like ifdef COMPAT all over new files. Define a separate compat
shim file, built only for x86_64, which converts compat structures to native
64-bit structures. Then cpu_idle.c need know nothing about compat issues and
will be cleaner for it.
4. Don't define placeholders for Px and Tx info in the platform hypercall
header. They should be introduced when they are actually implemented.
That's it. I haven't looked at the other patches yet, but you can probably
make the above fixes and resubmit just this one patch without affecting the
others. I'll look at them after this one goes in.
-- Keir
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-25 13:00 ` Keir Fraser
@ 2008-04-25 13:29 ` Wei, Gang
2008-04-26 9:55 ` Wei, Gang
1 sibling, 0 replies; 20+ messages in thread
From: Wei, Gang @ 2008-04-25 13:29 UTC (permalink / raw)
To: Keir Fraser, xen-devel
On Friday, April 25, 2008 9:01 PM, Keir Fraser wrote:
> On 25/4/08 06:07, "Wei, Gang" <gang.wei@intel.com> wrote:
>
>> Add basic acpi C-states based cpu idle power mgmt in xen for x86.
>
> Comments:
> 1. In the idle loop you can just replace default_idle() with
(*pm_idle)()
> directly.
> 2. Do not modify common/keyhandler.c. Instead add an __initcall() in
> cpu_idle.c to register your keyhandler. The initcall function and your
> keyhandler can both be 'static' functions.
> 3. I don't like ifdef COMPAT all over new files. Define a separate
compat
> shim file, built only for x86_64, which converts compat structures to
> native 64-bit structures. Then cpu_idle.c need know nothing about
compat
> issues and will be cleaner for it.
> 4. Don't define placeholders for Px and Tx info in the platform
hypercall
> header. They should be introduced when they are actually implemented.
>
> That's it. I haven't looked at the other patches yet, but you can
probably
> make the above fixes and resubmit just this one patch without
affecting
> the others. I'll look at them after this one goes in.
It is great idea to do it one after one. I will focus on above 4
comments for this patch and come back soon. Thanks for quick comments.
>
> -- Keir
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-25 13:00 ` Keir Fraser
2008-04-25 13:29 ` Wei, Gang
@ 2008-04-26 9:55 ` Wei, Gang
2008-04-28 9:24 ` Jan Beulich
1 sibling, 1 reply; 20+ messages in thread
From: Wei, Gang @ 2008-04-26 9:55 UTC (permalink / raw)
To: Keir Fraser, xen-devel
[-- Attachment #1: Type: text/plain, Size: 1438 bytes --]
Revised according to Keir's comments. Resubmit.
Jimmy
On Friday, April 25, 2008 9:01 PM, Keir Fraser wrote:
> On 25/4/08 06:07, "Wei, Gang" <gang.wei@intel.com> wrote:
>
>> Add basic acpi C-states based cpu idle power mgmt in xen for x86.
>>
>> It includes:
>> 1. hypercall definition for passing ACPI info.
>> 2. C1/C2 support.
>> 3. Mwait support, as well as legacy ioport.
>> 4. Ladder policy from Linux kernel.
>>
>> A lot of code & ideas came from Linux.
>
> Comments:
> 1. In the idle loop you can just replace default_idle() with
(*pm_idle)()
> directly.
> 2. Do not modify common/keyhandler.c. Instead add an __initcall() in
> cpu_idle.c to register your keyhandler. The initcall function and your
> keyhandler can both be 'static' functions.
> 3. I don't like ifdef COMPAT all over new files. Define a separate
compat
> shim file, built only for x86_64, which converts compat structures to
> native 64-bit structures. Then cpu_idle.c need know nothing about
compat
> issues and will be cleaner for it.
> 4. Don't define placeholders for Px and Tx info in the platform
hypercall
> header. They should be introduced when they are actually implemented.
>
> That's it. I haven't looked at the other patches yet, but you can
probably
> make the above fixes and resubmit just this one patch without
affecting
> the others. I'll look at them after this one goes in.
>
> -- Keir
[-- Attachment #2: xen-1-cx_base-0426.patch --]
[-- Type: application/octet-stream, Size: 31712 bytes --]
Add basic acpi C-states based cpu idle power mgmt in xen for x86.
It includes:
1. hypercall definition for passing ACPI info.
2. C1/C2 support.
3. Mwait support, as well as legacy ioport.
4. Ladder policy from Linux kernel.
A lot of code & ideas came from Linux.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 483d006cc607 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/acpi/Makefile Fri Apr 25 15:09:31 2008 +0800
@@ -1,2 +1,2 @@ obj-y += boot.o
obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
diff -r 483d006cc607 xen/arch/x86/acpi/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpu_idle.c Sat Apr 26 09:00:50 2008 +0800
@@ -0,0 +1,681 @@
+/*
+ * cpu_idle - xen idle state module derived from Linux
+ * drivers/acpi/processor_idle.c &
+ * arch/x86/kernel/acpi/cstate.c
+ *
+ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * - Added processor hotplug support
+ * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added support for C3 on SMP
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/smp.h>
+#include <asm/cache.h>
+#include <asm/io.h>
+#include <xen/guest_access.h>
+#include <public/platform.h>
+#include <asm/processor.h>
+#include <xen/keyhandler.h>
+
+#define DEBUG_PM_CX
+
+#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+
+#define ACPI_PROCESSOR_MAX_POWER 8
+#define ACPI_PROCESSOR_MAX_C2_LATENCY 100
+#define ACPI_PROCESSOR_MAX_C3_LATENCY 1000
+
+extern u32 pmtmr_ioport;
+extern void (*pm_idle) (void);
+
+static void (*pm_idle_save) (void) __read_mostly;
+unsigned int max_cstate __read_mostly = 2;
+integer_param("max_cstate", max_cstate);
+
+struct acpi_processor_cx;
+
+struct acpi_processor_cx_policy
+{
+ u32 count;
+ struct acpi_processor_cx *state;
+ struct
+ {
+ u32 time;
+ u32 ticks;
+ u32 count;
+ u32 bm;
+ } threshold;
+};
+
+struct acpi_processor_cx
+{
+ u8 valid;
+ u8 type;
+ u32 address;
+ u8 space_id;
+ u32 latency;
+ u32 latency_ticks;
+ u32 power;
+ u32 usage;
+ u64 time;
+ struct acpi_processor_cx_policy promotion;
+ struct acpi_processor_cx_policy demotion;
+};
+
+struct acpi_processor_power
+{
+ struct acpi_processor_cx *state;
+ u64 bm_check_timestamp;
+ u32 default_state;
+ u32 bm_activity;
+ u32 count;
+ struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+static struct acpi_processor_power processor_powers[NR_CPUS];
+
+static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
+{
+ printk("saved cpu%d cx acpi info:\n", cpu);
+ printk("\tcurrent state is C%d\n", (power->state)?power->state->type:-1);
+ printk("\tbm_check_timestamp = %"PRId64"\n", power->bm_check_timestamp);
+ printk("\tdefault_state = %d\n", power->default_state);
+ printk("\tbm_activity = 0x%08x\n", power->bm_activity);
+ printk("\tcount = %d\n", power->count);
+
+ for ( uint32_t i = 0; i < power->count; i++ )
+ {
+ printk("\tstates[%d]:\n", i);
+ printk("\t\tvalid = %d\n", power->states[i].valid);
+ printk("\t\ttype = %d\n", power->states[i].type);
+ printk("\t\taddress = 0x%x\n", power->states[i].address);
+ printk("\t\tspace_id = 0x%x\n", power->states[i].space_id);
+ printk("\t\tlatency = %d\n", power->states[i].latency);
+ printk("\t\tpower = %d\n", power->states[i].power);
+ printk("\t\tlatency_ticks = %d\n", power->states[i].latency_ticks);
+ printk("\t\tusage = %d\n", power->states[i].usage);
+ printk("\t\ttime = %"PRId64"\n", power->states[i].time);
+
+ printk("\t\tpromotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].promotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].promotion.state) ?
+ power->states[i].promotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].promotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].promotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].promotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].promotion.threshold.bm);
+
+ printk("\t\tdemotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].demotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].demotion.state) ?
+ power->states[i].demotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].demotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].demotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].demotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].demotion.threshold.bm);
+ }
+}
+
+static void dump_cx(unsigned char key)
+{
+ for( int i = 0; i < num_online_cpus(); i++ )
+ print_acpi_power(i, &processor_powers[i]);
+}
+
+static int __init cpu_idle_key_init(void)
+{
+ register_keyhandler(
+ 'c', dump_cx, "dump cx structures");
+ return 0;
+}
+__initcall(cpu_idle_key_init);
+
+static inline u32 ticks_elapsed(u32 t1, u32 t2)
+{
+ if ( t2 >= t1 )
+ return (t2 - t1);
+ else
+ return ((0xFFFFFFFF - t1) + t2);
+}
+
+static void acpi_processor_power_activate(struct acpi_processor_power *power,
+ struct acpi_processor_cx *new)
+{
+ struct acpi_processor_cx *old;
+
+ if ( !power || !new )
+ return;
+
+ old = power->state;
+
+ if ( old )
+ old->promotion.count = 0;
+ new->demotion.count = 0;
+
+ power->state = new;
+
+ return;
+}
+
+static void acpi_safe_halt(void)
+{
+ smp_mb__after_clear_bit();
+ safe_halt();
+}
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ __monitor((void *)current, 0, 0);
+ smp_mb();
+ __mwait(eax, ecx);
+}
+
+static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
+}
+
+static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+{
+ if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+ {
+ /* Call into architectural FFH based C-state */
+ acpi_processor_ffh_cstate_enter(cx);
+ }
+ else
+ {
+ int unused;
+ /* IO port based C-state */
+ inb(cx->address);
+ /* Dummy wait op - must do something useless after P_LVL2 read
+ because chipsets cannot guarantee that STPCLK# signal
+ gets asserted in time to freeze execution properly. */
+ unused = inl(pmtmr_ioport);
+ }
+}
+
+static void acpi_processor_idle(void)
+{
+ struct acpi_processor_power *power = NULL;
+ struct acpi_processor_cx *cx = NULL;
+ struct acpi_processor_cx *next_state = NULL;
+ int sleep_ticks = 0;
+ u32 t1, t2 = 0;
+
+ power = &processor_powers[smp_processor_id()];
+
+ /*
+ * Interrupts must be disabled during bus mastering calculations and
+ * for C2/C3 transitions.
+ */
+ local_irq_disable();
+ cx = power->state;
+ if ( !cx )
+ {
+ if ( pm_idle_save )
+ {
+ printk(XENLOG_DEBUG "call pm_idle_save()\n");
+ pm_idle_save();
+ }
+ else
+ {
+ printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
+ acpi_safe_halt();
+ }
+ return;
+ }
+
+ /*
+ * Sleep:
+ * ------
+ * Invoke the current Cx state to put the processor to sleep.
+ */
+ if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
+ smp_mb__after_clear_bit();
+
+ switch ( cx->type )
+ {
+ case ACPI_STATE_C1:
+ /*
+ * Invoke C1.
+ * Use the appropriate idle routine, the one that would
+ * be used without acpi C-states.
+ */
+ if ( pm_idle_save )
+ pm_idle_save();
+ else
+ acpi_safe_halt();
+
+ /*
+ * TBD: Can't get time duration while in C1, as resumes
+ * go to an ISR rather than here. Need to instrument
+ * base interrupt handler.
+ */
+ sleep_ticks = 0xFFFFFFFF;
+ break;
+
+ case ACPI_STATE_C2:
+ /* Get start time (ticks) */
+ t1 = inl(pmtmr_ioport);
+ /* Invoke C2 */
+ acpi_idle_do_entry(cx);
+ /* Get end time (ticks) */
+ t2 = inl(pmtmr_ioport);
+
+ /* Re-enable interrupts */
+ local_irq_enable();
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks =
+ ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+ break;
+ default:
+ local_irq_enable();
+ return;
+ }
+
+ cx->usage++;
+ if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+ cx->time += sleep_ticks;
+
+ next_state = power->state;
+
+ /*
+ * Promotion?
+ * ----------
+ * Track the number of longs (time asleep is greater than threshold)
+ * and promote when the count threshold is reached. Note that bus
+ * mastering activity may prevent promotions.
+ * Do not promote above max_cstate.
+ */
+ if ( cx->promotion.state &&
+ ((cx->promotion.state - power->states) <= max_cstate) )
+ {
+ if ( sleep_ticks > cx->promotion.threshold.ticks )
+ {
+ cx->promotion.count++;
+ cx->demotion.count = 0;
+ if ( cx->promotion.count >= cx->promotion.threshold.count )
+ {
+ next_state = cx->promotion.state;
+ goto end;
+ }
+ }
+ }
+
+ /*
+ * Demotion?
+ * ---------
+ * Track the number of shorts (time asleep is less than time threshold)
+ * and demote when the usage threshold is reached.
+ */
+ if ( cx->demotion.state )
+ {
+ if ( sleep_ticks < cx->demotion.threshold.ticks )
+ {
+ cx->demotion.count++;
+ cx->promotion.count = 0;
+ if ( cx->demotion.count >= cx->demotion.threshold.count )
+ {
+ next_state = cx->demotion.state;
+ goto end;
+ }
+ }
+ }
+
+end:
+ /*
+ * Demote if current state exceeds max_cstate
+ */
+ if ( (power->state - power->states) > max_cstate )
+ {
+ if ( cx->demotion.state )
+ next_state = cx->demotion.state;
+ }
+
+ /*
+ * New Cx State?
+ * -------------
+ * If we're going to start using a new Cx state we must clean up
+ * from the previous and prepare to use the new.
+ */
+ if ( next_state != power->state )
+ acpi_processor_power_activate(power, next_state);
+}
+
+static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
+{
+ unsigned int i;
+ unsigned int state_is_set = 0;
+ struct acpi_processor_cx *lower = NULL;
+ struct acpi_processor_cx *higher = NULL;
+ struct acpi_processor_cx *cx;
+
+ if ( !power )
+ return -EINVAL;
+
+ /*
+ * This function sets the default Cx state policy (OS idle handler).
+ * Our scheme is to promote quickly to C2 but more conservatively
+ * to C3. We're favoring C2 for its characteristics of low latency
+ * (quick response), good power savings, and ability to allow bus
+ * mastering activity. Note that the Cx state policy is completely
+ * customizable and can be altered dynamically.
+ */
+
+ /* startup state */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( !state_is_set )
+ power->state = cx;
+ state_is_set++;
+ break;
+ }
+
+ if ( !state_is_set )
+ return -ENODEV;
+
+ /* demotion */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( lower )
+ {
+ cx->demotion.state = lower;
+ cx->demotion.threshold.ticks = cx->latency_ticks;
+ cx->demotion.threshold.count = 1;
+ }
+
+ lower = cx;
+ }
+
+ /* promotion */
+ for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( higher )
+ {
+ cx->promotion.state = higher;
+ cx->promotion.threshold.ticks = cx->latency_ticks;
+ if ( cx->type >= ACPI_STATE_C2 )
+ cx->promotion.threshold.count = 4;
+ else
+ cx->promotion.threshold.count = 10;
+ }
+
+ higher = cx;
+ }
+
+ return 0;
+}
+
+static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
+{
+ memset(acpi_power, 0, sizeof(*acpi_power));
+
+ acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+
+ acpi_power->states[ACPI_STATE_C0].valid = 1;
+ acpi_power->states[ACPI_STATE_C1].valid = 1;
+
+ acpi_power->count = 2;
+
+ return 0;
+}
+
+#define CPUID_MWAIT_LEAF (5)
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
+#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_SUBSTATE_SIZE (4)
+
+static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
+{
+ struct cpuinfo_x86 *c = ¤t_cpu_data;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
+ if ( c->cpuid_level < CPUID_MWAIT_LEAF )
+ {
+ printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
+ return -1;
+ }
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ printk(XENLOG_DEBUG "cpuid.MWAIT[.eax=%x, .ebx=%x, .ecx=%x, .edx=%x]\n",
+ eax, ebx, ecx, edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
+ return -1;
+
+ /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+ if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
+ return -1;
+
+ printk(XENLOG_INFO "Monitor-Mwait will be used to enter C-%d state\n", cx->type);
+ return 0;
+}
+
+#define VENDOR_INTEL (1)
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static int check_cx(xen_processor_cx_t *cx)
+{
+ if ( cx == NULL )
+ return -1;
+
+ switch ( cx->reg.space_id )
+ {
+ case ACPI_ADR_SPACE_SYSTEM_IO:
+ if ( cx->reg.address == 0 )
+ return -1;
+ break;
+
+ case ACPI_ADR_SPACE_FIXED_HARDWARE:
+ if ( cx->type > ACPI_STATE_C1 )
+ {
+ if ( cx->reg.bit_width != VENDOR_INTEL ||
+ cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+ return -1;
+
+ /* assume all logical cpu has the same support for mwait */
+ if ( acpi_processor_ffh_cstate_probe(cx) )
+ return -1;
+ }
+ break;
+
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+static int set_cx(struct acpi_processor_power *acpi_power,
+ xen_processor_cx_t *xen_cx)
+{
+ struct acpi_processor_cx *cx;
+
+ /* skip unsupported acpi cstate */
+ if ( check_cx(xen_cx) )
+ return -1;
+
+ cx = &acpi_power->states[xen_cx->type];
+ if ( !cx->valid )
+ acpi_power->count++;
+
+ cx->valid = 1;
+ cx->type = xen_cx->type;
+ cx->address = xen_cx->reg.address;
+ cx->space_id = xen_cx->reg.space_id;
+ cx->latency = xen_cx->latency;
+ cx->power = xen_cx->power;
+
+ cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+
+ return 0;
+}
+
+static int get_cpu_id(u8 acpi_id)
+{
+ int i;
+ u8 apic_id;
+
+ apic_id = x86_acpiid_to_apicid[acpi_id];
+ if ( apic_id == 0xff )
+ return -1;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ if ( apic_id == x86_cpu_to_apicid[i] )
+ return i;
+ }
+
+ return -1;
+}
+
+#ifdef DEBUG_PM_CX
+static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t state;
+
+ XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
+ xen_processor_csd_t dp;
+
+ printk("cpu%d cx acpi info:\n", cpu);
+ printk("\tcount = %d\n", power->count);
+ printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
+ "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
+ power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
+ power->flags.power_setup_done, power->flags.bm_rld_set);
+
+ states = power->states;
+
+ for ( uint32_t i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&state, states, i, 1);
+
+ printk("\tstates[%d]:\n", i);
+ printk("\t\treg.space_id = 0x%x\n", state.reg.space_id);
+ printk("\t\treg.bit_width = 0x%x\n", state.reg.bit_width);
+ printk("\t\treg.bit_offset = 0x%x\n", state.reg.bit_offset);
+ printk("\t\treg.access_size = 0x%x\n", state.reg.access_size);
+ printk("\t\treg.address = 0x%"PRIx64"\n", state.reg.address);
+ printk("\t\ttype = %d\n", state.type);
+ printk("\t\tlatency = %d\n", state.latency);
+ printk("\t\tpower = %d\n", state.power);
+
+ csd = state.dp;
+ printk("\t\tdp(@0x%p)\n", csd.p);
+
+ if ( csd.p != NULL )
+ {
+ copy_from_guest(&dp, csd, 1);
+ printk("\t\t\tdomain = %d\n", dp.domain);
+ printk("\t\t\tcoord_type = %d\n", dp.coord_type);
+ printk("\t\t\tnum = %d\n", dp.num);
+ }
+ }
+}
+#else
+#define print_cx_pminfo(c, p)
+#endif
+
+long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t xen_cx;
+ struct acpi_processor_power *acpi_power;
+ int cpu_id;
+
+ print_cx_pminfo(cpu, power);
+
+ /* map from acpi_id to cpu_id */
+ cpu_id = get_cpu_id((u8)cpu);
+ if ( cpu_id == -1 )
+ {
+ printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+ return -1;
+ }
+
+ acpi_power = &processor_powers[cpu_id];
+
+ init_cx_pminfo(acpi_power);
+
+ states = power->states;
+
+ for ( int i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&xen_cx, states, i, 1);
+ set_cx(acpi_power, &xen_cx);
+ }
+
+ /* FIXME: C-state dependency is not supported by far */
+
+ /* initialize default policy */
+ acpi_processor_set_power_policy(acpi_power);
+
+ print_acpi_power(cpu_id, acpi_power);
+
+ if ( cpu_id == 0 && pm_idle_save == NULL )
+ {
+ pm_idle_save = pm_idle;
+ pm_idle = acpi_processor_idle;
+ }
+
+ return 0;
+}
diff -r 483d006cc607 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/domain.c Fri Apr 25 15:24:24 2008 +0800
@@ -56,6 +56,9 @@ DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(unsigned long, cr4);
+static void default_idle(void);
+void (*pm_idle) (void) = default_idle;
+
static void unmap_vcpu_info(struct vcpu *v);
static void paravirt_ctxt_switch_from(struct vcpu *v);
@@ -105,7 +108,7 @@ void idle_loop(void)
if ( cpu_is_offline(smp_processor_id()) )
play_dead();
page_scrub_schedule_work();
- default_idle();
+ (*pm_idle)();
do_softirq();
}
}
diff -r 483d006cc607 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/platform_hypercall.c Fri Apr 25 15:09:31 2008 +0800
@@ -44,6 +44,8 @@ extern spinlock_t xenpf_lock;
static DEFINE_PER_CPU(uint64_t, freq);
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
static long cpu_frequency_change_helper(void *data)
{
return cpu_frequency_change(this_cpu(freq));
@@ -340,6 +342,27 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
}
break;
+ case XENPF_set_processor_pminfo:
+ switch ( op->u.set_pminfo.type )
+ {
+ case XEN_PM_PX:
+ ret = -EINVAL;
+ break;
+
+ case XEN_PM_CX:
+ ret = set_cx_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.power);
+ break;
+
+ case XEN_PM_TX:
+ ret = -EINVAL;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r 483d006cc607 xen/arch/x86/x86_64/Makefile
--- a/xen/arch/x86/x86_64/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/Makefile Sat Apr 26 02:41:56 2008 +0800
@@ -12,6 +12,7 @@ obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += physdev.o
obj-$(CONFIG_COMPAT) += platform_hypercall.o
+obj-$(CONFIG_COMPAT) += cpu_idle.o
ifeq ($(CONFIG_COMPAT),y)
# extra dependencies
@@ -22,4 +23,5 @@ platform_hypercall.o: ../platform_hyperc
platform_hypercall.o: ../platform_hypercall.c
sysctl.o: ../sysctl.c
traps.o: compat/traps.c
+cpu_idle.o: ../acpi/cpu_idle.c
endif
diff -r 483d006cc607 xen/arch/x86/x86_64/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/x86_64/cpu_idle.c Sat Apr 26 09:23:01 2008 +0800
@@ -0,0 +1,133 @@
+/******************************************************************************
+ * cpu_idle.c -- adapt x86/acpi/cpu_idle.c to compat guest.
+ *
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#define __XEN_TOOLS__ /* for using get_xen_guest_handle macro */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/xmalloc.h>
+#include <xen/guest_access.h>
+#include <compat/platform.h>
+
+DEFINE_XEN_GUEST_HANDLE(compat_processor_csd_t);
+DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t);
+
+static void copy_from_compat_dp(xen_processor_csd_t *xen_dp,
+ compat_processor_csd_t *dp)
+{
+ xen_dp->domain = dp->domain;
+ xen_dp->coord_type = dp->coord_type;
+ xen_dp->num = dp->num;
+}
+
+static int copy_from_compat_state(xen_processor_cx_t *xen_state,
+ compat_processor_cx_t *state)
+{
+ int i;
+ xen_processor_csd_t *xen_dps = NULL;
+ XEN_GUEST_HANDLE(compat_processor_csd_t) dps;
+ compat_processor_csd_t dp;
+
+ xen_state->reg.space_id = state->reg.space_id;
+ xen_state->reg.bit_width = state->reg.bit_width;
+ xen_state->reg.bit_offset = state->reg.bit_offset;
+ xen_state->reg.access_size = state->reg.access_size;
+ xen_state->reg.address = state->reg.address;
+ xen_state->type = state->type;
+ xen_state->latency = state->latency;
+ xen_state->power = state->power;
+ xen_state->dpcnt = state->dpcnt;
+
+ if ( state->dpcnt > 0 )
+ {
+ xen_dps = xmalloc_array(xen_processor_csd_t, state->dpcnt);
+ if ( xen_dps == NULL )
+ return -1;
+
+ guest_from_compat_handle(dps, state->dp);
+ for ( i = 0; i < state->dpcnt; i++ )
+ {
+ copy_from_guest_offset(&dp, dps, i, 1);
+ copy_from_compat_dp(&xen_dps[i], &dp);
+ }
+ }
+
+ set_xen_guest_handle(xen_state->dp, xen_dps);
+ return 0;
+}
+
+static void clean_state(xen_processor_cx_t *state)
+{
+ xen_processor_csd_t *dps;
+ get_xen_guest_handle(dps, state->dp);
+ if ( dps )
+ xfree(dps);
+}
+
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
+long compat_set_cx_pminfo(uint32_t cpu, struct compat_processor_power *power)
+{
+ int i;
+ long ret = -1;
+ xen_processor_cx_t *xen_states = NULL;
+ XEN_GUEST_HANDLE(compat_processor_cx_t) states;
+ compat_processor_cx_t state;
+ struct xen_processor_power xen_power =
+ {
+ .count = power->count,
+ .flags.bm_control = power->flags.bm_control,
+ .flags.bm_check = power->flags.bm_check,
+ .flags.has_cst = power->flags.has_cst,
+ .flags.power_setup_done = power->flags.power_setup_done,
+ .flags.bm_rld_set = power->flags.bm_rld_set,
+ };
+
+ if ( power->count > 0 )
+ {
+ xen_states = xmalloc_array(xen_processor_cx_t, power->count);
+ if ( xen_states == NULL )
+ return -1;
+
+ guest_from_compat_handle(states, power->states);
+ for ( i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&state, states, i, 1);
+ if ( (ret = copy_from_compat_state(&xen_states[i], &state)) )
+ goto clean_end;
+ }
+ }
+
+ set_xen_guest_handle(xen_power.states, xen_states);
+ ret = set_cx_pminfo(cpu, &xen_power);
+
+ clean_end:
+ if ( xen_states != NULL )
+ {
+ for ( i = 0; i < xen_power.count; i++ )
+ clean_state(&xen_states[i]);
+ xfree(xen_states);
+ }
+ return ret;
+}
diff -r 483d006cc607 xen/arch/x86/x86_64/platform_hypercall.c
--- a/xen/arch/x86/x86_64/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/platform_hypercall.c Sat Apr 26 02:44:34 2008 +0800
@@ -10,6 +10,10 @@ DEFINE_XEN_GUEST_HANDLE(compat_platform_
#define xen_platform_op compat_platform_op
#define xen_platform_op_t compat_platform_op_t
#define do_platform_op(x) compat_platform_op(_##x)
+
+#define xen_processor_power compat_processor_power
+#define xen_processor_power_t compat_processor_power_t
+#define set_cx_pminfo compat_set_cx_pminfo
#define xenpf_enter_acpi_sleep compat_pf_enter_acpi_sleep
diff -r 483d006cc607 xen/include/public/platform.h
--- a/xen/include/public/platform.h Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/public/platform.h Fri Apr 25 15:50:23 2008 +0800
@@ -199,6 +199,70 @@ typedef struct xenpf_getidletime xenpf_g
typedef struct xenpf_getidletime xenpf_getidletime_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+#define XENPF_set_processor_pminfo 54
+
+/* ability bits */
+#define XEN_PROCESSOR_PM_CX 1
+#define XEN_PROCESSOR_PM_PX 2
+#define XEN_PROCESSOR_PM_TX 4
+
+/* cmd type */
+#define XEN_PM_CX 0
+#define XEN_PM_PX 1
+#define XEN_PM_TX 2
+
+struct xen_power_register {
+ uint32_t space_id;
+ uint32_t bit_width;
+ uint32_t bit_offset;
+ uint32_t access_size;
+ uint64_t address;
+};
+
+struct xen_processor_csd {
+ uint32_t domain; /* domain number of one dependent group */
+ uint32_t coord_type; /* coordination type */
+ uint32_t num; /* number of processors in same domain */
+};
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
+
+struct xen_processor_cx {
+ struct xen_power_register reg; /* GAS for Cx trigger register */
+ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
+ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
+ uint32_t power; /* average power consumption(mW) */
+ uint32_t dpcnt; /* number of dependency entries */
+ XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
+};
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
+
+struct xen_processor_flags {
+ uint8_t bm_control:1;
+ uint8_t bm_check:1;
+ uint8_t has_cst:1;
+ uint8_t power_setup_done:1;
+ uint8_t bm_rld_set:1;
+};
+
+struct xen_processor_power {
+ uint32_t count; /* number of C state entries in array below */
+ struct xen_processor_flags flags; /* global flags of this processor */
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
+};
+
+struct xenpf_set_processor_pminfo {
+ /* IN variables */
+ uint32_t id; /* ACPI CPU ID */
+ uint32_t type; /* {XEN_PM_CX, ...} */
+ union {
+ struct xen_processor_power power;/* Cx: _CST/_CSD */
+ };
+};
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
+
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -213,6 +277,7 @@ struct xen_platform_op {
struct xenpf_enter_acpi_sleep enter_acpi_sleep;
struct xenpf_change_freq change_freq;
struct xenpf_getidletime getidletime;
+ struct xenpf_set_processor_pminfo set_pminfo;
uint8_t pad[128];
} u;
};
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-26 9:55 ` Wei, Gang
@ 2008-04-28 9:24 ` Jan Beulich
2008-04-30 3:27 ` Wei, Gang
0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2008-04-28 9:24 UTC (permalink / raw)
To: Gang Wei; +Cc: xen-devel
>>> "Wei, Gang" <gang.wei@intel.com> 26.04.08 11:55 >>>
>Revised according to Keir's comments. Resubmit.
Some comments regarding the compat guest handling: You cannot
validly set_xen_guest_handle() on space coming from xmalloc. It is
the purpose of the per-vCPU argument translation page to deal with
that (i.e. the translated arguments go into that page, subject to your
own management of how you assign space for the individual (sub-)
hypercall's arguments).
Further, you shouldn't manually copy fields, this should be done
through the machine generated macros in xen/include/compat/xlat.h,
which would require you to add the structures needing translation to
xen/include/xlat.lst.
Jan
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-28 9:24 ` Jan Beulich
@ 2008-04-30 3:27 ` Wei, Gang
2008-04-30 7:22 ` Jan Beulich
0 siblings, 1 reply; 20+ messages in thread
From: Wei, Gang @ 2008-04-30 3:27 UTC (permalink / raw)
To: xen-devel; +Cc: Jan Beulich
[-- Attachment #1: Type: text/plain, Size: 742 bytes --]
Revising done according to Jan's comments. Resend.
Jimmy
On Monday, April 28, 2008 5:25 PM, Jan Beulich wrote:
> Some comments regarding the compat guest handling: You cannot
> validly set_xen_guest_handle() on space coming from xmalloc. It is
> the purpose of the per-vCPU argument translation page to deal with
> that (i.e. the translated arguments go into that page, subject to your
> own management of how you assign space for the individual (sub-)
> hypercall's arguments).
>
> Further, you shouldn't manually copy fields, this should be done
> through the machine generated macros in xen/include/compat/xlat.h,
> which would require you to add the structures needing translation to
> xen/include/xlat.lst.
>
> Jan
[-- Attachment #2: xen-1-cx_base-0430.patch --]
[-- Type: application/octet-stream, Size: 31975 bytes --]
Add basic acpi C-states based cpu idle power mgmt in xen for x86.
It includes:
1. hypercall definition for passing ACPI info.
2. C1/C2 support.
3. Mwait support, as well as legacy ioport.
4. Ladder policy from Linux kernel.
A lot of code & ideas came from Linux.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 483d006cc607 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/acpi/Makefile Wed Apr 30 11:20:05 2008 +0800
@@ -1,2 +1,2 @@ obj-y += boot.o
obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
diff -r 483d006cc607 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/domain.c Wed Apr 30 11:20:05 2008 +0800
@@ -56,6 +56,9 @@ DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(unsigned long, cr4);
+static void default_idle(void);
+void (*pm_idle) (void) = default_idle;
+
static void unmap_vcpu_info(struct vcpu *v);
static void paravirt_ctxt_switch_from(struct vcpu *v);
@@ -105,7 +108,7 @@ void idle_loop(void)
if ( cpu_is_offline(smp_processor_id()) )
play_dead();
page_scrub_schedule_work();
- default_idle();
+ (*pm_idle)();
do_softirq();
}
}
diff -r 483d006cc607 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/platform_hypercall.c Wed Apr 30 11:20:05 2008 +0800
@@ -44,6 +44,8 @@ extern spinlock_t xenpf_lock;
static DEFINE_PER_CPU(uint64_t, freq);
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
static long cpu_frequency_change_helper(void *data)
{
return cpu_frequency_change(this_cpu(freq));
@@ -340,6 +342,27 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
}
break;
+ case XENPF_set_processor_pminfo:
+ switch ( op->u.set_pminfo.type )
+ {
+ case XEN_PM_PX:
+ ret = -EINVAL;
+ break;
+
+ case XEN_PM_CX:
+ ret = set_cx_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.power);
+ break;
+
+ case XEN_PM_TX:
+ ret = -EINVAL;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r 483d006cc607 xen/arch/x86/x86_64/Makefile
--- a/xen/arch/x86/x86_64/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/Makefile Wed Apr 30 11:20:05 2008 +0800
@@ -12,6 +12,7 @@ obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += physdev.o
obj-$(CONFIG_COMPAT) += platform_hypercall.o
+obj-$(CONFIG_COMPAT) += cpu_idle.o
ifeq ($(CONFIG_COMPAT),y)
# extra dependencies
@@ -22,4 +23,5 @@ platform_hypercall.o: ../platform_hyperc
platform_hypercall.o: ../platform_hypercall.c
sysctl.o: ../sysctl.c
traps.o: compat/traps.c
+cpu_idle.o: ../acpi/cpu_idle.c
endif
diff -r 483d006cc607 xen/arch/x86/x86_64/platform_hypercall.c
--- a/xen/arch/x86/x86_64/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/platform_hypercall.c Wed Apr 30 11:20:05 2008 +0800
@@ -10,6 +10,10 @@ DEFINE_XEN_GUEST_HANDLE(compat_platform_
#define xen_platform_op compat_platform_op
#define xen_platform_op_t compat_platform_op_t
#define do_platform_op(x) compat_platform_op(_##x)
+
+#define xen_processor_power compat_processor_power
+#define xen_processor_power_t compat_processor_power_t
+#define set_cx_pminfo compat_set_cx_pminfo
#define xenpf_enter_acpi_sleep compat_pf_enter_acpi_sleep
diff -r 483d006cc607 xen/include/public/platform.h
--- a/xen/include/public/platform.h Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/public/platform.h Wed Apr 30 11:20:05 2008 +0800
@@ -199,6 +199,70 @@ typedef struct xenpf_getidletime xenpf_g
typedef struct xenpf_getidletime xenpf_getidletime_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+#define XENPF_set_processor_pminfo 54
+
+/* ability bits */
+#define XEN_PROCESSOR_PM_CX 1
+#define XEN_PROCESSOR_PM_PX 2
+#define XEN_PROCESSOR_PM_TX 4
+
+/* cmd type */
+#define XEN_PM_CX 0
+#define XEN_PM_PX 1
+#define XEN_PM_TX 2
+
+struct xen_power_register {
+ uint32_t space_id;
+ uint32_t bit_width;
+ uint32_t bit_offset;
+ uint32_t access_size;
+ uint64_t address;
+};
+
+struct xen_processor_csd {
+ uint32_t domain; /* domain number of one dependent group */
+ uint32_t coord_type; /* coordination type */
+ uint32_t num; /* number of processors in same domain */
+};
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
+
+struct xen_processor_cx {
+ struct xen_power_register reg; /* GAS for Cx trigger register */
+ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
+ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
+ uint32_t power; /* average power consumption(mW) */
+ uint32_t dpcnt; /* number of dependency entries */
+ XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
+};
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
+
+struct xen_processor_flags {
+ uint8_t bm_control:1;
+ uint8_t bm_check:1;
+ uint8_t has_cst:1;
+ uint8_t power_setup_done:1;
+ uint8_t bm_rld_set:1;
+};
+
+struct xen_processor_power {
+ uint32_t count; /* number of C state entries in array below */
+ struct xen_processor_flags flags; /* global flags of this processor */
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
+};
+
+struct xenpf_set_processor_pminfo {
+ /* IN variables */
+ uint32_t id; /* ACPI CPU ID */
+ uint32_t type; /* {XEN_PM_CX, ...} */
+ union {
+ struct xen_processor_power power;/* Cx: _CST/_CSD */
+ };
+};
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
+
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -213,6 +277,7 @@ struct xen_platform_op {
struct xenpf_enter_acpi_sleep enter_acpi_sleep;
struct xenpf_change_freq change_freq;
struct xenpf_getidletime getidletime;
+ struct xenpf_set_processor_pminfo set_pminfo;
uint8_t pad[128];
} u;
};
diff -r 483d006cc607 xen/include/xlat.lst
--- a/xen/include/xlat.lst Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/xlat.lst Wed Apr 30 11:20:05 2008 +0800
@@ -44,3 +44,8 @@
! vcpu_runstate_info vcpu.h
? xenoprof_init xenoprof.h
? xenoprof_passive xenoprof.h
+! power_register platform.h
+! processor_csd platform.h
+! processor_cx platform.h
+! processor_flags platform.h
+! processor_power platform.h
diff -r 483d006cc607 xen/arch/x86/acpi/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpu_idle.c Wed Apr 30 11:28:59 2008 +0800
@@ -0,0 +1,681 @@
+/*
+ * cpu_idle - xen idle state module derived from Linux
+ * drivers/acpi/processor_idle.c &
+ * arch/x86/kernel/acpi/cstate.c
+ *
+ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * - Added processor hotplug support
+ * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added support for C3 on SMP
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/smp.h>
+#include <asm/cache.h>
+#include <asm/io.h>
+#include <xen/guest_access.h>
+#include <public/platform.h>
+#include <asm/processor.h>
+#include <xen/keyhandler.h>
+
+#define DEBUG_PM_CX
+
+#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+
+#define ACPI_PROCESSOR_MAX_POWER 8
+#define ACPI_PROCESSOR_MAX_C2_LATENCY 100
+#define ACPI_PROCESSOR_MAX_C3_LATENCY 1000
+
+extern u32 pmtmr_ioport;
+extern void (*pm_idle) (void);
+
+static void (*pm_idle_save) (void) __read_mostly;
+unsigned int max_cstate __read_mostly = 2;
+integer_param("max_cstate", max_cstate);
+
+struct acpi_processor_cx;
+
+struct acpi_processor_cx_policy
+{
+ u32 count;
+ struct acpi_processor_cx *state;
+ struct
+ {
+ u32 time;
+ u32 ticks;
+ u32 count;
+ u32 bm;
+ } threshold;
+};
+
+struct acpi_processor_cx
+{
+ u8 valid;
+ u8 type;
+ u32 address;
+ u8 space_id;
+ u32 latency;
+ u32 latency_ticks;
+ u32 power;
+ u32 usage;
+ u64 time;
+ struct acpi_processor_cx_policy promotion;
+ struct acpi_processor_cx_policy demotion;
+};
+
+struct acpi_processor_power
+{
+ struct acpi_processor_cx *state;
+ u64 bm_check_timestamp;
+ u32 default_state;
+ u32 bm_activity;
+ u32 count;
+ struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+static struct acpi_processor_power processor_powers[NR_CPUS];
+
+static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
+{
+ printk("saved cpu%d cx acpi info:\n", cpu);
+ printk("\tcurrent state is C%d\n", (power->state)?power->state->type:-1);
+ printk("\tbm_check_timestamp = %"PRId64"\n", power->bm_check_timestamp);
+ printk("\tdefault_state = %d\n", power->default_state);
+ printk("\tbm_activity = 0x%08x\n", power->bm_activity);
+ printk("\tcount = %d\n", power->count);
+
+ for ( uint32_t i = 0; i < power->count; i++ )
+ {
+ printk("\tstates[%d]:\n", i);
+ printk("\t\tvalid = %d\n", power->states[i].valid);
+ printk("\t\ttype = %d\n", power->states[i].type);
+ printk("\t\taddress = 0x%x\n", power->states[i].address);
+ printk("\t\tspace_id = 0x%x\n", power->states[i].space_id);
+ printk("\t\tlatency = %d\n", power->states[i].latency);
+ printk("\t\tpower = %d\n", power->states[i].power);
+ printk("\t\tlatency_ticks = %d\n", power->states[i].latency_ticks);
+ printk("\t\tusage = %d\n", power->states[i].usage);
+ printk("\t\ttime = %"PRId64"\n", power->states[i].time);
+
+ printk("\t\tpromotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].promotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].promotion.state) ?
+ power->states[i].promotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].promotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].promotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].promotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].promotion.threshold.bm);
+
+ printk("\t\tdemotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].demotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].demotion.state) ?
+ power->states[i].demotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].demotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].demotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].demotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].demotion.threshold.bm);
+ }
+}
+
+static void dump_cx(unsigned char key)
+{
+ for( int i = 0; i < num_online_cpus(); i++ )
+ print_acpi_power(i, &processor_powers[i]);
+}
+
+static int __init cpu_idle_key_init(void)
+{
+ register_keyhandler(
+ 'c', dump_cx, "dump cx structures");
+ return 0;
+}
+__initcall(cpu_idle_key_init);
+
+static inline u32 ticks_elapsed(u32 t1, u32 t2)
+{
+ if ( t2 >= t1 )
+ return (t2 - t1);
+ else
+ return ((0xFFFFFFFF - t1) + t2);
+}
+
+static void acpi_processor_power_activate(struct acpi_processor_power *power,
+ struct acpi_processor_cx *new)
+{
+ struct acpi_processor_cx *old;
+
+ if ( !power || !new )
+ return;
+
+ old = power->state;
+
+ if ( old )
+ old->promotion.count = 0;
+ new->demotion.count = 0;
+
+ power->state = new;
+
+ return;
+}
+
+static void acpi_safe_halt(void)
+{
+ smp_mb__after_clear_bit();
+ safe_halt();
+}
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ __monitor((void *)current, 0, 0);
+ smp_mb();
+ __mwait(eax, ecx);
+}
+
+static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
+}
+
+static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+{
+ if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+ {
+ /* Call into architectural FFH based C-state */
+ acpi_processor_ffh_cstate_enter(cx);
+ }
+ else
+ {
+ int unused;
+ /* IO port based C-state */
+ inb(cx->address);
+ /* Dummy wait op - must do something useless after P_LVL2 read
+ because chipsets cannot guarantee that STPCLK# signal
+ gets asserted in time to freeze execution properly. */
+ unused = inl(pmtmr_ioport);
+ }
+}
+
+static void acpi_processor_idle(void)
+{
+ struct acpi_processor_power *power = NULL;
+ struct acpi_processor_cx *cx = NULL;
+ struct acpi_processor_cx *next_state = NULL;
+ int sleep_ticks = 0;
+ u32 t1, t2 = 0;
+
+ power = &processor_powers[smp_processor_id()];
+
+ /*
+ * Interrupts must be disabled during bus mastering calculations and
+ * for C2/C3 transitions.
+ */
+ local_irq_disable();
+ cx = power->state;
+ if ( !cx )
+ {
+ if ( pm_idle_save )
+ {
+ printk(XENLOG_DEBUG "call pm_idle_save()\n");
+ pm_idle_save();
+ }
+ else
+ {
+ printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
+ acpi_safe_halt();
+ }
+ return;
+ }
+
+ /*
+ * Sleep:
+ * ------
+ * Invoke the current Cx state to put the processor to sleep.
+ */
+ if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
+ smp_mb__after_clear_bit();
+
+ switch ( cx->type )
+ {
+ case ACPI_STATE_C1:
+ /*
+ * Invoke C1.
+ * Use the appropriate idle routine, the one that would
+ * be used without acpi C-states.
+ */
+ if ( pm_idle_save )
+ pm_idle_save();
+ else
+ acpi_safe_halt();
+
+ /*
+ * TBD: Can't get time duration while in C1, as resumes
+ * go to an ISR rather than here. Need to instrument
+ * base interrupt handler.
+ */
+ sleep_ticks = 0xFFFFFFFF;
+ break;
+
+ case ACPI_STATE_C2:
+ /* Get start time (ticks) */
+ t1 = inl(pmtmr_ioport);
+ /* Invoke C2 */
+ acpi_idle_do_entry(cx);
+ /* Get end time (ticks) */
+ t2 = inl(pmtmr_ioport);
+
+ /* Re-enable interrupts */
+ local_irq_enable();
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks =
+ ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+ break;
+ default:
+ local_irq_enable();
+ return;
+ }
+
+ cx->usage++;
+ if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+ cx->time += sleep_ticks;
+
+ next_state = power->state;
+
+ /*
+ * Promotion?
+ * ----------
+ * Track the number of longs (time asleep is greater than threshold)
+ * and promote when the count threshold is reached. Note that bus
+ * mastering activity may prevent promotions.
+ * Do not promote above max_cstate.
+ */
+ if ( cx->promotion.state &&
+ ((cx->promotion.state - power->states) <= max_cstate) )
+ {
+ if ( sleep_ticks > cx->promotion.threshold.ticks )
+ {
+ cx->promotion.count++;
+ cx->demotion.count = 0;
+ if ( cx->promotion.count >= cx->promotion.threshold.count )
+ {
+ next_state = cx->promotion.state;
+ goto end;
+ }
+ }
+ }
+
+ /*
+ * Demotion?
+ * ---------
+ * Track the number of shorts (time asleep is less than time threshold)
+ * and demote when the usage threshold is reached.
+ */
+ if ( cx->demotion.state )
+ {
+ if ( sleep_ticks < cx->demotion.threshold.ticks )
+ {
+ cx->demotion.count++;
+ cx->promotion.count = 0;
+ if ( cx->demotion.count >= cx->demotion.threshold.count )
+ {
+ next_state = cx->demotion.state;
+ goto end;
+ }
+ }
+ }
+
+end:
+ /*
+ * Demote if current state exceeds max_cstate
+ */
+ if ( (power->state - power->states) > max_cstate )
+ {
+ if ( cx->demotion.state )
+ next_state = cx->demotion.state;
+ }
+
+ /*
+ * New Cx State?
+ * -------------
+ * If we're going to start using a new Cx state we must clean up
+ * from the previous and prepare to use the new.
+ */
+ if ( next_state != power->state )
+ acpi_processor_power_activate(power, next_state);
+}
+
+static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
+{
+ unsigned int i;
+ unsigned int state_is_set = 0;
+ struct acpi_processor_cx *lower = NULL;
+ struct acpi_processor_cx *higher = NULL;
+ struct acpi_processor_cx *cx;
+
+ if ( !power )
+ return -EINVAL;
+
+ /*
+ * This function sets the default Cx state policy (OS idle handler).
+ * Our scheme is to promote quickly to C2 but more conservatively
+ * to C3. We're favoring C2 for its characteristics of low latency
+ * (quick response), good power savings, and ability to allow bus
+ * mastering activity. Note that the Cx state policy is completely
+ * customizable and can be altered dynamically.
+ */
+
+ /* startup state */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( !state_is_set )
+ power->state = cx;
+ state_is_set++;
+ break;
+ }
+
+ if ( !state_is_set )
+ return -ENODEV;
+
+ /* demotion */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( lower )
+ {
+ cx->demotion.state = lower;
+ cx->demotion.threshold.ticks = cx->latency_ticks;
+ cx->demotion.threshold.count = 1;
+ }
+
+ lower = cx;
+ }
+
+ /* promotion */
+ for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( higher )
+ {
+ cx->promotion.state = higher;
+ cx->promotion.threshold.ticks = cx->latency_ticks;
+ if ( cx->type >= ACPI_STATE_C2 )
+ cx->promotion.threshold.count = 4;
+ else
+ cx->promotion.threshold.count = 10;
+ }
+
+ higher = cx;
+ }
+
+ return 0;
+}
+
+static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
+{
+ memset(acpi_power, 0, sizeof(*acpi_power));
+
+ acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+
+ acpi_power->states[ACPI_STATE_C0].valid = 1;
+ acpi_power->states[ACPI_STATE_C1].valid = 1;
+
+ acpi_power->count = 2;
+
+ return 0;
+}
+
+#define CPUID_MWAIT_LEAF (5)
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
+#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_SUBSTATE_SIZE (4)
+
+static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
+{
+ struct cpuinfo_x86 *c = ¤t_cpu_data;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
+ if ( c->cpuid_level < CPUID_MWAIT_LEAF )
+ {
+ printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
+ return -EFAULT;
+ }
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ printk(XENLOG_DEBUG "cpuid.MWAIT[.eax=%x, .ebx=%x, .ecx=%x, .edx=%x]\n",
+ eax, ebx, ecx, edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
+ return -EFAULT;
+
+ /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+ if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
+ return -EFAULT;
+
+ printk(XENLOG_INFO "Monitor-Mwait will be used to enter C-%d state\n", cx->type);
+ return 0;
+}
+
+#define VENDOR_INTEL (1)
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static int check_cx(xen_processor_cx_t *cx)
+{
+ if ( cx == NULL )
+ return -EINVAL;
+
+ switch ( cx->reg.space_id )
+ {
+ case ACPI_ADR_SPACE_SYSTEM_IO:
+ if ( cx->reg.address == 0 )
+ return -EINVAL;
+ break;
+
+ case ACPI_ADR_SPACE_FIXED_HARDWARE:
+ if ( cx->type > ACPI_STATE_C1 )
+ {
+ if ( cx->reg.bit_width != VENDOR_INTEL ||
+ cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+ return -EINVAL;
+
+ /* assume all logical cpu has the same support for mwait */
+ if ( acpi_processor_ffh_cstate_probe(cx) )
+ return -EFAULT;
+ }
+ break;
+
+ default:
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static int set_cx(struct acpi_processor_power *acpi_power,
+ xen_processor_cx_t *xen_cx)
+{
+ struct acpi_processor_cx *cx;
+
+ /* skip unsupported acpi cstate */
+ if ( check_cx(xen_cx) )
+ return -EFAULT;
+
+ cx = &acpi_power->states[xen_cx->type];
+ if ( !cx->valid )
+ acpi_power->count++;
+
+ cx->valid = 1;
+ cx->type = xen_cx->type;
+ cx->address = xen_cx->reg.address;
+ cx->space_id = xen_cx->reg.space_id;
+ cx->latency = xen_cx->latency;
+ cx->power = xen_cx->power;
+
+ cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+
+ return 0;
+}
+
+static int get_cpu_id(u8 acpi_id)
+{
+ int i;
+ u8 apic_id;
+
+ apic_id = x86_acpiid_to_apicid[acpi_id];
+ if ( apic_id == 0xff )
+ return -1;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ if ( apic_id == x86_cpu_to_apicid[i] )
+ return i;
+ }
+
+ return -1;
+}
+
+#ifdef DEBUG_PM_CX
+static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t state;
+
+ XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
+ xen_processor_csd_t dp;
+
+ printk("cpu%d cx acpi info:\n", cpu);
+ printk("\tcount = %d\n", power->count);
+ printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
+ "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
+ power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
+ power->flags.power_setup_done, power->flags.bm_rld_set);
+
+ states = power->states;
+
+ for ( uint32_t i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&state, states, i, 1);
+
+ printk("\tstates[%d]:\n", i);
+ printk("\t\treg.space_id = 0x%x\n", state.reg.space_id);
+ printk("\t\treg.bit_width = 0x%x\n", state.reg.bit_width);
+ printk("\t\treg.bit_offset = 0x%x\n", state.reg.bit_offset);
+ printk("\t\treg.access_size = 0x%x\n", state.reg.access_size);
+ printk("\t\treg.address = 0x%"PRIx64"\n", state.reg.address);
+ printk("\t\ttype = %d\n", state.type);
+ printk("\t\tlatency = %d\n", state.latency);
+ printk("\t\tpower = %d\n", state.power);
+
+ csd = state.dp;
+ printk("\t\tdp(@0x%p)\n", csd.p);
+
+ if ( csd.p != NULL )
+ {
+ copy_from_guest(&dp, csd, 1);
+ printk("\t\t\tdomain = %d\n", dp.domain);
+ printk("\t\t\tcoord_type = %d\n", dp.coord_type);
+ printk("\t\t\tnum = %d\n", dp.num);
+ }
+ }
+}
+#else
+#define print_cx_pminfo(c, p)
+#endif
+
+long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t xen_cx;
+ struct acpi_processor_power *acpi_power;
+ int cpu_id;
+
+ print_cx_pminfo(cpu, power);
+
+ /* map from acpi_id to cpu_id */
+ cpu_id = get_cpu_id((u8)cpu);
+ if ( cpu_id == -1 )
+ {
+ printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+ return -EFAULT;
+ }
+
+ acpi_power = &processor_powers[cpu_id];
+
+ init_cx_pminfo(acpi_power);
+
+ states = power->states;
+
+ for ( int i = 0; i < power->count; i++ )
+ {
+ copy_from_guest_offset(&xen_cx, states, i, 1);
+ set_cx(acpi_power, &xen_cx);
+ }
+
+ /* FIXME: C-state dependency is not supported by far */
+
+ /* initialize default policy */
+ acpi_processor_set_power_policy(acpi_power);
+
+ print_acpi_power(cpu_id, acpi_power);
+
+ if ( cpu_id == 0 && pm_idle_save == NULL )
+ {
+ pm_idle_save = pm_idle;
+ pm_idle = acpi_processor_idle;
+ }
+
+ return 0;
+}
diff -r 483d006cc607 xen/arch/x86/x86_64/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/x86_64/cpu_idle.c Wed Apr 30 11:21:50 2008 +0800
@@ -0,0 +1,136 @@
+/******************************************************************************
+ * cpu_idle.c -- adapt x86/acpi/cpu_idle.c to compat guest.
+ *
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#define __XEN_TOOLS__ /* for using get_xen_guest_handle macro */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/xmalloc.h>
+#include <xen/guest_access.h>
+#include <compat/platform.h>
+
+DEFINE_XEN_GUEST_HANDLE(compat_processor_csd_t);
+DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t);
+
+static unsigned long xlat_page_current;
+#define xlat_page_start COMPAT_ARG_XLAT_VIRT_START(current->vcpu_id)
+#define xlat_page_size COMPAT_ARG_XLAT_SIZE
+#define xlat_page_left_size \
+ (xlat_page_start + xlat_page_size - xlat_page_current)
+
+#define xlat_malloc_init() do { \
+ xlat_page_current = xlat_page_start; \
+} while (0)
+
+static void *xlat_malloc(size_t size)
+{
+ void *ret;
+
+ /* normalize size to be 64 * n */
+ size = (size + 0x3fUL) & ~0x3fUL;
+
+ if ( size > xlat_page_left_size )
+ return NULL;
+
+ ret = (void *) xlat_page_current;
+ xlat_page_current += size;
+
+ return ret;
+}
+
+#define xlat_malloc_array(_t, _c) ((_t *) xlat_malloc(sizeof(_t) * _c))
+
+static int copy_from_compat_state(xen_processor_cx_t *xen_state,
+ compat_processor_cx_t *state)
+{
+#define XLAT_processor_cx_HNDL_dp(_d_, _s_) do { \
+ xen_processor_csd_t *xen_dps = NULL; \
+\
+ if ( (_s_)->dpcnt > 0 ) \
+ { \
+ XEN_GUEST_HANDLE(compat_processor_csd_t) dps; \
+ compat_processor_csd_t dp; \
+\
+ xen_dps = xlat_malloc_array(xen_processor_csd_t, (_s_)->dpcnt); \
+ if ( xen_dps == NULL ) \
+ return -EFAULT; \
+\
+ guest_from_compat_handle(dps, (_s_)->dp); \
+ for ( int i = 0; i < (_s_)->dpcnt; i++ ) \
+ { \
+ copy_from_guest_offset(&dp, dps, i, 1); \
+ XLAT_processor_csd(&xen_dps[i], &dp); \
+ } \
+ } \
+\
+ set_xen_guest_handle((_d_)->dp, xen_dps); \
+} while (0)
+
+ XLAT_processor_cx(xen_state, state);
+ return 0;
+}
+
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
+long compat_set_cx_pminfo(uint32_t cpu, struct compat_processor_power *power)
+{
+ long ret = -EFAULT;
+ struct xen_processor_power *xen_power;
+
+ xlat_malloc_init();
+
+ xen_power = xlat_malloc_array(struct xen_processor_power, 1);
+ if ( xen_power == NULL )
+ return -EFAULT;
+
+#define XLAT_processor_power_HNDL_states(_d_, _s_) do { \
+ xen_processor_cx_t *xen_states = NULL; \
+\
+ if ( (_s_)->count > 0 ) \
+ { \
+ XEN_GUEST_HANDLE(compat_processor_cx_t) states; \
+ compat_processor_cx_t state; \
+\
+ xen_states = xlat_malloc_array(xen_processor_cx_t, (_s_)->count); \
+ if ( xen_states == NULL ) \
+ return -EFAULT; \
+\
+ guest_from_compat_handle(states, (_s_)->states); \
+ for ( int i = 0; i < _s_->count; i++ ) \
+ { \
+ copy_from_guest_offset(&state, states, i, 1); \
+ if ( (ret = copy_from_compat_state(&xen_states[i], &state)) ) \
+ goto clean_end; \
+ } \
+ } \
+\
+ set_xen_guest_handle((_d_)->states, xen_states); \
+} while (0)
+
+ XLAT_processor_power(xen_power, power);
+ ret = set_cx_pminfo(cpu, xen_power);
+
+ clean_end:
+ return ret;
+}
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 3:27 ` Wei, Gang
@ 2008-04-30 7:22 ` Jan Beulich
2008-04-30 8:54 ` Keir Fraser
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Jan Beulich @ 2008-04-30 7:22 UTC (permalink / raw)
To: Gang Wei; +Cc: xen-devel
>>> "Wei, Gang" <gang.wei@intel.com> 30.04.08 05:27 >>>
>Revising done according to Jan's comments. Resend.
Thanks. Unfortunately you now use a static (but not per-CPU) variable -
while I understand that it is expected that the call is done just once, I
don't think this is a good thing to do.
Further, xen_processor_csd_t seems to not need translation, so you
could simply add a check for the type to xen/include/xlat.lst and copy
the handle rather than what it points to. This would reduce size
constraints on the xlat area and also simplify the code.
As another suggestion - could you use uint32_t for the bitfield
declarations, making it more obvious that the remaining bits in the
32-bit quantity are reserved? Alternatively, could you use an
explicit padding field after the flags member of struct
xen_processor_power?
Also, I think there's error checking missing on copy_from_guest*
throughout the patch. And I think I saw non-C89 constructs (loop
variables declared inside for() statements).
Jan
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 7:22 ` Jan Beulich
@ 2008-04-30 8:54 ` Keir Fraser
2008-04-30 9:08 ` Wei, Gang
2008-04-30 9:12 ` Tian, Kevin
2008-04-30 16:36 ` Wei, Gang
2008-05-01 0:48 ` Wei, Gang
2 siblings, 2 replies; 20+ messages in thread
From: Keir Fraser @ 2008-04-30 8:54 UTC (permalink / raw)
To: Jan Beulich, Gang Wei; +Cc: xen-devel
On 30/4/08 08:22, "Jan Beulich" <jbeulich@novell.com> wrote:
>>>> "Wei, Gang" <gang.wei@intel.com> 30.04.08 05:27 >>>
>> Revising done according to Jan's comments. Resend.
>
> Thanks. Unfortunately you now use a static (but not per-CPU) variable -
> while I understand that it is expected that the call is done just once, I
> don't think this is a good thing to do.
Why is the variable even non-local? Is it just to make the xlat_malloc*()
interfaces simpler? It's a false simplification if so, and I think you'd be
better making the variable an explicit parameter to those functions.
Also I agree with Jan regarding non-ISO C usage of loop-header variable
declarations (don't do it) and also you should check copy_from_guest*()
return values and return -EFAULT where appropriate. His comment regarding
explicit padding or use of uint32_t in your public bitfield also sounds good
to me.
-- Keir
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 8:54 ` Keir Fraser
@ 2008-04-30 9:08 ` Wei, Gang
2008-04-30 9:12 ` Tian, Kevin
1 sibling, 0 replies; 20+ messages in thread
From: Wei, Gang @ 2008-04-30 9:08 UTC (permalink / raw)
To: Keir Fraser; +Cc: xen-devel, Jan Beulich
On Wednesday, April 30, 2008 4:54 PM, Keir Fraser wrote:
>> Thanks. Unfortunately you now use a static (but not per-CPU) variable
-
>> while I understand that it is expected that the call is done just
once, I
>> don't think this is a good thing to do.
>
> Why is the variable even non-local? Is it just to make the
xlat_malloc*()
> interfaces simpler? It's a false simplification if so, and I think
you'd
> be better making the variable an explicit parameter to those
functions.
I was trying to make thing simple, and not aware the per_cpu issue for
global variable. Your suggestion sounds good, I will try to follow it.
>
> Also I agree with Jan regarding non-ISO C usage of loop-header
variable
> declarations (don't do it) and also you should check
copy_from_guest*()
> return values and return -EFAULT where appropriate. His comment
regarding
> explicit padding or use of uint32_t in your public bitfield also
sounds
> good to me.
Actually, I also agree will Jan regarding the other comments. I am
revising patch for them.
Jimmy
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 8:54 ` Keir Fraser
2008-04-30 9:08 ` Wei, Gang
@ 2008-04-30 9:12 ` Tian, Kevin
2008-04-30 9:18 ` Tian, Kevin
2008-04-30 9:35 ` Jan Beulich
1 sibling, 2 replies; 20+ messages in thread
From: Tian, Kevin @ 2008-04-30 9:12 UTC (permalink / raw)
To: Keir Fraser, Jan Beulich, Wei, Gang; +Cc: xen-devel
One thing kicking me just now is, whether Linux address check
style can be used here by temporarily increasing address limit
in compat logic to bypass relative check in common code? I
didn't see obvious benefit to reserve a guest virtual addr range
and let each component to manage internal allocation themselves.
Linux style seems simpler and compat logic can just use xmalloc
to create native copy to reduce xlat complexity.
Thanks,
Kevin
>From: Keir Fraser
>Sent: 2008年4月30日 16:54
>
>On 30/4/08 08:22, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>>>>> "Wei, Gang" <gang.wei@intel.com> 30.04.08 05:27 >>>
>>> Revising done according to Jan's comments. Resend.
>>
>> Thanks. Unfortunately you now use a static (but not per-CPU)
>variable -
>> while I understand that it is expected that the call is done
>just once, I
>> don't think this is a good thing to do.
>
>Why is the variable even non-local? Is it just to make the
>xlat_malloc*()
>interfaces simpler? It's a false simplification if so, and I
>think you'd be
>better making the variable an explicit parameter to those functions.
>
>Also I agree with Jan regarding non-ISO C usage of loop-header variable
>declarations (don't do it) and also you should check copy_from_guest*()
>return values and return -EFAULT where appropriate. His
>comment regarding
>explicit padding or use of uint32_t in your public bitfield
>also sounds good
>to me.
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 9:12 ` Tian, Kevin
@ 2008-04-30 9:18 ` Tian, Kevin
2008-04-30 9:35 ` Jan Beulich
1 sibling, 0 replies; 20+ messages in thread
From: Tian, Kevin @ 2008-04-30 9:18 UTC (permalink / raw)
To: Tian, Kevin, Keir Fraser, Jan Beulich, Wei, Gang; +Cc: xen-devel
>From: Tian, Kevin
>Sent: 2008年4月30日 17:13
>To: Keir Fraser; Jan Beulich; Wei, Gang
>Cc: xen-devel@lists.xensource.com
>Subject: RE: [Xen-devel] [PATCH 1/9] Add cpu idle pwr mgmt to xen
>
>One thing kicking me just now is, whether Linux address check
>style can be used here by temporarily increasing address limit
>in compat logic to bypass relative check in common code? I
>didn't see obvious benefit to reserve a guest virtual addr range
>and let each component to manage internal allocation themselves.
>Linux style seems simpler and compat logic can just use xmalloc
I really meant local variable OR xmalloc here...
>to create native copy to reduce xlat complexity.
>
>Thanks,
>Kevin
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 9:12 ` Tian, Kevin
2008-04-30 9:18 ` Tian, Kevin
@ 2008-04-30 9:35 ` Jan Beulich
2008-04-30 9:42 ` Tian, Kevin
1 sibling, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2008-04-30 9:35 UTC (permalink / raw)
To: Kevin Tian; +Cc: xen-devel, Keir Fraser, Gang Wei
>>> "Tian, Kevin" <kevin.tian@intel.com> 30.04.08 11:12 >>>
>One thing kicking me just now is, whether Linux address check
>style can be used here by temporarily increasing address limit
>in compat logic to bypass relative check in common code? I
>didn't see obvious benefit to reserve a guest virtual addr range
>and let each component to manage internal allocation themselves.
>Linux style seems simpler and compat logic can just use xmalloc
>to create native copy to reduce xlat complexity.
I intentionally did not go that route when I first wrote these translation
routines. For one, you wouldn't be able to partly copy things (as I
suggested as an improvement here), since the validity checks would
apply to all or nothing during an individual hypercall (and a bad 64-bit
field representing a pointer might then slip through). Secondly, the
static pre-allocation used currently also avoids spurious failures of
hypercalls (there may be deterministic failures if the combined set
of indirect hypercall arguments exceeds the pre-allocation size.
Jan
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 9:35 ` Jan Beulich
@ 2008-04-30 9:42 ` Tian, Kevin
2008-04-30 10:00 ` Keir Fraser
2008-04-30 10:25 ` Jan Beulich
0 siblings, 2 replies; 20+ messages in thread
From: Tian, Kevin @ 2008-04-30 9:42 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser, Wei, Gang
>From: Jan Beulich [mailto:jbeulich@novell.com]
>Sent: 2008年4月30日 17:35
>
>>>> "Tian, Kevin" <kevin.tian@intel.com> 30.04.08 11:12 >>>
>>One thing kicking me just now is, whether Linux address check
>>style can be used here by temporarily increasing address limit
>>in compat logic to bypass relative check in common code? I
>>didn't see obvious benefit to reserve a guest virtual addr range
>>and let each component to manage internal allocation themselves.
>>Linux style seems simpler and compat logic can just use xmalloc
>>to create native copy to reduce xlat complexity.
>
>I intentionally did not go that route when I first wrote these
>translation
>routines. For one, you wouldn't be able to partly copy things (as I
>suggested as an improvement here), since the validity checks would
>apply to all or nothing during an individual hypercall (and a
>bad 64-bit
>field representing a pointer might then slip through). Secondly, the
What do you mean by partly copying things? For a 32-on-64 guest,
all pointers from guest are 32-bit and compat_handler_okay already
ensures compat pointers validity. Only native structure may have
64-bit pointer field, which is checked by common guest_handle_okay
if from a 64bit guest, or is trusted by increasing addr limitation if
from compat layer...
>static pre-allocation used currently also avoids spurious failures of
>hypercalls (there may be deterministic failures if the combined set
>of indirect hypercall arguments exceeds the pre-allocation size.
That's also the limitation of current approach by pre-defined size, which
is not scalable if 2nd level pointer are variable decided by some count
field.
Thanks,
Kevin
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 9:42 ` Tian, Kevin
@ 2008-04-30 10:00 ` Keir Fraser
2008-04-30 10:25 ` Jan Beulich
2008-04-30 10:25 ` Jan Beulich
1 sibling, 1 reply; 20+ messages in thread
From: Keir Fraser @ 2008-04-30 10:00 UTC (permalink / raw)
To: Tian, Kevin, Jan Beulich; +Cc: xen-devel, Wei, Gang
On 30/4/08 10:42, "Tian, Kevin" <kevin.tian@intel.com> wrote:
> What do you mean by partly copying things? For a 32-on-64 guest,
> all pointers from guest are 32-bit and compat_handler_okay already
> ensures compat pointers validity. Only native structure may have
> 64-bit pointer field, which is checked by common guest_handle_okay
> if from a 64bit guest, or is trusted by increasing addr limitation if
> from compat layer...
Yes, I don't think we do partial copying anywhere right now. If we did, we
could apply guest_handle_okay() checks explicitly before removing the
addr-space limitation.
>> static pre-allocation used currently also avoids spurious failures of
>> hypercalls (there may be deterministic failures if the combined set
>> of indirect hypercall arguments exceeds the pre-allocation size.
>
> That's also the limitation of current approach by pre-defined size, which
> is not scalable if 2nd level pointer are variable decided by some count
> field.
Also the approaches are not mutually exclusive. We can still have a per-vcpu
pre-alloc'ed page for most hypercalls, and allow dynamic allocation for
hypercalls which require more space and which then have to tolerate ENOMEM
failure. The pre-alloc'ed pages would no longer require to be mapped in a
special place.
On the other hand, I don't think we have any hypercall right now where 4kB
is likely to be too little space, and where the hypercall cannot be
sub-divided into smaller chunks by the compat shim.
*But* having a way to flag that arguments have been copied would also be
useful for HVM compat shims too. We already have such a flag
(guest_handles_in_xen_space) there, so we would increase commonality. This
probably means we will go down this route for PV guests too when we merge
some of the compat shim mechanisms for PV and HVM guests.
-- Keir
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 9:42 ` Tian, Kevin
2008-04-30 10:00 ` Keir Fraser
@ 2008-04-30 10:25 ` Jan Beulich
2008-05-05 6:34 ` Tian, Kevin
1 sibling, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2008-04-30 10:25 UTC (permalink / raw)
To: Kevin Tian; +Cc: xen-devel, Keir Fraser, Gang Wei
>>> "Tian, Kevin" <kevin.tian@intel.com> 30.04.08 11:42 >>>
>What do you mean by partly copying things? For a 32-on-64 guest,
>all pointers from guest are 32-bit and compat_handler_okay already
>ensures compat pointers validity. Only native structure may have
>64-bit pointer field, which is checked by common guest_handle_okay
>if from a 64bit guest, or is trusted by increasing addr limitation if
>from compat layer...
VCPUOP_register_runstate_memory_area is an example of this.
Jan
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 10:00 ` Keir Fraser
@ 2008-04-30 10:25 ` Jan Beulich
2008-04-30 12:27 ` Keir Fraser
0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2008-04-30 10:25 UTC (permalink / raw)
To: Keir Fraser, Kevin Tian; +Cc: xen-devel, Gang Wei
>>> Keir Fraser <keir.fraser@eu.citrix.com> 30.04.08 12:00 >>>
>Yes, I don't think we do partial copying anywhere right now. If we did, we
>could apply guest_handle_okay() checks explicitly before removing the
>addr-space limitation.
XENMEM_set_memory_map is one example where we do (valid here
because the E820 layout is identical for 32- and 64-bits).
>On the other hand, I don't think we have any hypercall right now where 4kB
>is likely to be too little space, and where the hypercall cannot be
>sub-divided into smaller chunks by the compat shim.
XENMEM_exchange and GNTTABOP_setup_table are examples for this.
Jan
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 10:25 ` Jan Beulich
@ 2008-04-30 12:27 ` Keir Fraser
0 siblings, 0 replies; 20+ messages in thread
From: Keir Fraser @ 2008-04-30 12:27 UTC (permalink / raw)
To: Jan Beulich, Kevin Tian; +Cc: xen-devel, Gang Wei
Okay, it's fair to say that any changes we make need careful checking of all
existing compat shims then. :-) This is all a bit orthogonal to the issues
in the Cx patchset.
-- Keir
On 30/4/08 11:25, "Jan Beulich" <jbeulich@novell.com> wrote:
>>>> Keir Fraser <keir.fraser@eu.citrix.com> 30.04.08 12:00 >>>
>> Yes, I don't think we do partial copying anywhere right now. If we did, we
>> could apply guest_handle_okay() checks explicitly before removing the
>> addr-space limitation.
>
> XENMEM_set_memory_map is one example where we do (valid here
> because the E820 layout is identical for 32- and 64-bits).
>
>> On the other hand, I don't think we have any hypercall right now where 4kB
>> is likely to be too little space, and where the hypercall cannot be
>> sub-divided into smaller chunks by the compat shim.
>
> XENMEM_exchange and GNTTABOP_setup_table are examples for this.
>
> Jan
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 7:22 ` Jan Beulich
2008-04-30 8:54 ` Keir Fraser
@ 2008-04-30 16:36 ` Wei, Gang
2008-05-01 0:48 ` Wei, Gang
2 siblings, 0 replies; 20+ messages in thread
From: Wei, Gang @ 2008-04-30 16:36 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 1077 bytes --]
Revised according to below comments. Resend.
Jimmy
On Wednesday, April 30, 2008 3:22 PM, Jan Beulich wrote:
> Thanks. Unfortunately you now use a static (but not per-CPU) variable
-
> while I understand that it is expected that the call is done just
once, I
> don't think this is a good thing to do.
>
> Further, xen_processor_csd_t seems to not need translation, so you
> could simply add a check for the type to xen/include/xlat.lst and copy
> the handle rather than what it points to. This would reduce size
> constraints on the xlat area and also simplify the code.
>
> As another suggestion - could you use uint32_t for the bitfield
> declarations, making it more obvious that the remaining bits in the
> 32-bit quantity are reserved? Alternatively, could you use an
> explicit padding field after the flags member of struct
> xen_processor_power?
>
> Also, I think there's error checking missing on copy_from_guest*
> throughout the patch. And I think I saw non-C89 constructs (loop
> variables declared inside for() statements).
>
> Jan
[-- Attachment #2: xen-1-cx_base-0501.patch --]
[-- Type: application/octet-stream, Size: 32335 bytes --]
Add basic acpi C-states based cpu idle power mgmt in xen for x86.
It includes:
1. hypercall definition for passing ACPI info.
2. C1/C2 support.
3. Mwait support, as well as legacy ioport.
4. Ladder policy from Linux kernel.
A lot of code & ideas came from Linux.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 483d006cc607 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/acpi/Makefile Thu May 01 00:16:37 2008 +0800
@@ -1,2 +1,2 @@ obj-y += boot.o
obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
diff -r 483d006cc607 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/domain.c Thu May 01 00:16:37 2008 +0800
@@ -56,6 +56,9 @@ DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(unsigned long, cr4);
+static void default_idle(void);
+void (*pm_idle) (void) = default_idle;
+
static void unmap_vcpu_info(struct vcpu *v);
static void paravirt_ctxt_switch_from(struct vcpu *v);
@@ -105,7 +108,7 @@ void idle_loop(void)
if ( cpu_is_offline(smp_processor_id()) )
play_dead();
page_scrub_schedule_work();
- default_idle();
+ (*pm_idle)();
do_softirq();
}
}
diff -r 483d006cc607 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/platform_hypercall.c Thu May 01 00:16:37 2008 +0800
@@ -44,6 +44,8 @@ extern spinlock_t xenpf_lock;
static DEFINE_PER_CPU(uint64_t, freq);
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
static long cpu_frequency_change_helper(void *data)
{
return cpu_frequency_change(this_cpu(freq));
@@ -340,6 +342,27 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
}
break;
+ case XENPF_set_processor_pminfo:
+ switch ( op->u.set_pminfo.type )
+ {
+ case XEN_PM_PX:
+ ret = -EINVAL;
+ break;
+
+ case XEN_PM_CX:
+ ret = set_cx_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.power);
+ break;
+
+ case XEN_PM_TX:
+ ret = -EINVAL;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r 483d006cc607 xen/arch/x86/x86_64/Makefile
--- a/xen/arch/x86/x86_64/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/Makefile Thu May 01 00:16:37 2008 +0800
@@ -12,6 +12,7 @@ obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += physdev.o
obj-$(CONFIG_COMPAT) += platform_hypercall.o
+obj-$(CONFIG_COMPAT) += cpu_idle.o
ifeq ($(CONFIG_COMPAT),y)
# extra dependencies
@@ -22,4 +23,5 @@ platform_hypercall.o: ../platform_hyperc
platform_hypercall.o: ../platform_hypercall.c
sysctl.o: ../sysctl.c
traps.o: compat/traps.c
+cpu_idle.o: ../acpi/cpu_idle.c
endif
diff -r 483d006cc607 xen/arch/x86/x86_64/platform_hypercall.c
--- a/xen/arch/x86/x86_64/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/platform_hypercall.c Thu May 01 00:16:37 2008 +0800
@@ -10,6 +10,10 @@ DEFINE_XEN_GUEST_HANDLE(compat_platform_
#define xen_platform_op compat_platform_op
#define xen_platform_op_t compat_platform_op_t
#define do_platform_op(x) compat_platform_op(_##x)
+
+#define xen_processor_power compat_processor_power
+#define xen_processor_power_t compat_processor_power_t
+#define set_cx_pminfo compat_set_cx_pminfo
#define xenpf_enter_acpi_sleep compat_pf_enter_acpi_sleep
diff -r 483d006cc607 xen/include/public/platform.h
--- a/xen/include/public/platform.h Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/public/platform.h Thu May 01 00:16:37 2008 +0800
@@ -199,6 +199,70 @@ typedef struct xenpf_getidletime xenpf_g
typedef struct xenpf_getidletime xenpf_getidletime_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+#define XENPF_set_processor_pminfo 54
+
+/* ability bits */
+#define XEN_PROCESSOR_PM_CX 1
+#define XEN_PROCESSOR_PM_PX 2
+#define XEN_PROCESSOR_PM_TX 4
+
+/* cmd type */
+#define XEN_PM_CX 0
+#define XEN_PM_PX 1
+#define XEN_PM_TX 2
+
+struct xen_power_register {
+ uint32_t space_id;
+ uint32_t bit_width;
+ uint32_t bit_offset;
+ uint32_t access_size;
+ uint64_t address;
+};
+
+struct xen_processor_csd {
+ uint32_t domain; /* domain number of one dependent group */
+ uint32_t coord_type; /* coordination type */
+ uint32_t num; /* number of processors in same domain */
+};
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
+
+struct xen_processor_cx {
+ struct xen_power_register reg; /* GAS for Cx trigger register */
+ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
+ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
+ uint32_t power; /* average power consumption(mW) */
+ uint32_t dpcnt; /* number of dependency entries */
+ XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
+};
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
+
+struct xen_processor_flags {
+ uint32_t bm_control:1;
+ uint32_t bm_check:1;
+ uint32_t has_cst:1;
+ uint32_t power_setup_done:1;
+ uint32_t bm_rld_set:1;
+};
+
+struct xen_processor_power {
+ uint32_t count; /* number of C state entries in array below */
+ struct xen_processor_flags flags; /* global flags of this processor */
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
+};
+
+struct xenpf_set_processor_pminfo {
+ /* IN variables */
+ uint32_t id; /* ACPI CPU ID */
+ uint32_t type; /* {XEN_PM_CX, ...} */
+ union {
+ struct xen_processor_power power;/* Cx: _CST/_CSD */
+ };
+};
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
+
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -213,6 +277,7 @@ struct xen_platform_op {
struct xenpf_enter_acpi_sleep enter_acpi_sleep;
struct xenpf_change_freq change_freq;
struct xenpf_getidletime getidletime;
+ struct xenpf_set_processor_pminfo set_pminfo;
uint8_t pad[128];
} u;
};
diff -r 483d006cc607 xen/include/xlat.lst
--- a/xen/include/xlat.lst Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/xlat.lst Thu May 01 00:16:37 2008 +0800
@@ -44,3 +44,8 @@
! vcpu_runstate_info vcpu.h
? xenoprof_init xenoprof.h
? xenoprof_passive xenoprof.h
+! power_register platform.h
+? processor_csd platform.h
+! processor_cx platform.h
+! processor_flags platform.h
+! processor_power platform.h
diff -r 483d006cc607 xen/arch/x86/acpi/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpu_idle.c Thu May 01 00:33:08 2008 +0800
@@ -0,0 +1,690 @@
+/*
+ * cpu_idle - xen idle state module derived from Linux
+ * drivers/acpi/processor_idle.c &
+ * arch/x86/kernel/acpi/cstate.c
+ *
+ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * - Added processor hotplug support
+ * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added support for C3 on SMP
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/smp.h>
+#include <asm/cache.h>
+#include <asm/io.h>
+#include <xen/guest_access.h>
+#include <public/platform.h>
+#include <asm/processor.h>
+#include <xen/keyhandler.h>
+
+#define DEBUG_PM_CX
+
+#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+
+#define ACPI_PROCESSOR_MAX_POWER 8
+#define ACPI_PROCESSOR_MAX_C2_LATENCY 100
+#define ACPI_PROCESSOR_MAX_C3_LATENCY 1000
+
+extern u32 pmtmr_ioport;
+extern void (*pm_idle) (void);
+
+static void (*pm_idle_save) (void) __read_mostly;
+unsigned int max_cstate __read_mostly = 2;
+integer_param("max_cstate", max_cstate);
+
+struct acpi_processor_cx;
+
+struct acpi_processor_cx_policy
+{
+ u32 count;
+ struct acpi_processor_cx *state;
+ struct
+ {
+ u32 time;
+ u32 ticks;
+ u32 count;
+ u32 bm;
+ } threshold;
+};
+
+struct acpi_processor_cx
+{
+ u8 valid;
+ u8 type;
+ u32 address;
+ u8 space_id;
+ u32 latency;
+ u32 latency_ticks;
+ u32 power;
+ u32 usage;
+ u64 time;
+ struct acpi_processor_cx_policy promotion;
+ struct acpi_processor_cx_policy demotion;
+};
+
+struct acpi_processor_power
+{
+ struct acpi_processor_cx *state;
+ u64 bm_check_timestamp;
+ u32 default_state;
+ u32 bm_activity;
+ u32 count;
+ struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+static struct acpi_processor_power processor_powers[NR_CPUS];
+
+static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
+{
+ uint32_t i;
+
+ printk("saved cpu%d cx acpi info:\n", cpu);
+ printk("\tcurrent state is C%d\n", (power->state)?power->state->type:-1);
+ printk("\tbm_check_timestamp = %"PRId64"\n", power->bm_check_timestamp);
+ printk("\tdefault_state = %d\n", power->default_state);
+ printk("\tbm_activity = 0x%08x\n", power->bm_activity);
+ printk("\tcount = %d\n", power->count);
+
+ for ( i = 0; i < power->count; i++ )
+ {
+ printk("\tstates[%d]:\n", i);
+ printk("\t\tvalid = %d\n", power->states[i].valid);
+ printk("\t\ttype = %d\n", power->states[i].type);
+ printk("\t\taddress = 0x%x\n", power->states[i].address);
+ printk("\t\tspace_id = 0x%x\n", power->states[i].space_id);
+ printk("\t\tlatency = %d\n", power->states[i].latency);
+ printk("\t\tpower = %d\n", power->states[i].power);
+ printk("\t\tlatency_ticks = %d\n", power->states[i].latency_ticks);
+ printk("\t\tusage = %d\n", power->states[i].usage);
+ printk("\t\ttime = %"PRId64"\n", power->states[i].time);
+
+ printk("\t\tpromotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].promotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].promotion.state) ?
+ power->states[i].promotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].promotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].promotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].promotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].promotion.threshold.bm);
+
+ printk("\t\tdemotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].demotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].demotion.state) ?
+ power->states[i].demotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].demotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].demotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].demotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].demotion.threshold.bm);
+ }
+}
+
+static void dump_cx(unsigned char key)
+{
+ for( int i = 0; i < num_online_cpus(); i++ )
+ print_acpi_power(i, &processor_powers[i]);
+}
+
+static int __init cpu_idle_key_init(void)
+{
+ register_keyhandler(
+ 'c', dump_cx, "dump cx structures");
+ return 0;
+}
+__initcall(cpu_idle_key_init);
+
+static inline u32 ticks_elapsed(u32 t1, u32 t2)
+{
+ if ( t2 >= t1 )
+ return (t2 - t1);
+ else
+ return ((0xFFFFFFFF - t1) + t2);
+}
+
+static void acpi_processor_power_activate(struct acpi_processor_power *power,
+ struct acpi_processor_cx *new)
+{
+ struct acpi_processor_cx *old;
+
+ if ( !power || !new )
+ return;
+
+ old = power->state;
+
+ if ( old )
+ old->promotion.count = 0;
+ new->demotion.count = 0;
+
+ power->state = new;
+
+ return;
+}
+
+static void acpi_safe_halt(void)
+{
+ smp_mb__after_clear_bit();
+ safe_halt();
+}
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ __monitor((void *)current, 0, 0);
+ smp_mb();
+ __mwait(eax, ecx);
+}
+
+static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
+}
+
+static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+{
+ if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+ {
+ /* Call into architectural FFH based C-state */
+ acpi_processor_ffh_cstate_enter(cx);
+ }
+ else
+ {
+ int unused;
+ /* IO port based C-state */
+ inb(cx->address);
+ /* Dummy wait op - must do something useless after P_LVL2 read
+ because chipsets cannot guarantee that STPCLK# signal
+ gets asserted in time to freeze execution properly. */
+ unused = inl(pmtmr_ioport);
+ }
+}
+
+static void acpi_processor_idle(void)
+{
+ struct acpi_processor_power *power = NULL;
+ struct acpi_processor_cx *cx = NULL;
+ struct acpi_processor_cx *next_state = NULL;
+ int sleep_ticks = 0;
+ u32 t1, t2 = 0;
+
+ power = &processor_powers[smp_processor_id()];
+
+ /*
+ * Interrupts must be disabled during bus mastering calculations and
+ * for C2/C3 transitions.
+ */
+ local_irq_disable();
+ cx = power->state;
+ if ( !cx )
+ {
+ if ( pm_idle_save )
+ {
+ printk(XENLOG_DEBUG "call pm_idle_save()\n");
+ pm_idle_save();
+ }
+ else
+ {
+ printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
+ acpi_safe_halt();
+ }
+ return;
+ }
+
+ /*
+ * Sleep:
+ * ------
+ * Invoke the current Cx state to put the processor to sleep.
+ */
+ if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
+ smp_mb__after_clear_bit();
+
+ switch ( cx->type )
+ {
+ case ACPI_STATE_C1:
+ /*
+ * Invoke C1.
+ * Use the appropriate idle routine, the one that would
+ * be used without acpi C-states.
+ */
+ if ( pm_idle_save )
+ pm_idle_save();
+ else
+ acpi_safe_halt();
+
+ /*
+ * TBD: Can't get time duration while in C1, as resumes
+ * go to an ISR rather than here. Need to instrument
+ * base interrupt handler.
+ */
+ sleep_ticks = 0xFFFFFFFF;
+ break;
+
+ case ACPI_STATE_C2:
+ /* Get start time (ticks) */
+ t1 = inl(pmtmr_ioport);
+ /* Invoke C2 */
+ acpi_idle_do_entry(cx);
+ /* Get end time (ticks) */
+ t2 = inl(pmtmr_ioport);
+
+ /* Re-enable interrupts */
+ local_irq_enable();
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks =
+ ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+ break;
+ default:
+ local_irq_enable();
+ return;
+ }
+
+ cx->usage++;
+ if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+ cx->time += sleep_ticks;
+
+ next_state = power->state;
+
+ /*
+ * Promotion?
+ * ----------
+ * Track the number of longs (time asleep is greater than threshold)
+ * and promote when the count threshold is reached. Note that bus
+ * mastering activity may prevent promotions.
+ * Do not promote above max_cstate.
+ */
+ if ( cx->promotion.state &&
+ ((cx->promotion.state - power->states) <= max_cstate) )
+ {
+ if ( sleep_ticks > cx->promotion.threshold.ticks )
+ {
+ cx->promotion.count++;
+ cx->demotion.count = 0;
+ if ( cx->promotion.count >= cx->promotion.threshold.count )
+ {
+ next_state = cx->promotion.state;
+ goto end;
+ }
+ }
+ }
+
+ /*
+ * Demotion?
+ * ---------
+ * Track the number of shorts (time asleep is less than time threshold)
+ * and demote when the usage threshold is reached.
+ */
+ if ( cx->demotion.state )
+ {
+ if ( sleep_ticks < cx->demotion.threshold.ticks )
+ {
+ cx->demotion.count++;
+ cx->promotion.count = 0;
+ if ( cx->demotion.count >= cx->demotion.threshold.count )
+ {
+ next_state = cx->demotion.state;
+ goto end;
+ }
+ }
+ }
+
+end:
+ /*
+ * Demote if current state exceeds max_cstate
+ */
+ if ( (power->state - power->states) > max_cstate )
+ {
+ if ( cx->demotion.state )
+ next_state = cx->demotion.state;
+ }
+
+ /*
+ * New Cx State?
+ * -------------
+ * If we're going to start using a new Cx state we must clean up
+ * from the previous and prepare to use the new.
+ */
+ if ( next_state != power->state )
+ acpi_processor_power_activate(power, next_state);
+}
+
+static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
+{
+ unsigned int i;
+ unsigned int state_is_set = 0;
+ struct acpi_processor_cx *lower = NULL;
+ struct acpi_processor_cx *higher = NULL;
+ struct acpi_processor_cx *cx;
+
+ if ( !power )
+ return -EINVAL;
+
+ /*
+ * This function sets the default Cx state policy (OS idle handler).
+ * Our scheme is to promote quickly to C2 but more conservatively
+ * to C3. We're favoring C2 for its characteristics of low latency
+ * (quick response), good power savings, and ability to allow bus
+ * mastering activity. Note that the Cx state policy is completely
+ * customizable and can be altered dynamically.
+ */
+
+ /* startup state */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( !state_is_set )
+ power->state = cx;
+ state_is_set++;
+ break;
+ }
+
+ if ( !state_is_set )
+ return -ENODEV;
+
+ /* demotion */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( lower )
+ {
+ cx->demotion.state = lower;
+ cx->demotion.threshold.ticks = cx->latency_ticks;
+ cx->demotion.threshold.count = 1;
+ }
+
+ lower = cx;
+ }
+
+ /* promotion */
+ for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( higher )
+ {
+ cx->promotion.state = higher;
+ cx->promotion.threshold.ticks = cx->latency_ticks;
+ if ( cx->type >= ACPI_STATE_C2 )
+ cx->promotion.threshold.count = 4;
+ else
+ cx->promotion.threshold.count = 10;
+ }
+
+ higher = cx;
+ }
+
+ return 0;
+}
+
+static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
+{
+ memset(acpi_power, 0, sizeof(*acpi_power));
+
+ acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+
+ acpi_power->states[ACPI_STATE_C0].valid = 1;
+ acpi_power->states[ACPI_STATE_C1].valid = 1;
+
+ acpi_power->count = 2;
+
+ return 0;
+}
+
+#define CPUID_MWAIT_LEAF (5)
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
+#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_SUBSTATE_SIZE (4)
+
+static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
+{
+ struct cpuinfo_x86 *c = ¤t_cpu_data;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
+ if ( c->cpuid_level < CPUID_MWAIT_LEAF )
+ {
+ printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
+ return -EFAULT;
+ }
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ printk(XENLOG_DEBUG "cpuid.MWAIT[.eax=%x, .ebx=%x, .ecx=%x, .edx=%x]\n",
+ eax, ebx, ecx, edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
+ return -EFAULT;
+
+ /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+ if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
+ return -EFAULT;
+
+ printk(XENLOG_INFO "Monitor-Mwait will be used to enter C-%d state\n", cx->type);
+ return 0;
+}
+
+#define VENDOR_INTEL (1)
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static int check_cx(xen_processor_cx_t *cx)
+{
+ if ( cx == NULL )
+ return -EINVAL;
+
+ switch ( cx->reg.space_id )
+ {
+ case ACPI_ADR_SPACE_SYSTEM_IO:
+ if ( cx->reg.address == 0 )
+ return -EINVAL;
+ break;
+
+ case ACPI_ADR_SPACE_FIXED_HARDWARE:
+ if ( cx->type > ACPI_STATE_C1 )
+ {
+ if ( cx->reg.bit_width != VENDOR_INTEL ||
+ cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+ return -EINVAL;
+
+ /* assume all logical cpu has the same support for mwait */
+ if ( acpi_processor_ffh_cstate_probe(cx) )
+ return -EFAULT;
+ }
+ break;
+
+ default:
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static int set_cx(struct acpi_processor_power *acpi_power,
+ xen_processor_cx_t *xen_cx)
+{
+ struct acpi_processor_cx *cx;
+
+ /* skip unsupported acpi cstate */
+ if ( check_cx(xen_cx) )
+ return -EFAULT;
+
+ cx = &acpi_power->states[xen_cx->type];
+ if ( !cx->valid )
+ acpi_power->count++;
+
+ cx->valid = 1;
+ cx->type = xen_cx->type;
+ cx->address = xen_cx->reg.address;
+ cx->space_id = xen_cx->reg.space_id;
+ cx->latency = xen_cx->latency;
+ cx->power = xen_cx->power;
+
+ cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+
+ return 0;
+}
+
+static int get_cpu_id(u8 acpi_id)
+{
+ int i;
+ u8 apic_id;
+
+ apic_id = x86_acpiid_to_apicid[acpi_id];
+ if ( apic_id == 0xff )
+ return -1;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ if ( apic_id == x86_cpu_to_apicid[i] )
+ return i;
+ }
+
+ return -1;
+}
+
+#ifdef DEBUG_PM_CX
+static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t state;
+ XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
+ xen_processor_csd_t dp;
+ uint32_t i;
+
+ printk("cpu%d cx acpi info:\n", cpu);
+ printk("\tcount = %d\n", power->count);
+ printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
+ "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
+ power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
+ power->flags.power_setup_done, power->flags.bm_rld_set);
+
+ states = power->states;
+
+ for ( i = 0; i < power->count; i++ )
+ {
+ if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) )
+ return;
+
+ printk("\tstates[%d]:\n", i);
+ printk("\t\treg.space_id = 0x%x\n", state.reg.space_id);
+ printk("\t\treg.bit_width = 0x%x\n", state.reg.bit_width);
+ printk("\t\treg.bit_offset = 0x%x\n", state.reg.bit_offset);
+ printk("\t\treg.access_size = 0x%x\n", state.reg.access_size);
+ printk("\t\treg.address = 0x%"PRIx64"\n", state.reg.address);
+ printk("\t\ttype = %d\n", state.type);
+ printk("\t\tlatency = %d\n", state.latency);
+ printk("\t\tpower = %d\n", state.power);
+
+ csd = state.dp;
+ printk("\t\tdp(@0x%p)\n", csd.p);
+
+ if ( csd.p != NULL )
+ {
+ if ( unlikely(copy_from_guest(&dp, csd, 1)) )
+ return;
+ printk("\t\t\tdomain = %d\n", dp.domain);
+ printk("\t\t\tcoord_type = %d\n", dp.coord_type);
+ printk("\t\t\tnum = %d\n", dp.num);
+ }
+ }
+}
+#else
+#define print_cx_pminfo(c, p)
+#endif
+
+long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t xen_cx;
+ struct acpi_processor_power *acpi_power;
+ int cpu_id, i;
+
+ if ( unlikely(!guest_handle_okay(power->states, power->count)) )
+ return -EFAULT;
+
+ print_cx_pminfo(cpu, power);
+
+ /* map from acpi_id to cpu_id */
+ cpu_id = get_cpu_id((u8)cpu);
+ if ( cpu_id == -1 )
+ {
+ printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+ return -EFAULT;
+ }
+
+ acpi_power = &processor_powers[cpu_id];
+
+ init_cx_pminfo(acpi_power);
+
+ states = power->states;
+
+ for ( i = 0; i < power->count; i++ )
+ {
+ if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
+ return -EFAULT;
+
+ set_cx(acpi_power, &xen_cx);
+ }
+
+ /* FIXME: C-state dependency is not supported by far */
+
+ /* initialize default policy */
+ acpi_processor_set_power_policy(acpi_power);
+
+ print_acpi_power(cpu_id, acpi_power);
+
+ if ( cpu_id == 0 && pm_idle_save == NULL )
+ {
+ pm_idle_save = pm_idle;
+ pm_idle = acpi_processor_idle;
+ }
+
+ return 0;
+}
diff -r 483d006cc607 xen/arch/x86/x86_64/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/x86_64/cpu_idle.c Thu May 01 00:16:37 2008 +0800
@@ -0,0 +1,128 @@
+/******************************************************************************
+ * cpu_idle.c -- adapt x86/acpi/cpu_idle.c to compat guest.
+ *
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#define __XEN_TOOLS__ /* for using get_xen_guest_handle macro */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/xmalloc.h>
+#include <xen/guest_access.h>
+#include <compat/platform.h>
+
+CHECK_processor_csd;
+
+DEFINE_XEN_GUEST_HANDLE(compat_processor_csd_t);
+DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t);
+
+#define xlat_page_start COMPAT_ARG_XLAT_VIRT_START(current->vcpu_id)
+#define xlat_page_size COMPAT_ARG_XLAT_SIZE
+#define xlat_page_left_size(xlat_page_current) \
+ (xlat_page_start + xlat_page_size - xlat_page_current)
+
+#define xlat_malloc_init(xlat_page_current) do { \
+ xlat_page_current = xlat_page_start; \
+} while (0)
+
+static void *xlat_malloc(unsigned long *xlat_page_current, size_t size)
+{
+ void *ret;
+
+ /* normalize size to be 64 * n */
+ size = (size + 0x3fUL) & ~0x3fUL;
+
+ if ( unlikely(size > xlat_page_left_size(*xlat_page_current)) )
+ return NULL;
+
+ ret = (void *) *xlat_page_current;
+ *xlat_page_current += size;
+
+ return ret;
+}
+
+#define xlat_malloc_array(_p, _t, _c) ((_t *) xlat_malloc(&_p, sizeof(_t) * _c))
+
+static int copy_from_compat_state(xen_processor_cx_t *xen_state,
+ compat_processor_cx_t *state)
+{
+#define XLAT_processor_cx_HNDL_dp(_d_, _s_) do { \
+ XEN_GUEST_HANDLE(compat_processor_csd_t) dps; \
+ if ( unlikely(!compat_handle_okay((_s_)->dp, (_s_)->dpcnt)) ) \
+ return -EFAULT; \
+ guest_from_compat_handle(dps, (_s_)->dp); \
+ (_d_)->dp = guest_handle_cast(dps, xen_processor_csd_t); \
+} while (0)
+ XLAT_processor_cx(xen_state, state);
+#undef XLAT_processor_cx_HNDL_dp
+
+ return 0;
+}
+
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
+long compat_set_cx_pminfo(uint32_t cpu, struct compat_processor_power *power)
+{
+ struct xen_processor_power *xen_power;
+ unsigned long xlat_page_current;
+
+ xlat_malloc_init(xlat_page_current);
+
+ xen_power = xlat_malloc_array(xlat_page_current,
+ struct xen_processor_power, 1);
+ if ( unlikely(xen_power == NULL) )
+ return -EFAULT;
+
+#define XLAT_processor_power_HNDL_states(_d_, _s_) do { \
+ xen_processor_cx_t *xen_states = NULL; \
+\
+ if ( likely((_s_)->count > 0) ) \
+ { \
+ XEN_GUEST_HANDLE(compat_processor_cx_t) states; \
+ compat_processor_cx_t state; \
+ int i; \
+\
+ xen_states = xlat_malloc_array(xlat_page_current, \
+ xen_processor_cx_t, (_s_)->count); \
+ if ( unlikely(xen_states == NULL) ) \
+ return -EFAULT; \
+\
+ if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->count)) ) \
+ return -EFAULT; \
+ guest_from_compat_handle(states, (_s_)->states); \
+\
+ for ( i = 0; i < _s_->count; i++ ) \
+ { \
+ if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \
+ return -EFAULT; \
+ if ( unlikely(copy_from_compat_state(&xen_states[i], &state)) ) \
+ return -EFAULT; \
+ } \
+ } \
+\
+ set_xen_guest_handle((_d_)->states, xen_states); \
+} while (0)
+ XLAT_processor_power(xen_power, power);
+#undef XLAT_processor_power_HNDL_states
+
+ return set_cx_pminfo(cpu, xen_power);
+}
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 7:22 ` Jan Beulich
2008-04-30 8:54 ` Keir Fraser
2008-04-30 16:36 ` Wei, Gang
@ 2008-05-01 0:48 ` Wei, Gang
2 siblings, 0 replies; 20+ messages in thread
From: Wei, Gang @ 2008-05-01 0:48 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 1077 bytes --]
Revised according to below comments. Resend.
Jimmy
On Wednesday, April 30, 2008 3:22 PM, Jan Beulich wrote:
> Thanks. Unfortunately you now use a static (but not per-CPU) variable
-
> while I understand that it is expected that the call is done just
once, I
> don't think this is a good thing to do.
>
> Further, xen_processor_csd_t seems to not need translation, so you
> could simply add a check for the type to xen/include/xlat.lst and copy
> the handle rather than what it points to. This would reduce size
> constraints on the xlat area and also simplify the code.
>
> As another suggestion - could you use uint32_t for the bitfield
> declarations, making it more obvious that the remaining bits in the
> 32-bit quantity are reserved? Alternatively, could you use an
> explicit padding field after the flags member of struct
> xen_processor_power?
>
> Also, I think there's error checking missing on copy_from_guest*
> throughout the patch. And I think I saw non-C89 constructs (loop
> variables declared inside for() statements).
>
> Jan
[-- Attachment #2: xen-1-cx_base-0501.patch --]
[-- Type: application/octet-stream, Size: 32335 bytes --]
Add basic acpi C-states based cpu idle power mgmt in xen for x86.
It includes:
1. hypercall definition for passing ACPI info.
2. C1/C2 support.
3. Mwait support, as well as legacy ioport.
4. Ladder policy from Linux kernel.
A lot of code & ideas came from Linux.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 483d006cc607 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/acpi/Makefile Thu May 01 00:16:37 2008 +0800
@@ -1,2 +1,2 @@ obj-y += boot.o
obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
diff -r 483d006cc607 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/domain.c Thu May 01 00:16:37 2008 +0800
@@ -56,6 +56,9 @@ DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(u64, efer);
DEFINE_PER_CPU(unsigned long, cr4);
+static void default_idle(void);
+void (*pm_idle) (void) = default_idle;
+
static void unmap_vcpu_info(struct vcpu *v);
static void paravirt_ctxt_switch_from(struct vcpu *v);
@@ -105,7 +108,7 @@ void idle_loop(void)
if ( cpu_is_offline(smp_processor_id()) )
play_dead();
page_scrub_schedule_work();
- default_idle();
+ (*pm_idle)();
do_softirq();
}
}
diff -r 483d006cc607 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/platform_hypercall.c Thu May 01 00:16:37 2008 +0800
@@ -44,6 +44,8 @@ extern spinlock_t xenpf_lock;
static DEFINE_PER_CPU(uint64_t, freq);
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
static long cpu_frequency_change_helper(void *data)
{
return cpu_frequency_change(this_cpu(freq));
@@ -340,6 +342,27 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
}
break;
+ case XENPF_set_processor_pminfo:
+ switch ( op->u.set_pminfo.type )
+ {
+ case XEN_PM_PX:
+ ret = -EINVAL;
+ break;
+
+ case XEN_PM_CX:
+ ret = set_cx_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.power);
+ break;
+
+ case XEN_PM_TX:
+ ret = -EINVAL;
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r 483d006cc607 xen/arch/x86/x86_64/Makefile
--- a/xen/arch/x86/x86_64/Makefile Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/Makefile Thu May 01 00:16:37 2008 +0800
@@ -12,6 +12,7 @@ obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += domain.o
obj-$(CONFIG_COMPAT) += physdev.o
obj-$(CONFIG_COMPAT) += platform_hypercall.o
+obj-$(CONFIG_COMPAT) += cpu_idle.o
ifeq ($(CONFIG_COMPAT),y)
# extra dependencies
@@ -22,4 +23,5 @@ platform_hypercall.o: ../platform_hyperc
platform_hypercall.o: ../platform_hypercall.c
sysctl.o: ../sysctl.c
traps.o: compat/traps.c
+cpu_idle.o: ../acpi/cpu_idle.c
endif
diff -r 483d006cc607 xen/arch/x86/x86_64/platform_hypercall.c
--- a/xen/arch/x86/x86_64/platform_hypercall.c Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/arch/x86/x86_64/platform_hypercall.c Thu May 01 00:16:37 2008 +0800
@@ -10,6 +10,10 @@ DEFINE_XEN_GUEST_HANDLE(compat_platform_
#define xen_platform_op compat_platform_op
#define xen_platform_op_t compat_platform_op_t
#define do_platform_op(x) compat_platform_op(_##x)
+
+#define xen_processor_power compat_processor_power
+#define xen_processor_power_t compat_processor_power_t
+#define set_cx_pminfo compat_set_cx_pminfo
#define xenpf_enter_acpi_sleep compat_pf_enter_acpi_sleep
diff -r 483d006cc607 xen/include/public/platform.h
--- a/xen/include/public/platform.h Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/public/platform.h Thu May 01 00:16:37 2008 +0800
@@ -199,6 +199,70 @@ typedef struct xenpf_getidletime xenpf_g
typedef struct xenpf_getidletime xenpf_getidletime_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+#define XENPF_set_processor_pminfo 54
+
+/* ability bits */
+#define XEN_PROCESSOR_PM_CX 1
+#define XEN_PROCESSOR_PM_PX 2
+#define XEN_PROCESSOR_PM_TX 4
+
+/* cmd type */
+#define XEN_PM_CX 0
+#define XEN_PM_PX 1
+#define XEN_PM_TX 2
+
+struct xen_power_register {
+ uint32_t space_id;
+ uint32_t bit_width;
+ uint32_t bit_offset;
+ uint32_t access_size;
+ uint64_t address;
+};
+
+struct xen_processor_csd {
+ uint32_t domain; /* domain number of one dependent group */
+ uint32_t coord_type; /* coordination type */
+ uint32_t num; /* number of processors in same domain */
+};
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
+
+struct xen_processor_cx {
+ struct xen_power_register reg; /* GAS for Cx trigger register */
+ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
+ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
+ uint32_t power; /* average power consumption(mW) */
+ uint32_t dpcnt; /* number of dependency entries */
+ XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
+};
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
+
+struct xen_processor_flags {
+ uint32_t bm_control:1;
+ uint32_t bm_check:1;
+ uint32_t has_cst:1;
+ uint32_t power_setup_done:1;
+ uint32_t bm_rld_set:1;
+};
+
+struct xen_processor_power {
+ uint32_t count; /* number of C state entries in array below */
+ struct xen_processor_flags flags; /* global flags of this processor */
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
+};
+
+struct xenpf_set_processor_pminfo {
+ /* IN variables */
+ uint32_t id; /* ACPI CPU ID */
+ uint32_t type; /* {XEN_PM_CX, ...} */
+ union {
+ struct xen_processor_power power;/* Cx: _CST/_CSD */
+ };
+};
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
+
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -213,6 +277,7 @@ struct xen_platform_op {
struct xenpf_enter_acpi_sleep enter_acpi_sleep;
struct xenpf_change_freq change_freq;
struct xenpf_getidletime getidletime;
+ struct xenpf_set_processor_pminfo set_pminfo;
uint8_t pad[128];
} u;
};
diff -r 483d006cc607 xen/include/xlat.lst
--- a/xen/include/xlat.lst Fri Apr 25 13:46:27 2008 +0100
+++ b/xen/include/xlat.lst Thu May 01 00:16:37 2008 +0800
@@ -44,3 +44,8 @@
! vcpu_runstate_info vcpu.h
? xenoprof_init xenoprof.h
? xenoprof_passive xenoprof.h
+! power_register platform.h
+? processor_csd platform.h
+! processor_cx platform.h
+! processor_flags platform.h
+! processor_power platform.h
diff -r 483d006cc607 xen/arch/x86/acpi/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpu_idle.c Thu May 01 00:33:08 2008 +0800
@@ -0,0 +1,690 @@
+/*
+ * cpu_idle - xen idle state module derived from Linux
+ * drivers/acpi/processor_idle.c &
+ * arch/x86/kernel/acpi/cstate.c
+ *
+ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * - Added processor hotplug support
+ * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added support for C3 on SMP
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/smp.h>
+#include <asm/cache.h>
+#include <asm/io.h>
+#include <xen/guest_access.h>
+#include <public/platform.h>
+#include <asm/processor.h>
+#include <xen/keyhandler.h>
+
+#define DEBUG_PM_CX
+
+#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
+
+#define ACPI_PROCESSOR_MAX_POWER 8
+#define ACPI_PROCESSOR_MAX_C2_LATENCY 100
+#define ACPI_PROCESSOR_MAX_C3_LATENCY 1000
+
+extern u32 pmtmr_ioport;
+extern void (*pm_idle) (void);
+
+static void (*pm_idle_save) (void) __read_mostly;
+unsigned int max_cstate __read_mostly = 2;
+integer_param("max_cstate", max_cstate);
+
+struct acpi_processor_cx;
+
+struct acpi_processor_cx_policy
+{
+ u32 count;
+ struct acpi_processor_cx *state;
+ struct
+ {
+ u32 time;
+ u32 ticks;
+ u32 count;
+ u32 bm;
+ } threshold;
+};
+
+struct acpi_processor_cx
+{
+ u8 valid;
+ u8 type;
+ u32 address;
+ u8 space_id;
+ u32 latency;
+ u32 latency_ticks;
+ u32 power;
+ u32 usage;
+ u64 time;
+ struct acpi_processor_cx_policy promotion;
+ struct acpi_processor_cx_policy demotion;
+};
+
+struct acpi_processor_power
+{
+ struct acpi_processor_cx *state;
+ u64 bm_check_timestamp;
+ u32 default_state;
+ u32 bm_activity;
+ u32 count;
+ struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+static struct acpi_processor_power processor_powers[NR_CPUS];
+
+static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
+{
+ uint32_t i;
+
+ printk("saved cpu%d cx acpi info:\n", cpu);
+ printk("\tcurrent state is C%d\n", (power->state)?power->state->type:-1);
+ printk("\tbm_check_timestamp = %"PRId64"\n", power->bm_check_timestamp);
+ printk("\tdefault_state = %d\n", power->default_state);
+ printk("\tbm_activity = 0x%08x\n", power->bm_activity);
+ printk("\tcount = %d\n", power->count);
+
+ for ( i = 0; i < power->count; i++ )
+ {
+ printk("\tstates[%d]:\n", i);
+ printk("\t\tvalid = %d\n", power->states[i].valid);
+ printk("\t\ttype = %d\n", power->states[i].type);
+ printk("\t\taddress = 0x%x\n", power->states[i].address);
+ printk("\t\tspace_id = 0x%x\n", power->states[i].space_id);
+ printk("\t\tlatency = %d\n", power->states[i].latency);
+ printk("\t\tpower = %d\n", power->states[i].power);
+ printk("\t\tlatency_ticks = %d\n", power->states[i].latency_ticks);
+ printk("\t\tusage = %d\n", power->states[i].usage);
+ printk("\t\ttime = %"PRId64"\n", power->states[i].time);
+
+ printk("\t\tpromotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].promotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].promotion.state) ?
+ power->states[i].promotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].promotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].promotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].promotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].promotion.threshold.bm);
+
+ printk("\t\tdemotion policy:\n");
+ printk("\t\t\tcount = %d\n", power->states[i].demotion.count);
+ printk("\t\t\tstate = C%d\n",
+ (power->states[i].demotion.state) ?
+ power->states[i].demotion.state->type : -1);
+ printk("\t\t\tthreshold.time = %d\n", power->states[i].demotion.threshold.time);
+ printk("\t\t\tthreshold.ticks = %d\n", power->states[i].demotion.threshold.ticks);
+ printk("\t\t\tthreshold.count = %d\n", power->states[i].demotion.threshold.count);
+ printk("\t\t\tthreshold.bm = %d\n", power->states[i].demotion.threshold.bm);
+ }
+}
+
+static void dump_cx(unsigned char key)
+{
+ for( int i = 0; i < num_online_cpus(); i++ )
+ print_acpi_power(i, &processor_powers[i]);
+}
+
+static int __init cpu_idle_key_init(void)
+{
+ register_keyhandler(
+ 'c', dump_cx, "dump cx structures");
+ return 0;
+}
+__initcall(cpu_idle_key_init);
+
+static inline u32 ticks_elapsed(u32 t1, u32 t2)
+{
+ if ( t2 >= t1 )
+ return (t2 - t1);
+ else
+ return ((0xFFFFFFFF - t1) + t2);
+}
+
+static void acpi_processor_power_activate(struct acpi_processor_power *power,
+ struct acpi_processor_cx *new)
+{
+ struct acpi_processor_cx *old;
+
+ if ( !power || !new )
+ return;
+
+ old = power->state;
+
+ if ( old )
+ old->promotion.count = 0;
+ new->demotion.count = 0;
+
+ power->state = new;
+
+ return;
+}
+
+static void acpi_safe_halt(void)
+{
+ smp_mb__after_clear_bit();
+ safe_halt();
+}
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ __monitor((void *)current, 0, 0);
+ smp_mb();
+ __mwait(eax, ecx);
+}
+
+static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
+}
+
+static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+{
+ if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+ {
+ /* Call into architectural FFH based C-state */
+ acpi_processor_ffh_cstate_enter(cx);
+ }
+ else
+ {
+ int unused;
+ /* IO port based C-state */
+ inb(cx->address);
+ /* Dummy wait op - must do something useless after P_LVL2 read
+ because chipsets cannot guarantee that STPCLK# signal
+ gets asserted in time to freeze execution properly. */
+ unused = inl(pmtmr_ioport);
+ }
+}
+
+static void acpi_processor_idle(void)
+{
+ struct acpi_processor_power *power = NULL;
+ struct acpi_processor_cx *cx = NULL;
+ struct acpi_processor_cx *next_state = NULL;
+ int sleep_ticks = 0;
+ u32 t1, t2 = 0;
+
+ power = &processor_powers[smp_processor_id()];
+
+ /*
+ * Interrupts must be disabled during bus mastering calculations and
+ * for C2/C3 transitions.
+ */
+ local_irq_disable();
+ cx = power->state;
+ if ( !cx )
+ {
+ if ( pm_idle_save )
+ {
+ printk(XENLOG_DEBUG "call pm_idle_save()\n");
+ pm_idle_save();
+ }
+ else
+ {
+ printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
+ acpi_safe_halt();
+ }
+ return;
+ }
+
+ /*
+ * Sleep:
+ * ------
+ * Invoke the current Cx state to put the processor to sleep.
+ */
+ if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
+ smp_mb__after_clear_bit();
+
+ switch ( cx->type )
+ {
+ case ACPI_STATE_C1:
+ /*
+ * Invoke C1.
+ * Use the appropriate idle routine, the one that would
+ * be used without acpi C-states.
+ */
+ if ( pm_idle_save )
+ pm_idle_save();
+ else
+ acpi_safe_halt();
+
+ /*
+ * TBD: Can't get time duration while in C1, as resumes
+ * go to an ISR rather than here. Need to instrument
+ * base interrupt handler.
+ */
+ sleep_ticks = 0xFFFFFFFF;
+ break;
+
+ case ACPI_STATE_C2:
+ /* Get start time (ticks) */
+ t1 = inl(pmtmr_ioport);
+ /* Invoke C2 */
+ acpi_idle_do_entry(cx);
+ /* Get end time (ticks) */
+ t2 = inl(pmtmr_ioport);
+
+ /* Re-enable interrupts */
+ local_irq_enable();
+ /* Compute time (ticks) that we were actually asleep */
+ sleep_ticks =
+ ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+ break;
+ default:
+ local_irq_enable();
+ return;
+ }
+
+ cx->usage++;
+ if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+ cx->time += sleep_ticks;
+
+ next_state = power->state;
+
+ /*
+ * Promotion?
+ * ----------
+ * Track the number of longs (time asleep is greater than threshold)
+ * and promote when the count threshold is reached. Note that bus
+ * mastering activity may prevent promotions.
+ * Do not promote above max_cstate.
+ */
+ if ( cx->promotion.state &&
+ ((cx->promotion.state - power->states) <= max_cstate) )
+ {
+ if ( sleep_ticks > cx->promotion.threshold.ticks )
+ {
+ cx->promotion.count++;
+ cx->demotion.count = 0;
+ if ( cx->promotion.count >= cx->promotion.threshold.count )
+ {
+ next_state = cx->promotion.state;
+ goto end;
+ }
+ }
+ }
+
+ /*
+ * Demotion?
+ * ---------
+ * Track the number of shorts (time asleep is less than time threshold)
+ * and demote when the usage threshold is reached.
+ */
+ if ( cx->demotion.state )
+ {
+ if ( sleep_ticks < cx->demotion.threshold.ticks )
+ {
+ cx->demotion.count++;
+ cx->promotion.count = 0;
+ if ( cx->demotion.count >= cx->demotion.threshold.count )
+ {
+ next_state = cx->demotion.state;
+ goto end;
+ }
+ }
+ }
+
+end:
+ /*
+ * Demote if current state exceeds max_cstate
+ */
+ if ( (power->state - power->states) > max_cstate )
+ {
+ if ( cx->demotion.state )
+ next_state = cx->demotion.state;
+ }
+
+ /*
+ * New Cx State?
+ * -------------
+ * If we're going to start using a new Cx state we must clean up
+ * from the previous and prepare to use the new.
+ */
+ if ( next_state != power->state )
+ acpi_processor_power_activate(power, next_state);
+}
+
+static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
+{
+ unsigned int i;
+ unsigned int state_is_set = 0;
+ struct acpi_processor_cx *lower = NULL;
+ struct acpi_processor_cx *higher = NULL;
+ struct acpi_processor_cx *cx;
+
+ if ( !power )
+ return -EINVAL;
+
+ /*
+ * This function sets the default Cx state policy (OS idle handler).
+ * Our scheme is to promote quickly to C2 but more conservatively
+ * to C3. We're favoring C2 for its characteristics of low latency
+ * (quick response), good power savings, and ability to allow bus
+ * mastering activity. Note that the Cx state policy is completely
+ * customizable and can be altered dynamically.
+ */
+
+ /* startup state */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( !state_is_set )
+ power->state = cx;
+ state_is_set++;
+ break;
+ }
+
+ if ( !state_is_set )
+ return -ENODEV;
+
+ /* demotion */
+ for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( lower )
+ {
+ cx->demotion.state = lower;
+ cx->demotion.threshold.ticks = cx->latency_ticks;
+ cx->demotion.threshold.count = 1;
+ }
+
+ lower = cx;
+ }
+
+ /* promotion */
+ for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
+ {
+ cx = &power->states[i];
+ if ( !cx->valid )
+ continue;
+
+ if ( higher )
+ {
+ cx->promotion.state = higher;
+ cx->promotion.threshold.ticks = cx->latency_ticks;
+ if ( cx->type >= ACPI_STATE_C2 )
+ cx->promotion.threshold.count = 4;
+ else
+ cx->promotion.threshold.count = 10;
+ }
+
+ higher = cx;
+ }
+
+ return 0;
+}
+
+static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
+{
+ memset(acpi_power, 0, sizeof(*acpi_power));
+
+ acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+
+ acpi_power->states[ACPI_STATE_C0].valid = 1;
+ acpi_power->states[ACPI_STATE_C1].valid = 1;
+
+ acpi_power->count = 2;
+
+ return 0;
+}
+
+#define CPUID_MWAIT_LEAF (5)
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
+#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
+
+#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
+
+#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_SUBSTATE_SIZE (4)
+
+static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
+{
+ struct cpuinfo_x86 *c = ¤t_cpu_data;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
+ if ( c->cpuid_level < CPUID_MWAIT_LEAF )
+ {
+ printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
+ return -EFAULT;
+ }
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ printk(XENLOG_DEBUG "cpuid.MWAIT[.eax=%x, .ebx=%x, .ecx=%x, .edx=%x]\n",
+ eax, ebx, ecx, edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
+ return -EFAULT;
+
+ /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+ if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
+ return -EFAULT;
+
+ printk(XENLOG_INFO "Monitor-Mwait will be used to enter C-%d state\n", cx->type);
+ return 0;
+}
+
+#define VENDOR_INTEL (1)
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static int check_cx(xen_processor_cx_t *cx)
+{
+ if ( cx == NULL )
+ return -EINVAL;
+
+ switch ( cx->reg.space_id )
+ {
+ case ACPI_ADR_SPACE_SYSTEM_IO:
+ if ( cx->reg.address == 0 )
+ return -EINVAL;
+ break;
+
+ case ACPI_ADR_SPACE_FIXED_HARDWARE:
+ if ( cx->type > ACPI_STATE_C1 )
+ {
+ if ( cx->reg.bit_width != VENDOR_INTEL ||
+ cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+ return -EINVAL;
+
+ /* assume all logical cpu has the same support for mwait */
+ if ( acpi_processor_ffh_cstate_probe(cx) )
+ return -EFAULT;
+ }
+ break;
+
+ default:
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static int set_cx(struct acpi_processor_power *acpi_power,
+ xen_processor_cx_t *xen_cx)
+{
+ struct acpi_processor_cx *cx;
+
+ /* skip unsupported acpi cstate */
+ if ( check_cx(xen_cx) )
+ return -EFAULT;
+
+ cx = &acpi_power->states[xen_cx->type];
+ if ( !cx->valid )
+ acpi_power->count++;
+
+ cx->valid = 1;
+ cx->type = xen_cx->type;
+ cx->address = xen_cx->reg.address;
+ cx->space_id = xen_cx->reg.space_id;
+ cx->latency = xen_cx->latency;
+ cx->power = xen_cx->power;
+
+ cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+
+ return 0;
+}
+
+static int get_cpu_id(u8 acpi_id)
+{
+ int i;
+ u8 apic_id;
+
+ apic_id = x86_acpiid_to_apicid[acpi_id];
+ if ( apic_id == 0xff )
+ return -1;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ if ( apic_id == x86_cpu_to_apicid[i] )
+ return i;
+ }
+
+ return -1;
+}
+
+#ifdef DEBUG_PM_CX
+static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t state;
+ XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
+ xen_processor_csd_t dp;
+ uint32_t i;
+
+ printk("cpu%d cx acpi info:\n", cpu);
+ printk("\tcount = %d\n", power->count);
+ printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
+ "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
+ power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
+ power->flags.power_setup_done, power->flags.bm_rld_set);
+
+ states = power->states;
+
+ for ( i = 0; i < power->count; i++ )
+ {
+ if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) )
+ return;
+
+ printk("\tstates[%d]:\n", i);
+ printk("\t\treg.space_id = 0x%x\n", state.reg.space_id);
+ printk("\t\treg.bit_width = 0x%x\n", state.reg.bit_width);
+ printk("\t\treg.bit_offset = 0x%x\n", state.reg.bit_offset);
+ printk("\t\treg.access_size = 0x%x\n", state.reg.access_size);
+ printk("\t\treg.address = 0x%"PRIx64"\n", state.reg.address);
+ printk("\t\ttype = %d\n", state.type);
+ printk("\t\tlatency = %d\n", state.latency);
+ printk("\t\tpower = %d\n", state.power);
+
+ csd = state.dp;
+ printk("\t\tdp(@0x%p)\n", csd.p);
+
+ if ( csd.p != NULL )
+ {
+ if ( unlikely(copy_from_guest(&dp, csd, 1)) )
+ return;
+ printk("\t\t\tdomain = %d\n", dp.domain);
+ printk("\t\t\tcoord_type = %d\n", dp.coord_type);
+ printk("\t\t\tnum = %d\n", dp.num);
+ }
+ }
+}
+#else
+#define print_cx_pminfo(c, p)
+#endif
+
+long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+ XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+ xen_processor_cx_t xen_cx;
+ struct acpi_processor_power *acpi_power;
+ int cpu_id, i;
+
+ if ( unlikely(!guest_handle_okay(power->states, power->count)) )
+ return -EFAULT;
+
+ print_cx_pminfo(cpu, power);
+
+ /* map from acpi_id to cpu_id */
+ cpu_id = get_cpu_id((u8)cpu);
+ if ( cpu_id == -1 )
+ {
+ printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+ return -EFAULT;
+ }
+
+ acpi_power = &processor_powers[cpu_id];
+
+ init_cx_pminfo(acpi_power);
+
+ states = power->states;
+
+ for ( i = 0; i < power->count; i++ )
+ {
+ if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
+ return -EFAULT;
+
+ set_cx(acpi_power, &xen_cx);
+ }
+
+ /* FIXME: C-state dependency is not supported by far */
+
+ /* initialize default policy */
+ acpi_processor_set_power_policy(acpi_power);
+
+ print_acpi_power(cpu_id, acpi_power);
+
+ if ( cpu_id == 0 && pm_idle_save == NULL )
+ {
+ pm_idle_save = pm_idle;
+ pm_idle = acpi_processor_idle;
+ }
+
+ return 0;
+}
diff -r 483d006cc607 xen/arch/x86/x86_64/cpu_idle.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/x86_64/cpu_idle.c Thu May 01 00:16:37 2008 +0800
@@ -0,0 +1,128 @@
+/******************************************************************************
+ * cpu_idle.c -- adapt x86/acpi/cpu_idle.c to compat guest.
+ *
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#define __XEN_TOOLS__ /* for using get_xen_guest_handle macro */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/xmalloc.h>
+#include <xen/guest_access.h>
+#include <compat/platform.h>
+
+CHECK_processor_csd;
+
+DEFINE_XEN_GUEST_HANDLE(compat_processor_csd_t);
+DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t);
+
+#define xlat_page_start COMPAT_ARG_XLAT_VIRT_START(current->vcpu_id)
+#define xlat_page_size COMPAT_ARG_XLAT_SIZE
+#define xlat_page_left_size(xlat_page_current) \
+ (xlat_page_start + xlat_page_size - xlat_page_current)
+
+#define xlat_malloc_init(xlat_page_current) do { \
+ xlat_page_current = xlat_page_start; \
+} while (0)
+
+static void *xlat_malloc(unsigned long *xlat_page_current, size_t size)
+{
+ void *ret;
+
+ /* normalize size to be 64 * n */
+ size = (size + 0x3fUL) & ~0x3fUL;
+
+ if ( unlikely(size > xlat_page_left_size(*xlat_page_current)) )
+ return NULL;
+
+ ret = (void *) *xlat_page_current;
+ *xlat_page_current += size;
+
+ return ret;
+}
+
+#define xlat_malloc_array(_p, _t, _c) ((_t *) xlat_malloc(&_p, sizeof(_t) * _c))
+
+static int copy_from_compat_state(xen_processor_cx_t *xen_state,
+ compat_processor_cx_t *state)
+{
+#define XLAT_processor_cx_HNDL_dp(_d_, _s_) do { \
+ XEN_GUEST_HANDLE(compat_processor_csd_t) dps; \
+ if ( unlikely(!compat_handle_okay((_s_)->dp, (_s_)->dpcnt)) ) \
+ return -EFAULT; \
+ guest_from_compat_handle(dps, (_s_)->dp); \
+ (_d_)->dp = guest_handle_cast(dps, xen_processor_csd_t); \
+} while (0)
+ XLAT_processor_cx(xen_state, state);
+#undef XLAT_processor_cx_HNDL_dp
+
+ return 0;
+}
+
+extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
+
+long compat_set_cx_pminfo(uint32_t cpu, struct compat_processor_power *power)
+{
+ struct xen_processor_power *xen_power;
+ unsigned long xlat_page_current;
+
+ xlat_malloc_init(xlat_page_current);
+
+ xen_power = xlat_malloc_array(xlat_page_current,
+ struct xen_processor_power, 1);
+ if ( unlikely(xen_power == NULL) )
+ return -EFAULT;
+
+#define XLAT_processor_power_HNDL_states(_d_, _s_) do { \
+ xen_processor_cx_t *xen_states = NULL; \
+\
+ if ( likely((_s_)->count > 0) ) \
+ { \
+ XEN_GUEST_HANDLE(compat_processor_cx_t) states; \
+ compat_processor_cx_t state; \
+ int i; \
+\
+ xen_states = xlat_malloc_array(xlat_page_current, \
+ xen_processor_cx_t, (_s_)->count); \
+ if ( unlikely(xen_states == NULL) ) \
+ return -EFAULT; \
+\
+ if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->count)) ) \
+ return -EFAULT; \
+ guest_from_compat_handle(states, (_s_)->states); \
+\
+ for ( i = 0; i < _s_->count; i++ ) \
+ { \
+ if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \
+ return -EFAULT; \
+ if ( unlikely(copy_from_compat_state(&xen_states[i], &state)) ) \
+ return -EFAULT; \
+ } \
+ } \
+\
+ set_xen_guest_handle((_d_)->states, xen_states); \
+} while (0)
+ XLAT_processor_power(xen_power, power);
+#undef XLAT_processor_power_HNDL_states
+
+ return set_cx_pminfo(cpu, xen_power);
+}
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH 1/9] Add cpu idle pwr mgmt to xen
2008-04-30 10:25 ` Jan Beulich
@ 2008-05-05 6:34 ` Tian, Kevin
0 siblings, 0 replies; 20+ messages in thread
From: Tian, Kevin @ 2008-05-05 6:34 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser, Wei, Gang
>From: Jan Beulich [mailto:jbeulich@novell.com]
>Sent: 2008年4月30日 18:25
>
>>>> "Tian, Kevin" <kevin.tian@intel.com> 30.04.08 11:42 >>>
>>What do you mean by partly copying things? For a 32-on-64 guest,
>>all pointers from guest are 32-bit and compat_handler_okay already
>>ensures compat pointers validity. Only native structure may have
>>64-bit pointer field, which is checked by common guest_handle_okay
>>if from a 64bit guest, or is trusted by increasing addr limitation if
>>from compat layer...
>
>VCPUOP_register_runstate_memory_area is an example of this.
>
Thanks for pointing out. However I still didn't understand why this
becomes the benefit of the existing approach. For a normal parameter
conversion, the steps can be:
a) check pointer validity upon compat address limitation
b) allocate native structure with content translated from compat version
c) gear to native handler which checks native address limitation
d) back update compat structure if possible
Existing approach allocates native structure in guest address space
at step b) to bypass address check in step c), while my suggestion
is to allocate native version in Xen space by temporarily improving
address limitation at step b). You can see in either approach where
all necessary checks at step a) have to be done correctly before steping
next. For example, where partly copy applies can always be achieved
even when rest part is copied into Xen space (mixed with guest handle
but validated at step a)). Also 64bit pointer has to be checked at step a)
before improving address limitation.
Well, I'm not against existing approach since I didn't find obvious cons
to not use it. :-) As I said earlier, the intent is to get more backgrounds
and make this compat slim clearer to me. BTW, is it possible to let
guest register such compat page within its own address space? This
can release Xen overhead from managing this extra range...
Thanks,
Kevin
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2008-05-05 6:34 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-25 5:07 [PATCH 1/9] Add cpu idle pwr mgmt to xen Wei, Gang
2008-04-25 13:00 ` Keir Fraser
2008-04-25 13:29 ` Wei, Gang
2008-04-26 9:55 ` Wei, Gang
2008-04-28 9:24 ` Jan Beulich
2008-04-30 3:27 ` Wei, Gang
2008-04-30 7:22 ` Jan Beulich
2008-04-30 8:54 ` Keir Fraser
2008-04-30 9:08 ` Wei, Gang
2008-04-30 9:12 ` Tian, Kevin
2008-04-30 9:18 ` Tian, Kevin
2008-04-30 9:35 ` Jan Beulich
2008-04-30 9:42 ` Tian, Kevin
2008-04-30 10:00 ` Keir Fraser
2008-04-30 10:25 ` Jan Beulich
2008-04-30 12:27 ` Keir Fraser
2008-04-30 10:25 ` Jan Beulich
2008-05-05 6:34 ` Tian, Kevin
2008-04-30 16:36 ` Wei, Gang
2008-05-01 0:48 ` Wei, Gang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.