* [RFC PATCH 2/7] xen/arm: implement HYPERVISOR_sysctl
2014-10-07 14:20 [RFC PATCH] xen_cpufreq implementation in kernel Oleksandr Dmytryshyn
2014-10-07 14:20 ` [RFC PATCH 1/7] PM / OPP: make cpufreq functions dependent on CONFIG_CPU_FREQ_TABLE Oleksandr Dmytryshyn
@ 2014-10-07 14:20 ` Oleksandr Dmytryshyn
2014-10-07 14:20 ` [RFC PATCH 3/7] xen/arm: implement HYPERVISOR_dom0_op Oleksandr Dmytryshyn
` (5 subsequent siblings)
7 siblings, 0 replies; 16+ messages in thread
From: Oleksandr Dmytryshyn @ 2014-10-07 14:20 UTC (permalink / raw)
To: Ian Campbell, Stefano Stabellini, Tim Deegan, xen-devel
Signed-off-by: Oleksandr Dmytryshyn <oleksandr.dmytryshyn@globallogic.com>
---
arch/arm/include/asm/xen/hypercall.h | 1 +
arch/arm/include/asm/xen/interface.h | 2 +
arch/arm/xen/enlighten.c | 1 +
arch/arm/xen/hypercall.S | 1 +
include/xen/interface/sysctl.h | 646 +++++++++++++++++++++++++++++++++++
include/xen/interface/xen.h | 6 +
6 files changed, 657 insertions(+)
create mode 100644 include/xen/interface/sysctl.h
diff --git a/arch/arm/include/asm/xen/hypercall.h b/arch/arm/include/asm/xen/hypercall.h
index c817c56..751869eb 100644
--- a/arch/arm/include/asm/xen/hypercall.h
+++ b/arch/arm/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@ int HYPERVISOR_memory_op(unsigned int cmd, void *arg);
int HYPERVISOR_physdev_op(int cmd, void *arg);
int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args);
int HYPERVISOR_tmem_op(void *arg);
+int HYPERVISOR_sysctl(void *arg);
static inline void
MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h
index 1151188..acf4b7a 100644
--- a/arch/arm/include/asm/xen/interface.h
+++ b/arch/arm/include/asm/xen/interface.h
@@ -19,6 +19,7 @@
__DEFINE_GUEST_HANDLE(name, struct name)
#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
#define GUEST_HANDLE(name) __guest_handle_ ## name
+#define GUEST_HANDLE_64(name) GUEST_HANDLE(name)
#define set_xen_guest_handle(hnd, val) \
do { \
@@ -48,6 +49,7 @@ DEFINE_GUEST_HANDLE(int);
DEFINE_GUEST_HANDLE(void);
DEFINE_GUEST_HANDLE(uint64_t);
DEFINE_GUEST_HANDLE(uint32_t);
+DEFINE_GUEST_HANDLE(uint8_t);
DEFINE_GUEST_HANDLE(xen_pfn_t);
DEFINE_GUEST_HANDLE(xen_ulong_t);
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index eb0d851..675f17a 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -350,4 +350,5 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_memory_op);
EXPORT_SYMBOL_GPL(HYPERVISOR_physdev_op);
EXPORT_SYMBOL_GPL(HYPERVISOR_vcpu_op);
EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op);
+EXPORT_SYMBOL_GPL(HYPERVISOR_sysctl);
EXPORT_SYMBOL_GPL(privcmd_call);
diff --git a/arch/arm/xen/hypercall.S b/arch/arm/xen/hypercall.S
index d1cf7b7..a1276df 100644
--- a/arch/arm/xen/hypercall.S
+++ b/arch/arm/xen/hypercall.S
@@ -89,6 +89,7 @@ HYPERCALL2(memory_op);
HYPERCALL2(physdev_op);
HYPERCALL3(vcpu_op);
HYPERCALL1(tmem_op);
+HYPERCALL1(sysctl);
ENTRY(privcmd_call)
stmdb sp!, {r4}
diff --git a/include/xen/interface/sysctl.h b/include/xen/interface/sysctl.h
new file mode 100644
index 0000000..1a8cf7a
--- /dev/null
+++ b/include/xen/interface/sysctl.h
@@ -0,0 +1,646 @@
+/******************************************************************************
+ * sysctl.h
+ *
+ * System management operations. For use by node control stack.
+ *
+ * Reused from xen: xen/include/public/sysctl.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2006, K Fraser
+ * Copyright (c) 2014, GlobalLogic Inc.
+ */
+
+#ifndef __XEN_PUBLIC_SYSCTL_H__
+#define __XEN_PUBLIC_SYSCTL_H__
+
+#include <xen/interface/xen.h>
+
+#define XEN_SYSCTL_INTERFACE_VERSION 0x0000000A
+
+/*
+ * Read console content from Xen buffer ring.
+ */
+/* XEN_SYSCTL_readconsole */
+struct xen_sysctl_readconsole {
+ /* IN: Non-zero -> clear after reading. */
+ uint8_t clear;
+ /* IN: Non-zero -> start index specified by @index field. */
+ uint8_t incremental;
+ uint8_t pad0, pad1;
+ /*
+ * IN: Start index for consuming from ring buffer (if @incremental);
+ * OUT: End index after consuming from ring buffer.
+ */
+ uint32_t index;
+ /* IN: Virtual address to write console data. */
+ GUEST_HANDLE_64(char) buffer;
+ /* IN: Size of buffer; OUT: Bytes written to buffer. */
+ uint32_t count;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_readconsole);
+
+/* Get trace buffers machine base address */
+/* XEN_SYSCTL_tbuf_op */
+struct xen_sysctl_tbuf_op {
+ /* IN variables */
+#define XEN_SYSCTL_TBUFOP_get_info 0
+#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
+#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
+#define XEN_SYSCTL_TBUFOP_set_size 3
+#define XEN_SYSCTL_TBUFOP_enable 4
+#define XEN_SYSCTL_TBUFOP_disable 5
+ uint32_t cmd;
+ /* IN/OUT variables */
+ struct xenctl_bitmap cpu_mask;
+ uint32_t evt_mask;
+ /* OUT variables */
+ uint64_aligned_t buffer_mfn;
+ uint32_t size; /* Also an IN variable! */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_tbuf_op);
+
+/*
+ * Get physical information about the host machine
+ */
+/* XEN_SYSCTL_physinfo */
+ /* (x86) The platform supports HVM guests. */
+#define _XEN_SYSCTL_PHYSCAP_hvm 0
+#define XEN_SYSCTL_PHYSCAP_hvm (1u<<_XEN_SYSCTL_PHYSCAP_hvm)
+ /* (x86) The platform supports HVM-guest direct access to I/O devices. */
+#define _XEN_SYSCTL_PHYSCAP_hvm_directio 1
+#define XEN_SYSCTL_PHYSCAP_hvm_directio (1u<<_XEN_SYSCTL_PHYSCAP_hvm_directio)
+struct xen_sysctl_physinfo {
+ uint32_t threads_per_core;
+ uint32_t cores_per_socket;
+ uint32_t nr_cpus; /* # CPUs currently online */
+ uint32_t max_cpu_id; /* Largest possible CPU ID on this host */
+ uint32_t nr_nodes; /* # nodes currently online */
+ uint32_t max_node_id; /* Largest possible node ID on this host */
+ uint32_t cpu_khz;
+ uint64_aligned_t total_pages;
+ uint64_aligned_t free_pages;
+ uint64_aligned_t scrub_pages;
+ uint64_aligned_t outstanding_pages;
+ uint32_t hw_cap[8];
+
+ /* XEN_SYSCTL_PHYSCAP_??? */
+ uint32_t capabilities;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_physinfo);
+
+/*
+ * Get the ID of the current scheduler.
+ */
+/* XEN_SYSCTL_sched_id */
+struct xen_sysctl_sched_id {
+ /* OUT variable */
+ uint32_t sched_id;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_sched_id);
+
+/* Interface for controlling Xen software performance counters. */
+/* XEN_SYSCTL_perfc_op */
+/* Sub-operations: */
+#define XEN_SYSCTL_PERFCOP_reset 1 /* Reset all counters to zero. */
+#define XEN_SYSCTL_PERFCOP_query 2 /* Get perfctr information. */
+struct xen_sysctl_perfc_desc {
+ char name[80]; /* name of perf counter */
+ uint32_t nr_vals; /* number of values for this counter */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_perfc_desc);
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_perfc_val);
+
+struct xen_sysctl_perfc_op {
+ /* IN variables. */
+ uint32_t cmd; /* XEN_SYSCTL_PERFCOP_??? */
+ /* OUT variables. */
+ uint32_t nr_counters; /* number of counters description */
+ uint32_t nr_vals; /* number of values */
+ /* counter information (or NULL) */
+ GUEST_HANDLE_64(xen_sysctl_perfc_desc) desc;
+ /* counter values (or NULL) */
+ GUEST_HANDLE_64(xen_sysctl_perfc_val) val;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_perfc_op);
+
+/* Inject debug keys into Xen. */
+/* XEN_SYSCTL_debug_keys */
+struct xen_sysctl_debug_keys {
+ /* IN variables. */
+ GUEST_HANDLE_64(char) keys;
+ uint32_t nr_keys;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_debug_keys);
+
+/* Get physical CPU information. */
+/* XEN_SYSCTL_getcpuinfo */
+struct xen_sysctl_cpuinfo {
+ uint64_aligned_t idletime;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_cpuinfo);
+struct xen_sysctl_getcpuinfo {
+ /* IN variables. */
+ uint32_t max_cpus;
+ GUEST_HANDLE_64(xen_sysctl_cpuinfo) info;
+ /* OUT variables. */
+ uint32_t nr_cpus;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_getcpuinfo);
+
+/* XEN_SYSCTL_availheap */
+struct xen_sysctl_availheap {
+ /* IN variables. */
+ uint32_t min_bitwidth; /* Smallest address width (zero if don't care) */
+ uint32_t max_bitwidth; /* Largest address width (zero if don't care) */
+ int32_t node; /* NUMA node of interest (-1 for all nodes) */
+ /* OUT variables. */
+ uint64_aligned_t avail_bytes;/* Bytes available in the specified region */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_availheap);
+
+/* XEN_SYSCTL_get_pmstat */
+struct pm_px_val {
+ uint64_aligned_t freq; /* Px core frequency */
+ uint64_aligned_t residency; /* Px residency time */
+ uint64_aligned_t count; /* Px transition count */
+};
+DEFINE_GUEST_HANDLE_STRUCT(pm_px_val);
+
+struct pm_px_stat {
+ uint8_t total; /* total Px states */
+ uint8_t usable; /* usable Px states */
+ uint8_t last; /* last Px state */
+ uint8_t cur; /* current Px state */
+ GUEST_HANDLE_64(uint64_t) trans_pt; /* Px transition table */
+ GUEST_HANDLE_64(pm_px_val) pt;
+};
+DEFINE_GUEST_HANDLE_STRUCT(pm_px_stat);
+
+struct pm_cx_stat {
+ uint32_t nr; /* entry nr in triggers & residencies, including C0 */
+ uint32_t last; /* last Cx state */
+ uint64_aligned_t idle_time; /* idle time from boot */
+ GUEST_HANDLE_64(uint64_t) triggers; /* Cx trigger counts */
+ GUEST_HANDLE_64(uint64_t) residencies; /* Cx residencies */
+ uint64_aligned_t pc2;
+ uint64_aligned_t pc3;
+ uint64_aligned_t pc6;
+ uint64_aligned_t pc7;
+ uint64_aligned_t cc3;
+ uint64_aligned_t cc6;
+ uint64_aligned_t cc7;
+};
+
+struct xen_sysctl_get_pmstat {
+#define PMSTAT_CATEGORY_MASK 0xf0
+#define PMSTAT_PX 0x10
+#define PMSTAT_CX 0x20
+#define PMSTAT_get_max_px (PMSTAT_PX | 0x1)
+#define PMSTAT_get_pxstat (PMSTAT_PX | 0x2)
+#define PMSTAT_reset_pxstat (PMSTAT_PX | 0x3)
+#define PMSTAT_get_max_cx (PMSTAT_CX | 0x1)
+#define PMSTAT_get_cxstat (PMSTAT_CX | 0x2)
+#define PMSTAT_reset_cxstat (PMSTAT_CX | 0x3)
+ uint32_t type;
+ uint32_t cpuid;
+ union {
+ struct pm_px_stat getpx;
+ struct pm_cx_stat getcx;
+ /* other struct for tx, etc */
+ } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_get_pmstat);
+
+/* XEN_SYSCTL_cpu_hotplug */
+struct xen_sysctl_cpu_hotplug {
+ /* IN variables */
+ uint32_t cpu; /* Physical cpu. */
+#define XEN_SYSCTL_CPU_HOTPLUG_ONLINE 0
+#define XEN_SYSCTL_CPU_HOTPLUG_OFFLINE 1
+ uint32_t op; /* hotplug opcode */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_cpu_hotplug);
+
+/*
+ * Get/set Xen power management settings, including:
+ * 1. cpufreq governors and related parameters
+ */
+/* XEN_SYSCTL_pm_op */
+struct xen_userspace {
+ uint32_t scaling_setspeed;
+};
+
+struct xen_ondemand {
+ uint32_t sampling_rate_max;
+ uint32_t sampling_rate_min;
+
+ uint32_t sampling_rate;
+ uint32_t up_threshold;
+};
+
+/*
+ * The cpufreq parameter names in this structure are the same as
+ * the corresponding sysfs file names in native Linux.
+ */
+#define CPUFREQ_NAME_LEN 16
+struct xen_get_cpufreq_para {
+ /* IN/OUT variable */
+ uint32_t cpu_num;
+ uint32_t freq_num;
+ uint32_t gov_num;
+
+ /* for all governors */
+ /* OUT variable */
+ GUEST_HANDLE_64(uint32_t) affected_cpus;
+ GUEST_HANDLE_64(uint32_t) scaling_available_frequencies;
+ GUEST_HANDLE_64(char) scaling_available_governors;
+ char scaling_driver[CPUFREQ_NAME_LEN];
+
+ uint32_t cpuinfo_cur_freq;
+ uint32_t cpuinfo_max_freq;
+ uint32_t cpuinfo_min_freq;
+ uint32_t scaling_cur_freq;
+
+ char scaling_governor[CPUFREQ_NAME_LEN];
+ uint32_t scaling_max_freq;
+ uint32_t scaling_min_freq;
+
+ /* for specific governor */
+ union {
+ struct xen_userspace userspace;
+ struct xen_ondemand ondemand;
+ } u;
+
+ int32_t turbo_enabled;
+};
+
+struct xen_set_cpufreq_gov {
+ char scaling_governor[CPUFREQ_NAME_LEN];
+};
+
+struct xen_set_cpufreq_para {
+ #define SCALING_MAX_FREQ 1
+ #define SCALING_MIN_FREQ 2
+ #define SCALING_SETSPEED 3
+ #define SAMPLING_RATE 4
+ #define UP_THRESHOLD 5
+
+ uint32_t ctrl_type;
+ uint32_t ctrl_value;
+};
+
+struct xen_sysctl_pm_op {
+ #define PM_PARA_CATEGORY_MASK 0xf0
+ #define CPUFREQ_PARA 0x10
+
+ /* cpufreq command type */
+ #define GET_CPUFREQ_PARA (CPUFREQ_PARA | 0x01)
+ #define SET_CPUFREQ_GOV (CPUFREQ_PARA | 0x02)
+ #define SET_CPUFREQ_PARA (CPUFREQ_PARA | 0x03)
+ #define GET_CPUFREQ_AVGFREQ (CPUFREQ_PARA | 0x04)
+
+ /* set/reset scheduler power saving option */
+ #define XEN_SYSCTL_pm_op_set_sched_opt_smt 0x21
+
+ /* cpuidle max_cstate access command */
+ #define XEN_SYSCTL_pm_op_get_max_cstate 0x22
+ #define XEN_SYSCTL_pm_op_set_max_cstate 0x23
+
+ /* set scheduler migration cost value */
+ #define XEN_SYSCTL_pm_op_set_vcpu_migration_delay 0x24
+ #define XEN_SYSCTL_pm_op_get_vcpu_migration_delay 0x25
+
+ /* enable/disable turbo mode when in dbs governor */
+ #define XEN_SYSCTL_pm_op_enable_turbo 0x26
+ #define XEN_SYSCTL_pm_op_disable_turbo 0x27
+
+ uint32_t cmd;
+ uint32_t cpuid;
+ union {
+ struct xen_get_cpufreq_para get_para;
+ struct xen_set_cpufreq_gov set_gov;
+ struct xen_set_cpufreq_para set_para;
+ uint64_aligned_t get_avgfreq;
+ uint32_t set_sched_opt_smt;
+ uint32_t get_max_cstate;
+ uint32_t set_max_cstate;
+ uint32_t get_vcpu_migration_delay;
+ uint32_t set_vcpu_migration_delay;
+ } u;
+};
+
+/* XEN_SYSCTL_page_offline_op */
+struct xen_sysctl_page_offline_op {
+ /* IN: range of page to be offlined */
+#define sysctl_page_offline 1
+#define sysctl_page_online 2
+#define sysctl_query_page_offline 3
+ uint32_t cmd;
+ uint32_t start;
+ uint32_t end;
+ /* OUT: result of page offline request */
+ /*
+ * bit 0~15: result flags
+ * bit 16~31: owner
+ */
+ GUEST_HANDLE(uint32_t) status;
+};
+
+#define PG_OFFLINE_STATUS_MASK (0xFFUL)
+
+/* The result is invalid, i.e. HV does not handle it */
+#define PG_OFFLINE_INVALID (0x1UL << 0)
+
+#define PG_OFFLINE_OFFLINED (0x1UL << 1)
+#define PG_OFFLINE_PENDING (0x1UL << 2)
+#define PG_OFFLINE_FAILED (0x1UL << 3)
+#define PG_OFFLINE_AGAIN (0x1UL << 4)
+
+#define PG_ONLINE_FAILED PG_OFFLINE_FAILED
+#define PG_ONLINE_ONLINED PG_OFFLINE_OFFLINED
+
+#define PG_OFFLINE_STATUS_OFFLINED (0x1UL << 1)
+#define PG_OFFLINE_STATUS_ONLINE (0x1UL << 2)
+#define PG_OFFLINE_STATUS_OFFLINE_PENDING (0x1UL << 3)
+#define PG_OFFLINE_STATUS_BROKEN (0x1UL << 4)
+
+#define PG_OFFLINE_MISC_MASK (0xFFUL << 4)
+
+/* valid when PG_OFFLINE_FAILED or PG_OFFLINE_PENDING */
+#define PG_OFFLINE_XENPAGE (0x1UL << 8)
+#define PG_OFFLINE_DOM0PAGE (0x1UL << 9)
+#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
+#define PG_OFFLINE_NOT_CONV_RAM (0x1UL << 11)
+#define PG_OFFLINE_OWNED (0x1UL << 12)
+
+#define PG_OFFLINE_BROKEN (0x1UL << 13)
+#define PG_ONLINE_BROKEN PG_OFFLINE_BROKEN
+
+#define PG_OFFLINE_OWNER_SHIFT 16
+
+/* XEN_SYSCTL_lockprof_op */
+/* Sub-operations: */
+#define XEN_SYSCTL_LOCKPROF_reset 1 /* Reset all profile data to zero. */
+#define XEN_SYSCTL_LOCKPROF_query 2 /* Get lock profile information. */
+/* Record-type: */
+#define LOCKPROF_TYPE_GLOBAL 0 /* global lock, idx meaningless */
+#define LOCKPROF_TYPE_PERDOM 1 /* per-domain lock, idx is domid */
+#define LOCKPROF_TYPE_N 2 /* number of types */
+struct xen_sysctl_lockprof_data {
+ char name[40]; /* lock name (may include up to 2 %d specifiers) */
+ int32_t type; /* LOCKPROF_TYPE_??? */
+ int32_t idx; /* index (e.g. domain id) */
+ uint64_aligned_t lock_cnt; /* # of locking succeeded */
+ uint64_aligned_t block_cnt; /* # of wait for lock */
+ uint64_aligned_t lock_time; /* nsecs lock held */
+ uint64_aligned_t block_time; /* nsecs waited for lock */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_lockprof_data);
+
+struct xen_sysctl_lockprof_op {
+ /* IN variables. */
+ uint32_t cmd; /* XEN_SYSCTL_LOCKPROF_??? */
+ uint32_t max_elem; /* size of output buffer */
+ /* OUT variables (query only). */
+ uint32_t nr_elem; /* number of elements available */
+ uint64_aligned_t time; /* nsecs of profile measurement */
+ /* profile information (or NULL) */
+ GUEST_HANDLE_64(xen_sysctl_lockprof_data) data;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_lockprof_op);
+
+/* XEN_SYSCTL_topologyinfo */
+#define INVALID_TOPOLOGY_ID (~0U)
+struct xen_sysctl_topologyinfo {
+ /*
+ * IN: maximum addressable entry in the caller-provided arrays.
+ * OUT: largest cpu identifier in the system.
+ * If OUT is greater than IN then the arrays are truncated!
+ * If OUT is less than IN then the array tails are not written by
+ * sysctl.
+ */
+ uint32_t max_cpu_index;
+
+ /*
+ * If not NULL, these arrays are filled with core/socket/node identifier
+ * for each cpu.
+ * If a cpu has no core/socket/node information (e.g., cpu not present)
+ * then the sentinel value ~0u is written to each array.
+ * The number of array elements written by the sysctl is:
+ * min(@max_cpu_index_IN,@max_cpu_index_OUT)+1
+ */
+ GUEST_HANDLE_64(uint32_t) cpu_to_core;
+ GUEST_HANDLE_64(uint32_t) cpu_to_socket;
+ GUEST_HANDLE_64(uint32_t) cpu_to_node;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_topologyinfo);
+
+/* XEN_SYSCTL_numainfo */
+#define INVALID_NUMAINFO_ID (~0U)
+struct xen_sysctl_numainfo {
+ /*
+ * IN: maximum addressable entry in the caller-provided arrays.
+ * OUT: largest node identifier in the system.
+ * If OUT is greater than IN then the arrays are truncated!
+ */
+ uint32_t max_node_index;
+
+ /* NB. Entries are 0 if node is not present. */
+ GUEST_HANDLE_64(uint64_t) node_to_memsize;
+ GUEST_HANDLE_64(uint64_t) node_to_memfree;
+
+ /*
+ * Array, of size (max_node_index+1)^2, listing memory access distances
+ * between nodes. If an entry has no node distance information (e.g., node
+ * not present) then the value ~0u is written.
+ *
+ * Note that the array rows must be indexed by multiplying by the minimum
+ * of the caller-provided max_node_index and the returned value of
+ * max_node_index. That is, if the largest node index in the system is
+ * smaller than the caller can handle, a smaller 2-d array is constructed
+ * within the space provided by the caller. When this occurs, trailing
+ * space provided by the caller is not modified. If the largest node index
+ * in the system is larger than the caller can handle, then a 2-d array of
+ * the maximum size handleable by the caller is constructed.
+ */
+ GUEST_HANDLE_64(uint32_t) node_to_node_distance;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_numainfo);
+
+/* XEN_SYSCTL_cpupool_op */
+#define XEN_SYSCTL_CPUPOOL_OP_CREATE 1 /* C */
+#define XEN_SYSCTL_CPUPOOL_OP_DESTROY 2 /* D */
+#define XEN_SYSCTL_CPUPOOL_OP_INFO 3 /* I */
+#define XEN_SYSCTL_CPUPOOL_OP_ADDCPU 4 /* A */
+#define XEN_SYSCTL_CPUPOOL_OP_RMCPU 5 /* R */
+#define XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN 6 /* M */
+#define XEN_SYSCTL_CPUPOOL_OP_FREEINFO 7 /* F */
+#define XEN_SYSCTL_CPUPOOL_PAR_ANY 0xFFFFFFFF
+struct xen_sysctl_cpupool_op {
+ uint32_t op; /* IN */
+ uint32_t cpupool_id; /* IN: CDIARM OUT: CI */
+ uint32_t sched_id; /* IN: C OUT: I */
+ uint32_t domid; /* IN: M */
+ uint32_t cpu; /* IN: AR */
+ uint32_t n_dom; /* OUT: I */
+ struct xenctl_bitmap cpumap; /* OUT: IF */
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_cpupool_op);
+
+#define ARINC653_MAX_DOMAINS_PER_SCHEDULE 64
+/*
+ * This structure is used to pass a new ARINC653 schedule from a
+ * privileged domain (ie dom0) to Xen.
+ */
+struct xen_sysctl_arinc653_schedule {
+ /* major_frame holds the time for the new schedule's major frame
+ * in nanoseconds. */
+ uint64_aligned_t major_frame;
+ /* num_sched_entries holds how many of the entries in the
+ * sched_entries[] array are valid. */
+ uint8_t num_sched_entries;
+ /* The sched_entries array holds the actual schedule entries. */
+ struct {
+ /* dom_handle must match a domain's UUID */
+ xen_domain_handle_t dom_handle;
+ /*
+ * If a domain has multiple VCPUs, vcpu_id specifies which one
+ * this schedule entry applies to. It should be set to 0 if
+ * there is only one VCPU for the domain. */
+ unsigned int vcpu_id;
+ /*
+ * runtime specifies the amount of time that should be allocated
+ * to this VCPU per major frame. It is specified in nanoseconds
+ */
+ uint64_aligned_t runtime;
+ } sched_entries[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_arinc653_schedule);
+
+struct xen_sysctl_credit_schedule {
+ /* Length of timeslice in milliseconds */
+#define XEN_SYSCTL_CSCHED_TSLICE_MAX 1000
+#define XEN_SYSCTL_CSCHED_TSLICE_MIN 1
+ unsigned tslice_ms;
+ /* Rate limit (minimum timeslice) in microseconds */
+#define XEN_SYSCTL_SCHED_RATELIMIT_MAX 500000
+#define XEN_SYSCTL_SCHED_RATELIMIT_MIN 100
+ unsigned ratelimit_us;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_credit_schedule);
+
+/* XEN_SYSCTL_scheduler_op */
+/* Set or get info? */
+#define XEN_SYSCTL_SCHEDOP_putinfo 0
+#define XEN_SYSCTL_SCHEDOP_getinfo 1
+struct xen_sysctl_scheduler_op {
+ uint32_t cpupool_id; /* Cpupool whose scheduler is to be targeted. */
+ uint32_t sched_id; /* XEN_SCHEDULER_* (domctl.h) */
+ uint32_t cmd; /* XEN_SYSCTL_SCHEDOP_* */
+ union {
+ struct xen_sysctl_sched_arinc653 {
+ GUEST_HANDLE_64(xen_sysctl_arinc653_schedule) schedule;
+ } sched_arinc653;
+ struct xen_sysctl_credit_schedule sched_credit;
+ } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_scheduler_op);
+
+/* XEN_SYSCTL_coverage_op */
+/*
+ * Get total size of information, to help allocate
+ * the buffer. The pointer points to a 32 bit value.
+ */
+#define XEN_SYSCTL_COVERAGE_get_total_size 0
+
+/*
+ * Read coverage information in a single run
+ * You must use a tool to split them.
+ */
+#define XEN_SYSCTL_COVERAGE_read 1
+
+/*
+ * Reset all the coverage counters to 0
+ * No parameters.
+ */
+#define XEN_SYSCTL_COVERAGE_reset 2
+
+/*
+ * Like XEN_SYSCTL_COVERAGE_read, but also reset the
+ * counters to 0 in a single call.
+ */
+#define XEN_SYSCTL_COVERAGE_read_and_reset 3
+
+struct xen_sysctl_coverage_op {
+ uint32_t cmd; /* XEN_SYSCTL_COVERAGE_* */
+ union {
+ uint32_t total_size; /* OUT */
+ GUEST_HANDLE_64(uint8_t) raw_info; /* OUT */
+ } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl_coverage_op);
+
+
+struct xen_sysctl {
+ uint32_t cmd;
+#define XEN_SYSCTL_readconsole 1
+#define XEN_SYSCTL_tbuf_op 2
+#define XEN_SYSCTL_physinfo 3
+#define XEN_SYSCTL_sched_id 4
+#define XEN_SYSCTL_perfc_op 5
+#define XEN_SYSCTL_debug_keys 7
+#define XEN_SYSCTL_getcpuinfo 8
+#define XEN_SYSCTL_availheap 9
+#define XEN_SYSCTL_get_pmstat 10
+#define XEN_SYSCTL_cpu_hotplug 11
+#define XEN_SYSCTL_pm_op 12
+#define XEN_SYSCTL_page_offline_op 14
+#define XEN_SYSCTL_lockprof_op 15
+#define XEN_SYSCTL_topologyinfo 16
+#define XEN_SYSCTL_numainfo 17
+#define XEN_SYSCTL_cpupool_op 18
+#define XEN_SYSCTL_scheduler_op 19
+#define XEN_SYSCTL_coverage_op 20
+ uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
+ union {
+ struct xen_sysctl_readconsole readconsole;
+ struct xen_sysctl_tbuf_op tbuf_op;
+ struct xen_sysctl_physinfo physinfo;
+ struct xen_sysctl_topologyinfo topologyinfo;
+ struct xen_sysctl_numainfo numainfo;
+ struct xen_sysctl_sched_id sched_id;
+ struct xen_sysctl_perfc_op perfc_op;
+ struct xen_sysctl_debug_keys debug_keys;
+ struct xen_sysctl_getcpuinfo getcpuinfo;
+ struct xen_sysctl_availheap availheap;
+ struct xen_sysctl_get_pmstat get_pmstat;
+ struct xen_sysctl_cpu_hotplug cpu_hotplug;
+ struct xen_sysctl_pm_op pm_op;
+ struct xen_sysctl_page_offline_op page_offline;
+ struct xen_sysctl_lockprof_op lockprof_op;
+ struct xen_sysctl_cpupool_op cpupool_op;
+ struct xen_sysctl_scheduler_op scheduler_op;
+ struct xen_sysctl_coverage_op coverage_op;
+ uint8_t pad[128];
+ } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_sysctl);
+
+#endif /* __XEN_PUBLIC_SYSCTL_H__ */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 53ec416..cf64566 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -57,6 +57,7 @@
#define __HYPERVISOR_event_channel_op 32
#define __HYPERVISOR_physdev_op 33
#define __HYPERVISOR_hvm_op 34
+#define __HYPERVISOR_sysctl 35
#define __HYPERVISOR_tmem_op 38
/* Architecture-specific hypercall definitions. */
@@ -526,6 +527,11 @@ struct tmem_op {
DEFINE_GUEST_HANDLE(u64);
+struct xenctl_bitmap {
+ GUEST_HANDLE_64(uint8_t) bitmap;
+ uint32_t nr_bits;
+};
+
#else /* __ASSEMBLY__ */
/* In assembly code we cannot use C numeric constant suffixes. */
--
1.9.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [RFC PATCH 7/7] xen/arm: cpufreq: add cpufreq driver
2014-10-07 14:20 [RFC PATCH] xen_cpufreq implementation in kernel Oleksandr Dmytryshyn
` (5 preceding siblings ...)
2014-10-07 14:20 ` [RFC PATCH 6/7] cpufreq: make CPU Frequency scaling drivers visible for XEN_DOM0 config Oleksandr Dmytryshyn
@ 2014-10-07 14:20 ` Oleksandr Dmytryshyn
2014-10-07 15:57 ` Julien Grall
2014-10-07 15:44 ` [RFC PATCH] xen_cpufreq implementation in kernel Julien Grall
7 siblings, 1 reply; 16+ messages in thread
From: Oleksandr Dmytryshyn @ 2014-10-07 14:20 UTC (permalink / raw)
To: Ian Campbell, Stefano Stabellini, Tim Deegan, xen-devel
Xen changes frequencies on CPUs using this driver.
Signed-off-by: Oleksandr Dmytryshyn <oleksandr.dmytryshyn@globallogic.com>
---
drivers/xen/Kconfig | 20 +
drivers/xen/Makefile | 1 +
drivers/xen/xen-cpufreq.c | 882 ++++++++++++++++++++++++++++++++++++++++++++
include/xen/interface/xen.h | 1 +
4 files changed, 904 insertions(+)
create mode 100644 drivers/xen/xen-cpufreq.c
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 254a5cc..bb2d3d5 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -220,6 +220,26 @@ config XEN_ACPI_PROCESSOR
called xen_acpi_processor If you do not know what to choose, select
M here. If the CPUFREQ drivers are built in, select Y here.
+config XEN_CPUFREQ
+ bool "Xen Cpufreq driver"
+ depends on XEN_DOM0 && !CPU_FREQ
+ default n
+ help
+ This driver uploads Power Management information to the Xen
+ hypervisor and changes CPUs frequency using CPU Frequency scaling
+ drivers.
+
+ To do that the driver uses CPU Frequency scaling drivers to parse
+ the Power Management data and uploads said information to the Xen
+ hypervisor. Then the Xen hypervisor can select the proper Pxx states.
+
+ Then the Xen hypervisor can change CPUs frequency by giving commands
+ via this driver to the CPU Frequency scaling driver.
+
+ This option is a bool, so the driver can only be built in, not
+ compiled as a module. If you do not know what to choose, select
+ N here. If the CPUFREQ drivers are built in, select Y here.
+
config XEN_MCE_LOG
bool "Xen platform mcelog"
depends on XEN_DOM0 && X86_64 && X86_MCE
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index b7c835f..0345d65 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o
obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o
obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o
+obj-$(CONFIG_XEN_CPUFREQ) += xen-cpufreq.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
xen-gntalloc-y := gntalloc.o
diff --git a/drivers/xen/xen-cpufreq.c b/drivers/xen/xen-cpufreq.c
new file mode 100644
index 0000000..a0d9adc
--- /dev/null
+++ b/drivers/xen/xen-cpufreq.c
@@ -0,0 +1,882 @@
+/*
+ * Copyright (C) 2001 Russell King
+ * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
+ *
+ * Oct 2005 - Ashok Raj <ashok.raj@intel.com>
+ * Added handling for CPU hotplug
+ * Feb 2006 - Jacob Shin <jacob.shin@amd.com>
+ * Fix handling for CPU hotplug -- affected CPUs
+ *
+ * (C) 2014 GlobalLogic Inc.
+ *
+ * Based on drivers/cpufreq/cpufreq.c
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/irq.h>
+#include <linux/workqueue.h>
+#include <linux/cpufreq.h>
+
+#include <trace/events/power.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/sysctl.h>
+#include <asm/xen/hypercall.h>
+
+#ifdef CONFIG_CPUMASK_OFFSTACK
+#error CONFIG_CPUMASK_OFFSTACK config should not be used with this driver
+#endif
+
+static int xen_nr_cpus;
+static int xen_irq;
+
+/*
+ * Iterate over the CPUs set in @mask, using @cpu as the cursor. The walk
+ * starts from -1 via cpumask_next() and stops as soon as the next CPU
+ * number is not below xen_nr_cpus.
+ */
+#define for_each_xen_cpu(cpu, mask) \
+ for ((cpu) = -1; \
+ (cpu) = cpumask_next((cpu), (mask)), \
+ (cpu) < xen_nr_cpus;)
+
+static struct cpufreq_driver *xen_cpufreq_driver;
+static DEFINE_PER_CPU(struct cpufreq_policy *, xen_cpufreq_cpu_data);
+
+static DEFINE_SPINLOCK(xen_cpufreq_driver_lock);
+
+/*
+ * xen_cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
+ * all cpufreq/hotplug/workqueue/etc related lock issues.
+ *
+ * The rules for this semaphore:
+ * - Any routine that wants to read from the policy structure will
+ * do a down_read on this semaphore.
+ * - Any routine that will write to the policy structure and/or may take away
+ * the policy altogether (eg. CPU hotplug), will hold this lock in write
+ * mode before doing so.
+ *
+ * Additional rules:
+ * - Governor routines that can be called in cpufreq hotplug path should not
+ * take this sem as top level hotplug notifier handler takes this.
+ * - Lock should not be held across
+ * __cpufreq_governor(data, CPUFREQ_GOV_STOP);
+ */
+static DEFINE_PER_CPU(int, xen_cpufreq_policy_cpu);
+static DEFINE_PER_CPU(struct rw_semaphore, xen_cpu_policy_rwsem);
+
+/*
+ * Generator for lock_policy_rwsem_<mode>(cpu): looks up the policy CPU
+ * recorded for @cpu in xen_cpufreq_policy_cpu, BUGs if it is still -1,
+ * then takes that policy CPU's rwsem with down_<mode>(). The generated
+ * function always returns 0. Only the "write" flavour is instantiated.
+ */
+#define lock_policy_rwsem(mode, cpu) \
+static int lock_policy_rwsem_##mode \
+(int cpu) \
+{ \
+ int policy_cpu = per_cpu(xen_cpufreq_policy_cpu, cpu); \
+ BUG_ON(policy_cpu == -1); \
+ down_##mode(&per_cpu(xen_cpu_policy_rwsem, policy_cpu)); \
+ \
+ return 0; \
+}
+
+lock_policy_rwsem(write, cpu);
+
+/*
+ * Drop the write-held rwsem belonging to the policy CPU that @cpu is
+ * currently mapped to. BUGs if @cpu has no policy CPU recorded (-1).
+ */
+static void unlock_policy_rwsem_write(int cpu)
+{
+	int owner = per_cpu(xen_cpufreq_policy_cpu, cpu);
+
+	BUG_ON(owner == -1);
+	up_write(&per_cpu(xen_cpu_policy_rwsem, owner));
+}
+
+/**
+ * The "transition" notifier list for kernel code that needs to handle
+ * changes to devices when the CPU clock speed changes.
+ * The mutex locks this list.
+ */
+static struct srcu_notifier_head xen_cpufreq_transition_notifier_list;
+
+/* Set to true once the transition notifier head has been initialised. */
+static bool init_cpufreq_transition_notifier_list_called;
+
+/*
+ * Initialise the SRCU head of xen_cpufreq_transition_notifier_list and
+ * record that this has happened. Registered as a pure_initcall; always
+ * returns 0.
+ */
+static int __init init_cpufreq_transition_notifier_list(void)
+{
+	struct srcu_notifier_head *head = &xen_cpufreq_transition_notifier_list;
+
+	srcu_init_notifier_head(head);
+	init_cpufreq_transition_notifier_list_called = true;
+
+	return 0;
+}
+pure_initcall(init_cpufreq_transition_notifier_list);
+
+/*
+ * Look up the policy currently attached to @cpu.
+ *
+ * Returns NULL when @cpu is not below xen_nr_cpus, when no cpufreq driver
+ * is registered, or when no policy has been stored for @cpu. The lookup is
+ * done under xen_cpufreq_driver_lock; no reference of any kind is taken
+ * here.
+ */
+static struct cpufreq_policy *xen_cpufreq_cpu_get(unsigned int cpu)
+{
+	struct cpufreq_policy *policy = NULL;
+	unsigned long irq_flags;
+
+	if (cpu >= xen_nr_cpus)
+		return NULL;
+
+	spin_lock_irqsave(&xen_cpufreq_driver_lock, irq_flags);
+	/* Only hand out per-CPU data while a driver is registered. */
+	if (xen_cpufreq_driver)
+		policy = per_cpu(xen_cpufreq_cpu_data, cpu);
+	spin_unlock_irqrestore(&xen_cpufreq_driver_lock, irq_flags);
+
+	return policy;
+}
+
+/*
+ * Counterpart of xen_cpufreq_cpu_get().
+ *
+ * xen_cpufreq_cpu_get() only reads the per-CPU policy pointer and does not
+ * take a module reference, so there is nothing to release here. Calling
+ * module_put() at this point would drop a reference that was never
+ * acquired and underflow the low-level driver's module refcount on every
+ * get/put pair (for instance on each frequency change request).
+ *
+ * The function is kept so that call sites stay paired with the get.
+ */
+static void xen_cpufreq_cpu_put(struct cpufreq_policy *data)
+{
+}
+
+/*
+ * Upload the P-state table of @policy to the hypervisor.
+ *
+ * Validates @table, builds a xen_processor_performance description in
+ * reverse order of the table (highest frequency first) and issues one
+ * XENPF_set_processor_pminfo / XEN_PM_PX request per CPU in policy->cpus.
+ *
+ * Returns 0 on success, -EINVAL when the table is malformed or has no
+ * valid entry, -ENOMEM when the state array cannot be allocated, or the
+ * error returned by HYPERVISOR_dom0_op().
+ */
+static int push_data_to_hypervisor(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table)
+{
+ int ret = 0;
+ unsigned int i;
+ unsigned int cpu;
+ uint32_t platform_limit = 0;
+ unsigned int max_freq = 0;
+ unsigned int state_count = 0;
+ unsigned int prev_freq = 0;
+ struct xen_processor_px *dst_states;
+ struct xen_processor_performance *dst_perf;
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.type = XEN_PM_PX,
+ };
+
+ dst_perf = &op.u.set_pminfo.perf;
+
+ /* Check freq table and find max frequency */
+ for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
+ unsigned int freq = table[i].frequency;
+ if (freq == CPUFREQ_ENTRY_INVALID)
+ continue;
+
+ /*
+ * Valid entries must be numbered 0, 1, 2, ... in order and
+ * carry strictly increasing frequencies.
+ */
+ if (table[i].index != state_count || freq <= prev_freq) {
+ pr_err("Frequency table format error\n");
+ return -EINVAL;
+ }
+
+ prev_freq = freq;
+ state_count++;
+ if (freq > max_freq)
+ max_freq = freq;
+ }
+
+ if (!state_count)
+ return -EINVAL;
+
+ dst_perf->state_count = state_count;
+
+ dst_states = kcalloc(state_count,
+ sizeof(struct xen_processor_px), GFP_KERNEL);
+
+ if (!dst_states)
+ return -ENOMEM;
+
+ set_xen_guest_handle(dst_perf->states, dst_states);
+
+ /*
+ * Freq table should start from lower values
+ * dst_states should start from higher values
+ */
+ for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
+ unsigned int freq = table[i].frequency;
+ /* Mirror the index so the highest frequency lands in slot 0. */
+ unsigned int tbl_index = state_count - 1 - table[i].index;
+ if (freq == CPUFREQ_ENTRY_INVALID)
+ continue;
+
+ if (freq == max_freq)
+ platform_limit = tbl_index;
+
+ /*
+ * Both values are scaled down by 1000; the debug output below
+ * labels them MHz and uS respectively.
+ */
+ dst_states[tbl_index].core_frequency = freq / 1000;
+ dst_states[tbl_index].transition_latency =
+ policy->cpuinfo.transition_latency / 1000;
+ }
+
+ dst_perf->shared_type = policy->shared_type;
+ dst_perf->platform_limit = platform_limit;
+ dst_perf->domain_info.domain = policy->cpu;
+ dst_perf->domain_info.num_processors = xen_nr_cpus;
+ dst_perf->flags = XEN_PX_PSS | XEN_PX_PSD | XEN_PX_PPC;
+
+ /* The same description is sent once for every CPU of the policy. */
+ for_each_xen_cpu(cpu, policy->cpus) {
+ op.u.set_pminfo.id = cpu;
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret) {
+ pr_debug("Hypervisor error(%d) for CPU%u\n", ret, cpu);
+ goto err_free_states;
+ }
+ pr_debug("CPU%u - P-states uploaded\n", cpu);
+
+ for (i = 0; i < dst_perf->state_count; i++) {
+ pr_debug(" state %d: %d MHz, %d uS\n",
+ i, (u32) dst_states[i].core_frequency,
+ (u32) dst_states[i].transition_latency);
+ }
+ }
+
+ /* Reached on success as well: the state array is always freed. */
+err_free_states:
+ kfree(dst_states);
+ return ret;
+}
+
+/*
+ * Check whether another CPU in policy->cpus already has a policy and, if
+ * so, attach @cpu to that existing ("managed") policy instead.
+ *
+ * Called with the policy rwsem of @cpu held for writing; on the managed
+ * path that lock is swapped for the rwsem of the managing policy's CPU.
+ *
+ * Returns:
+ * Negative: Failure
+ * 0: Success, no existing policy was found
+ * Positive: @cpu was attached to an already managed policy
+ */
+static int xen_cpufreq_add_dev_policy(unsigned int cpu,
+ struct cpufreq_policy *policy)
+{
+ int ret = 0;
+#ifdef CONFIG_SMP
+ unsigned long flags;
+ unsigned int j;
+
+ for_each_cpu(j, policy->cpus) {
+ struct cpufreq_policy *managed_policy;
+
+ if (cpu == j)
+ continue;
+
+ /* Check for existing affected CPUs.
+ * They may not be aware of it due to CPU Hotplug.
+ * The matching xen_cpufreq_cpu_put() is called when the
+ * device is removed in __xen_cpufreq_remove_dev()
+ */
+ managed_policy = xen_cpufreq_cpu_get(j);
+ if (unlikely(managed_policy)) {
+ /* Set proper policy_cpu */
+ /*
+ * The lock must be dropped before policy_cpu changes,
+ * because lock/unlock resolve the rwsem through it.
+ */
+ unlock_policy_rwsem_write(cpu);
+ per_cpu(xen_cpufreq_policy_cpu, cpu) =
+ managed_policy->cpu;
+
+ /*
+ * lock_policy_rwsem_write() as generated in this file
+ * always returns 0, so this branch is not taken.
+ */
+ if (lock_policy_rwsem_write(cpu) < 0) {
+ /* Should not go through policy unlock path */
+ if (xen_cpufreq_driver->exit)
+ xen_cpufreq_driver->exit(policy);
+ xen_cpufreq_cpu_put(managed_policy);
+ return -EBUSY;
+ }
+
+ spin_lock_irqsave(&xen_cpufreq_driver_lock, flags);
+ cpumask_copy(managed_policy->cpus, policy->cpus);
+ per_cpu(xen_cpufreq_cpu_data, cpu) = managed_policy;
+ spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+
+ pr_debug("CPU already managed, adding link\n");
+
+ /*
+ * Success. We only needed to be added to the mask.
+ * Call driver->exit() because only the CPU owning the
+ * policy needed to call init().
+ */
+ if (xen_cpufreq_driver->exit)
+ xen_cpufreq_driver->exit(policy);
+
+ return 1;
+ }
+ }
+#endif
+ return ret;
+}
+
+/**
+ * xen_cpufreq_add_dev - add a CPU device
+ * @cpu: number of the CPU to set up
+ *
+ * Allocates a policy for @cpu, lets the registered driver initialise it
+ * and publishes it in the per-CPU tables for every CPU in policy->cpus.
+ * If @cpu already has a policy, or turns out to belong to a policy owned
+ * by another CPU, nothing new is published and 0 is returned.
+ *
+ * Returns 0 on success, -EINVAL if the driver module reference cannot be
+ * taken, -ENOMEM on allocation failure, or the error from the driver's
+ * init() callback / xen_cpufreq_add_dev_policy().
+ */
+static int xen_cpufreq_add_dev(unsigned int cpu)
+{
+ int ret = 0;
+ struct cpufreq_policy *policy;
+ unsigned long flags;
+ unsigned int j;
+
+ pr_debug("adding CPU %u\n", cpu);
+
+#ifdef CONFIG_SMP
+ /* check whether a different CPU already registered this
+ * CPU because it is in the same boat. */
+ policy = xen_cpufreq_cpu_get(cpu);
+ if (unlikely(policy)) {
+ xen_cpufreq_cpu_put(policy);
+ return 0;
+ }
+#endif
+
+ /* Pin the driver module for the duration of the setup. */
+ if (!try_module_get(xen_cpufreq_driver->owner)) {
+ ret = -EINVAL;
+ goto module_out;
+ }
+
+ ret = -ENOMEM;
+ policy = kzalloc(sizeof(struct cpufreq_policy), GFP_KERNEL);
+ if (!policy)
+ goto nomem_out;
+
+ if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL))
+ goto err_free_policy;
+
+ if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL))
+ goto err_free_cpumask;
+
+ policy->cpu = cpu;
+ cpumask_copy(policy->cpus, cpumask_of(cpu));
+
+ /* Initially set CPU itself as the policy_cpu */
+ per_cpu(xen_cpufreq_policy_cpu, cpu) = cpu;
+ ret = (lock_policy_rwsem_write(cpu) < 0);
+ WARN_ON(ret);
+
+ /* call driver. From then on the cpufreq must be able
+ * to accept all calls to ->verify and ->setpolicy for this CPU
+ */
+ ret = xen_cpufreq_driver->init(policy);
+ if (ret) {
+ pr_debug("initialization failed\n");
+ goto err_unlock_policy;
+ }
+ ret = xen_cpufreq_add_dev_policy(cpu, policy);
+ if (ret) {
+ if (ret > 0)
+ /* This is a managed cpu, it was attached to the
+ existing policy, exit with 0 */
+ ret = 0;
+ /*
+ * Either way the freshly allocated policy is not needed
+ * any more and is released through the error path.
+ */
+ goto err_unlock_policy;
+ }
+
+ /* Publish the new policy for every CPU it covers. */
+ spin_lock_irqsave(&xen_cpufreq_driver_lock, flags);
+ for_each_cpu(j, policy->cpus) {
+ per_cpu(xen_cpufreq_cpu_data, j) = policy;
+ per_cpu(xen_cpufreq_policy_cpu, j) = policy->cpu;
+ }
+ spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+
+ unlock_policy_rwsem_write(cpu);
+
+ /* Balances the try_module_get() above. */
+ module_put(xen_cpufreq_driver->owner);
+ pr_debug("initialization complete\n");
+
+ return 0;
+
+err_unlock_policy:
+ unlock_policy_rwsem_write(cpu);
+ free_cpumask_var(policy->related_cpus);
+err_free_cpumask:
+ free_cpumask_var(policy->cpus);
+err_free_policy:
+ kfree(policy);
+nomem_out:
+ module_put(xen_cpufreq_driver->owner);
+module_out:
+ return ret;
+}
+
+/**
+ * __xen_cpufreq_remove_dev - remove a CPU device
+ * @cpu: number of the CPU to tear down
+ *
+ * Detaches @cpu from its policy. If @cpu owns the policy, the per-CPU
+ * pointers of all other CPUs sharing it are cleared as well, the driver's
+ * exit() callback is invoked and the policy is freed.
+ * Caller should already have policy_rwsem in write mode for this CPU.
+ * This routine releases the rwsem before returning.
+ *
+ * Returns 0 on success or -EINVAL if @cpu has no policy attached.
+ */
+static int __xen_cpufreq_remove_dev(unsigned int cpu)
+{
+ unsigned long flags;
+ struct cpufreq_policy *data;
+#ifdef CONFIG_SMP
+ unsigned int j;
+#endif
+
+ pr_debug("unregistering CPU %u\n", cpu);
+
+ spin_lock_irqsave(&xen_cpufreq_driver_lock, flags);
+ data = per_cpu(xen_cpufreq_cpu_data, cpu);
+
+ if (!data) {
+ spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+ unlock_policy_rwsem_write(cpu);
+ return -EINVAL;
+ }
+ per_cpu(xen_cpufreq_cpu_data, cpu) = NULL;
+
+
+#ifdef CONFIG_SMP
+ /* if this isn't the CPU which owns the policy, we
+ * only need to unlink, put and exit
+ */
+ if (unlikely(cpu != data->cpu)) {
+ pr_debug("removing link\n");
+ cpumask_clear_cpu(cpu, data->cpus);
+ spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+ xen_cpufreq_cpu_put(data);
+ unlock_policy_rwsem_write(cpu);
+ return 0;
+ }
+#endif
+
+#ifdef CONFIG_SMP
+
+ /* if we have other CPUs still registered, we need to unlink them.
+ * Clean the per_cpu(xen_cpufreq_cpu_data) while holding the lock,
+ * and drop the per-CPU references afterwards.
+ */
+ if (unlikely(cpumask_weight(data->cpus) > 1)) {
+ for_each_cpu(j, data->cpus) {
+ if (j == cpu)
+ continue;
+ per_cpu(xen_cpufreq_cpu_data, j) = NULL;
+ }
+ }
+
+ spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+
+ if (unlikely(cpumask_weight(data->cpus) > 1)) {
+ for_each_cpu(j, data->cpus) {
+ if (j == cpu)
+ continue;
+ pr_debug("removing link for cpu %u\n", j);
+ /* The rwsem is released and re-taken around each put. */
+ unlock_policy_rwsem_write(cpu);
+ lock_policy_rwsem_write(cpu);
+ xen_cpufreq_cpu_put(data);
+ }
+ }
+#else
+ spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+#endif
+
+ unlock_policy_rwsem_write(cpu);
+
+ /* The driver's exit() callback runs with the rwsem held again. */
+ lock_policy_rwsem_write(cpu);
+ if (xen_cpufreq_driver->exit)
+ xen_cpufreq_driver->exit(data);
+ unlock_policy_rwsem_write(cpu);
+
+ free_cpumask_var(data->related_cpus);
+ free_cpumask_var(data->cpus);
+ kfree(data);
+
+ return 0;
+}
+
+/*
+ * Remove the cpufreq state of @cpu.
+ *
+ * Takes the policy rwsem of @cpu for writing and hands over to
+ * __xen_cpufreq_remove_dev(), which releases it again.
+ *
+ * A CPU that was never added still has its policy CPU set to -1 (the
+ * value assigned in xen_cpufreq_init()). The error path of
+ * cpufreq_register_driver() calls this function for every CPU, including
+ * ones that were never set up, so such CPUs are rejected here instead of
+ * tripping the BUG_ON() in lock_policy_rwsem_write().
+ *
+ * Returns 0 on success or -EINVAL if @cpu has no policy to remove.
+ */
+static int xen_cpufreq_remove_dev(unsigned int cpu)
+{
+	if (per_cpu(xen_cpufreq_policy_cpu, cpu) == -1)
+		return -EINVAL;
+
+	if (unlikely(lock_policy_rwsem_write(cpu)))
+		BUG();
+
+	return __xen_cpufreq_remove_dev(cpu);
+}
+
+/*********************************************************************
+ * EXTERNALLY AFFECTING FREQUENCY CHANGES *
+ *********************************************************************/
+
+/**
+ * adjust_jiffies - rescale the global "loops_per_jiffy" after a clock change
+ * @val: transition event (CPUFREQ_POSTCHANGE, CPUFREQ_RESUMECHANGE, ...)
+ * @ci: description of the frequency change
+ *
+ * On uniprocessor builds the first call records the current
+ * loops_per_jiffy together with ci->old as a reference point; later calls
+ * scale loops_per_jiffy from that reference to ci->new. Nothing is done
+ * when the driver declares CPUFREQ_CONST_LOOPS. On SMP builds the global
+ * value cannot be adjusted because each CPU might be scaled differently,
+ * so the function is an empty stub there.
+ */
+#ifndef CONFIG_SMP
+static unsigned long l_p_j_ref;
+static unsigned int l_p_j_ref_freq;
+
+static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
+{
+	bool rescale;
+
+	if (ci->flags & CPUFREQ_CONST_LOOPS)
+		return;
+
+	/* Capture the reference point the first time we get here. */
+	if (!l_p_j_ref_freq) {
+		l_p_j_ref = loops_per_jiffy;
+		l_p_j_ref_freq = ci->old;
+		pr_debug("saving %lu as reference value for loops_per_jiffy; "
+			 "freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq);
+	}
+
+	rescale = (val == CPUFREQ_POSTCHANGE && ci->old != ci->new) ||
+		  val == CPUFREQ_RESUMECHANGE ||
+		  val == CPUFREQ_SUSPENDCHANGE;
+	if (!rescale)
+		return;
+
+	loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, ci->new);
+	pr_debug("scaling loops_per_jiffy to %lu "
+		 "for frequency %u kHz\n", loops_per_jiffy, ci->new);
+}
+#else
+static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
+{
+}
+#endif
+
+
+/**
+ * cpufreq_notify_transition - call notifier chain and adjust_jiffies
+ * on frequency transition.
+ * @freqs: description of the transition (cpu, old and new frequency)
+ * @state: CPUFREQ_PRECHANGE or CPUFREQ_POSTCHANGE; other values are ignored
+ *
+ * This function calls the transition notifiers and the "adjust_jiffies"
+ * function. It is called twice on all CPU frequency changes that have
+ * external effects. Must not be called with interrupts disabled.
+ */
+void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
+{
+ struct cpufreq_policy *policy;
+
+ BUG_ON(irqs_disabled());
+
+ freqs->flags = xen_cpufreq_driver->flags;
+ pr_debug("notification %u of frequency transition to %u kHz\n",
+ state, freqs->new);
+
+ /* May be NULL; every use below checks it first. */
+ policy = per_cpu(xen_cpufreq_cpu_data, freqs->cpu);
+ switch (state) {
+ case CPUFREQ_PRECHANGE:
+ /* detect if the driver reported a value as "old frequency"
+ * which is not equal to what the cpufreq core thinks is
+ * "old frequency".
+ */
+ if (!(xen_cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
+ if ((policy) && (policy->cpu == freqs->cpu) &&
+ (policy->cur) && (policy->cur != freqs->old)) {
+ pr_debug("Warning: CPU frequency is"
+ " %u, cpufreq assumed %u kHz.\n",
+ freqs->old, policy->cur);
+ /* Trust the value tracked in the policy. */
+ freqs->old = policy->cur;
+ }
+ }
+ srcu_notifier_call_chain(&xen_cpufreq_transition_notifier_list,
+ CPUFREQ_PRECHANGE, freqs);
+ adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
+ break;
+
+ case CPUFREQ_POSTCHANGE:
+ adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
+ pr_debug("FREQ: %lu - CPU: %lu\n", (unsigned long)freqs->new,
+ (unsigned long)freqs->cpu);
+ trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+ trace_cpu_frequency(freqs->new, freqs->cpu);
+ srcu_notifier_call_chain(&xen_cpufreq_transition_notifier_list,
+ CPUFREQ_POSTCHANGE, freqs);
+ /* Record the new frequency only on the policy's own CPU. */
+ if (likely(policy) && likely(policy->cpu == freqs->cpu))
+ policy->cur = freqs->new;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
+
+/*********************************************************************
+ * GOVERNORS *
+ *********************************************************************/
+
+/*
+ * Ask the registered driver to switch @policy to @target_freq.
+ *
+ * The request is first limited to the [policy->min, policy->max] range.
+ * Returns 0 without calling the driver when the limited value equals
+ * policy->cur, -EINVAL when the driver has no target() callback, and the
+ * callback's return value otherwise.
+ */
+int __xen_cpufreq_driver_target(struct cpufreq_policy *policy,
+				unsigned int target_freq,
+				unsigned int relation)
+{
+	unsigned int requested = target_freq;
+
+	/* Make sure that target_freq is within supported range */
+	if (target_freq > policy->max)
+		target_freq = policy->max;
+	if (target_freq < policy->min)
+		target_freq = policy->min;
+
+	pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n",
+		 policy->cpu, target_freq, relation, requested);
+
+	if (target_freq == policy->cur)
+		return 0;
+
+	if (!xen_cpufreq_driver->target)
+		return -EINVAL;
+
+	return xen_cpufreq_driver->target(policy, target_freq, relation);
+}
+
+/*
+ * Locked wrapper around __xen_cpufreq_driver_target() for @cpu.
+ *
+ * Looks up the policy of @cpu, performs the frequency change with the
+ * policy rwsem held for writing and releases the policy again.
+ * Returns -EINVAL when @cpu has no policy, otherwise the result of
+ * __xen_cpufreq_driver_target().
+ */
+int xen_cpufreq_driver_target(unsigned int cpu,
+			      unsigned int target_freq,
+			      unsigned int relation)
+{
+	struct cpufreq_policy *policy = xen_cpufreq_cpu_get(cpu);
+	int ret = -EINVAL;
+
+	if (!policy)
+		return ret;
+
+	if (likely(!lock_policy_rwsem_write(policy->cpu))) {
+		ret = __xen_cpufreq_driver_target(policy, target_freq,
+						  relation);
+		unlock_policy_rwsem_write(policy->cpu);
+	}
+
+	xen_cpufreq_cpu_put(policy);
+	return ret;
+}
+
+/*********************************************************************
+ * HANDLE COMMANDS FROM XEN *
+ *********************************************************************/
+static void xen_cpufreq_work_hnd(struct work_struct *w);
+
+static struct workqueue_struct *xen_cpufreq_wq;
+static DECLARE_WORK(xen_cpufreq_work, xen_cpufreq_work_hnd);
+
+/*
+ * Work handler run after the cpufreq VIRQ has fired.
+ *
+ * Fetches the pending request from the hypervisor with
+ * XEN_SYSCTL_CPUFREQ_get_target, applies it through
+ * xen_cpufreq_driver_target() and reports the outcome back with
+ * XEN_SYSCTL_CPUFREQ_set_result. Hypercall failures are only logged.
+ */
+static void xen_cpufreq_work_hnd(struct work_struct *w)
+{
+	struct xen_sysctl op = {
+		.cmd = XEN_SYSCTL_cpufreq_op,
+		.interface_version = XEN_SYSCTL_INTERFACE_VERSION,
+	};
+	struct xen_sysctl_cpufreq_op *req = &op.u.cpufreq_op;
+	int target_result;
+	int err;
+
+	req->cmd = XEN_SYSCTL_CPUFREQ_get_target;
+	err = HYPERVISOR_sysctl(&op);
+	if (err) {
+		pr_err("Hypervisor cpufreq error get targer (%d)\n", err);
+		return;
+	}
+
+	target_result = xen_cpufreq_driver_target(req->u.target.cpu,
+						  req->u.target.freq,
+						  req->u.target.relation);
+
+	/* Reuse the same request buffer to send the result back. */
+	req->cmd = XEN_SYSCTL_CPUFREQ_set_result;
+	req->u.result = target_result;
+	err = HYPERVISOR_sysctl(&op);
+	if (err)
+		pr_err("Hypervisor cpufreq set result error (%d)\n", err);
+}
+
+/*
+ * Interrupt handler for the cpufreq VIRQ: defers the actual request
+ * processing to xen_cpufreq_work on the driver's own workqueue and
+ * always reports the interrupt as handled.
+ */
+static irqreturn_t xen_cpufreq_interrupt(int irq, void *data)
+{
+	struct work_struct *work = &xen_cpufreq_work;
+
+	queue_work(xen_cpufreq_wq, work);
+
+	return IRQ_HANDLED;
+}
+
+/*********************************************************************
+ * REGISTER / UNREGISTER CPUFREQ DRIVER *
+ *********************************************************************/
+
+/**
+ * cpufreq_register_driver - register a CPU Frequency driver
+ * @driver_data: A struct cpufreq_driver containing the values
+ * submitted by the CPU Frequency driver.
+ *
+ * Registers a CPU Frequency driver to this core code: binds the cpufreq
+ * VIRQ, creates a policy for every CPU known to the hypervisor and uploads
+ * the frequency table of each policy to the hypervisor.
+ *
+ * Returns zero on success, -EPROBE_DEFER when the number of hypervisor
+ * CPUs is not known yet, -EINVAL when @driver_data lacks a mandatory
+ * callback or a policy/frequency table cannot be found, -EBUSY when
+ * another driver got here first (and isn't unregistered in the meantime),
+ * -ENOMEM on allocation failure, or the error reported by the IRQ setup,
+ * the per-CPU setup or the upload.
+ */
+int cpufreq_register_driver(struct cpufreq_driver *driver_data)
+{
+	unsigned long flags;
+	int ret;
+	unsigned int cpu;
+	struct cpufreq_frequency_table *table;
+	struct cpufreq_policy *policy;
+	cpumask_var_t pushed_cpus;
+	int irq;
+
+	if (!xen_nr_cpus)
+		return -EPROBE_DEFER;
+
+	if (!driver_data || !driver_data->verify || !driver_data->init ||
+	    (!driver_data->target))
+		return -EINVAL;
+
+	pr_debug("trying to register driver %s\n", driver_data->name);
+
+	if (driver_data->setpolicy)
+		driver_data->flags |= CPUFREQ_CONST_LOOPS;
+
+	spin_lock_irqsave(&xen_cpufreq_driver_lock, flags);
+
+	if (xen_cpufreq_driver) {
+		spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+		return -EBUSY;
+	}
+	xen_cpufreq_driver = driver_data;
+	spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+
+	irq = bind_virq_to_irq(VIRQ_CPUFREQ, 0);
+	if (irq < 0) {
+		pr_err("Bind virq (%d) error (%d)\n", VIRQ_CPUFREQ, irq);
+		ret = irq;
+		goto err_remove_drv;
+	}
+
+	irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN|IRQ_NOPROBE);
+
+	ret = request_irq(irq, xen_cpufreq_interrupt, 0,
+			  "xen_cpufreq", NULL);
+
+	if (ret < 0) {
+		pr_err("Request irq (%d) error (%d)\n", irq, ret);
+		/*
+		 * NOTE(review): the unwind below goes through
+		 * unbind_from_irqhandler() even though no handler was
+		 * installed - confirm this is fine for the target kernel.
+		 */
+		goto err_unbind_from_irqhnd;
+	}
+
+	xen_irq = irq;
+
+	for (cpu = 0; cpu < xen_nr_cpus; cpu++) {
+		ret = xen_cpufreq_add_dev(cpu);
+		if (ret)
+			goto err_remove_cpu;
+	}
+
+	if (!zalloc_cpumask_var(&pushed_cpus, GFP_KERNEL)) {
+		/*
+		 * ret is still 0 from the loop above; without an explicit
+		 * error code the caller would see success after everything
+		 * has been torn down.
+		 */
+		ret = -ENOMEM;
+		goto err_remove_cpu;
+	}
+
+	/* Upload each policy once, skipping CPUs already covered. */
+	for (cpu = 0; cpu < xen_nr_cpus; cpu++) {
+		if (cpumask_test_cpu(cpu, pushed_cpus))
+			continue;
+
+		policy = xen_cpufreq_cpu_get(cpu);
+		if (!policy) {
+			ret = -EINVAL;
+			goto err_free_cpumask;
+		}
+
+		cpumask_or(pushed_cpus, pushed_cpus, policy->cpus);
+		table = cpufreq_frequency_get_table(policy->cpu);
+		if (!table) {
+			ret = -EINVAL;
+			goto err_free_cpumask;
+		}
+
+		ret = push_data_to_hypervisor(policy, table);
+		if (ret)
+			goto err_free_cpumask;
+	}
+
+	free_cpumask_var(pushed_cpus);
+
+	pr_debug("driver %s up and running\n", driver_data->name);
+
+	return 0;
+
+err_free_cpumask:
+	free_cpumask_var(pushed_cpus);
+err_remove_cpu:
+	for (cpu = 0; cpu < xen_nr_cpus; cpu++)
+		xen_cpufreq_remove_dev(cpu);
+err_unbind_from_irqhnd:
+	unbind_from_irqhandler(irq, NULL);
+err_remove_drv:
+	spin_lock_irqsave(&xen_cpufreq_driver_lock, flags);
+	xen_cpufreq_driver = NULL;
+	spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpufreq_register_driver);
+
+
+/**
+ * cpufreq_unregister_driver - unregister the current CPUFreq driver
+ * @driver: the driver previously passed to cpufreq_register_driver()
+ *
+ * Unregister the current CPUFreq driver. Only call this if you have
+ * the right to do so, i.e. if you have succeeded in initialising before!
+ * Returns zero if successful, and -EINVAL if the cpufreq_driver is
+ * currently not initialised or @driver is not the registered one.
+ */
+int cpufreq_unregister_driver(struct cpufreq_driver *driver)
+{
+	unsigned long flags;
+	unsigned int cpu;
+
+	if (!xen_cpufreq_driver || (driver != xen_cpufreq_driver))
+		return -EINVAL;
+
+	pr_debug("unregistering driver %s\n", driver->name);
+
+	/* No new requests can be queued once the VIRQ is unbound ... */
+	unbind_from_irqhandler(xen_irq, NULL);
+
+	/*
+	 * ... but one may already be queued or running. Wait for it so the
+	 * work handler cannot touch a policy that is freed below.
+	 */
+	cancel_work_sync(&xen_cpufreq_work);
+
+	for (cpu = 0; cpu < xen_nr_cpus; cpu++)
+		xen_cpufreq_remove_dev(cpu);
+
+	spin_lock_irqsave(&xen_cpufreq_driver_lock, flags);
+	xen_cpufreq_driver = NULL;
+	spin_unlock_irqrestore(&xen_cpufreq_driver_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cpufreq_unregister_driver);
+
+/*
+ * Early initialisation, run as a core_initcall.
+ *
+ * Queries the hypervisor for the number of physical CPUs, resets the
+ * per-CPU policy bookkeeping and creates the workqueue used to process
+ * frequency change requests. xen_nr_cpus stays 0 on any failure, which
+ * makes cpufreq_register_driver() return -EPROBE_DEFER.
+ *
+ * Returns 0 on success, -ENODEV when not running as the Xen initial
+ * domain, -EINVAL when the reported CPU count is 0 or above NR_CPUS,
+ * -ENOMEM when the workqueue cannot be created, or the error returned by
+ * the physinfo hypercall.
+ */
+static int __init xen_cpufreq_init(void)
+{
+	int ret;
+	int i;
+
+	struct xen_sysctl op = {
+		.cmd = XEN_SYSCTL_physinfo,
+		.interface_version = XEN_SYSCTL_INTERFACE_VERSION,
+	};
+
+	/*
+	 * This initcall runs on every boot of a kernel built with the
+	 * driver; hypercalls must only be issued when actually running as
+	 * the Xen initial domain.
+	 */
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	ret = HYPERVISOR_sysctl(&op);
+	if (ret) {
+		pr_err("Hypervisor get physinfo error (%d)\n", ret);
+		return ret;
+	}
+
+	xen_nr_cpus = op.u.physinfo.nr_cpus;
+	if (xen_nr_cpus == 0 || xen_nr_cpus > NR_CPUS) {
+		/* Log the offending value before it is reset. */
+		pr_err("Wrong CPUs amount (%d)\n", xen_nr_cpus);
+		xen_nr_cpus = 0;
+		return -EINVAL;
+	}
+
+	for (i = 0; i < xen_nr_cpus; i++) {
+		/* -1 marks a CPU that has no policy CPU assigned yet. */
+		per_cpu(xen_cpufreq_policy_cpu, i) = -1;
+		init_rwsem(&per_cpu(xen_cpu_policy_rwsem, i));
+	}
+
+	xen_cpufreq_wq = create_workqueue("xen_cpufreq");
+	if (!xen_cpufreq_wq) {
+		pr_err("Create workqueue error\n");
+		xen_nr_cpus = 0;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+MODULE_AUTHOR("Oleksandr Dmytryshyn <oleksandr.dmytryshyn@globallogic.com>");
+MODULE_DESCRIPTION("Xen cpufreq driver which uploads PM data to Xen hypervisor");
+MODULE_LICENSE("GPL");
+
+core_initcall(xen_cpufreq_init);
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index cf64566..0520194 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -81,6 +81,7 @@
#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
+#define VIRQ_CPUFREQ 13 /* (DOM0) Notify xen-cpufreq driver. */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
--
1.9.1
^ permalink raw reply related [flat|nested] 16+ messages in thread