From mboxrd@z Thu Jan  1 00:00:00 1970
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Subject: Re: [PATCH v10 11/20] x86/VPMU: Interface for setting
 PMU mode and flags
Date: Thu, 18 Sep 2014 17:50:54 -0400
Message-ID: <541B53BE.4000909@oracle.com>
References: <1409802080-6160-1-git-send-email-boris.ostrovsky@oracle.com>
	<1409802080-6160-12-git-send-email-boris.ostrovsky@oracle.com>
	<AADFC41AFE54684AB9EE6CBC0274A5D1260865E2@SHSMSX101.ccr.corp.intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"; Format="flowed"
Content-Transfer-Encoding: 7bit
Return-path: <xen-devel-bounces@lists.xen.org>
In-Reply-To: <AADFC41AFE54684AB9EE6CBC0274A5D1260865E2@SHSMSX101.ccr.corp.intel.com>
List-Unsubscribe: <http://lists.xen.org/cgi-bin/mailman/options/xen-devel>,
	<mailto:xen-devel-request@lists.xen.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xen.org>
List-Help: <mailto:xen-devel-request@lists.xen.org?subject=help>
List-Subscribe: <http://lists.xen.org/cgi-bin/mailman/listinfo/xen-devel>,
	<mailto:xen-devel-request@lists.xen.org?subject=subscribe>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: "Tian, Kevin" <kevin.tian@intel.com>, "jbeulich@suse.com" <jbeulich@suse.com>, "suravee.suthikulpanit@amd.com" <suravee.suthikulpanit@amd.com>, "Dong, Eddie" <eddie.dong@intel.com>, "Aravind.Gopalakrishnan@amd.com" <Aravind.Gopalakrishnan@amd.com>
Cc: "andrew.cooper3@citrix.com" <andrew.cooper3@citrix.com>, "xen-devel@lists.xen.org" <xen-devel@lists.xen.org>, "keir@xen.org" <keir@xen.org>, "Nakajima,
	Jun" <jun.nakajima@intel.com>, "tim@xen.org" <tim@xen.org>
List-Id: xen-devel@lists.xenproject.org

On 09/18/2014 12:11 AM, Tian, Kevin wrote:
>> From: Boris Ostrovsky [mailto:boris.ostrovsky@oracle.com]
>> Sent: Wednesday, September 03, 2014 8:41 PM
>>
>> Add runtime interface for setting PMU mode and flags. Three main modes are
>> provided:
>> * PMU off
>> * PMU on: Guests can access PMU MSRs and receive PMU interrupts. dom0
>>    profiles itself and the hypervisor.
>> * dom0-only PMU: dom0 collects samples for both itself and guests.
> could you associate above three modes to the actual definitions in the code?
> from code only two modes are defined:

Wrong commit message (I kept it from early days of the patch). This 
patch only has two modes: "on" (aka" self") and "off". The third mode is 
introduced by a later patch.

>
>> +/* PMU modes:
>> + * - XENPMU_MODE_OFF:   No PMU virtualization
>> + * - XENPMU_MODE_SELF:  Guests can profile themselves, dom0 profiles
>> + *                      itself and Xen
>> + */
> how do I understand "PMU on" mode then?

"Self" is "on". I'll mention this in the commit message and in the 
comments before parsing code

>
>> For feature flags only Intel's BTS is currently supported.
>>
>> Mode and flags are set via HYPERVISOR_xenpmu_op hypercall.
>>
>> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>> Reviewed-by: Dietmar Hahn <dietmar.hahn@ts.fujitsu.com>
>> Tested-by: Dietmar Hahn <dietmar.hahn@ts.fujitsu.com>
>> ---
>>   xen/arch/x86/domain.c              |   7 +-
>>   xen/arch/x86/hvm/svm/vpmu.c        |   4 +-
>>   xen/arch/x86/hvm/vmx/vpmu_core2.c  |  10 +-
>>   xen/arch/x86/hvm/vpmu.c            | 214
>> +++++++++++++++++++++++++++++++++++--
>>   xen/arch/x86/x86_64/compat/entry.S |   4 +
>>   xen/arch/x86/x86_64/entry.S        |   4 +
>>   xen/include/Makefile               |   2 +
>>   xen/include/asm-x86/hvm/vpmu.h     |  27 +++--
>>   xen/include/public/pmu.h           |  42 ++++++++
>>   xen/include/public/xen.h           |   1 +
>>   xen/include/xen/hypercall.h        |   4 +
>>   xen/include/xlat.lst               |   4 +
>>   12 files changed, 295 insertions(+), 28 deletions(-)
>>
>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>> index f7e0e78..64d4a83 100644
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -1499,8 +1499,7 @@ void context_switch(struct vcpu *prev, struct vcpu
>> *next)
>>
>>       if ( is_hvm_vcpu(prev) )
>>       {
>> -        if (prev != next)
>> -            vpmu_save(prev);
>> +        vpmu_switch_from(prev, next);
>>
>>           if ( !list_empty(&prev->arch.hvm_vcpu.tm_list) )
>>               pt_save_timer(prev);
>> @@ -1543,9 +1542,9 @@ void context_switch(struct vcpu *prev, struct vcpu
>> *next)
>>                              !is_hardware_domain(next->domain));
>>       }
>>
>> -    if (is_hvm_vcpu(next) && (prev != next) )
>> +    if ( is_hvm_vcpu(prev) )
>>           /* Must be done with interrupts enabled */
>> -        vpmu_load(next);
>> +        vpmu_switch_to(prev, next);
>>
>>       context_saved(prev);
>>
>> diff --git a/xen/arch/x86/hvm/svm/vpmu.c b/xen/arch/x86/hvm/svm/vpmu.c
>> index 124b147..37d8228 100644
>> --- a/xen/arch/x86/hvm/svm/vpmu.c
>> +++ b/xen/arch/x86/hvm/svm/vpmu.c
>> @@ -479,14 +479,14 @@ struct arch_vpmu_ops amd_vpmu_ops = {
>>       .arch_vpmu_dump = amd_vpmu_dump
>>   };
>>
>> -int svm_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
>> +int svm_vpmu_initialise(struct vcpu *v)
>>   {
>>       struct vpmu_struct *vpmu = vcpu_vpmu(v);
>>       uint8_t family = current_cpu_data.x86;
>>       int ret = 0;
>>
>>       /* vpmu enabled? */
>> -    if ( !vpmu_flags )
>> +    if ( vpmu_mode == XENPMU_MODE_OFF )
>>           return 0;
>>
>>       switch ( family )
>> diff --git a/xen/arch/x86/hvm/vmx/vpmu_core2.c
>> b/xen/arch/x86/hvm/vmx/vpmu_core2.c
>> index 1cdafe0..f5d85e4 100644
>> --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
>> +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c
>> @@ -703,13 +703,13 @@ static int core2_vpmu_do_interrupt(struct
>> cpu_user_regs *regs)
>>       return 1;
>>   }
>>
>> -static int core2_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
>> +static int core2_vpmu_initialise(struct vcpu *v)
>>   {
>>       struct vpmu_struct *vpmu = vcpu_vpmu(v);
>>       u64 msr_content;
>>       static bool_t ds_warned;
>>
>> -    if ( !(vpmu_flags & VPMU_BOOT_BTS) )
>> +    if ( !(vpmu_features & XENPMU_FEATURE_INTEL_BTS) )
>>           goto func_out;
>>       /* Check the 'Debug Store' feature in the CPUID.EAX[1]:EDX[21] */
>>       while ( boot_cpu_has(X86_FEATURE_DS) )
>> @@ -824,7 +824,7 @@ struct arch_vpmu_ops core2_no_vpmu_ops = {
>>       .do_cpuid = core2_no_vpmu_do_cpuid,
>>   };
>>
>> -int vmx_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
>> +int vmx_vpmu_initialise(struct vcpu *v)
>>   {
>>       struct vpmu_struct *vpmu = vcpu_vpmu(v);
>>       uint8_t family = current_cpu_data.x86;
>> @@ -832,7 +832,7 @@ int vmx_vpmu_initialise(struct vcpu *v, unsigned int
>> vpmu_flags)
>>       int ret = 0;
>>
>>       vpmu->arch_vpmu_ops = &core2_no_vpmu_ops;
>> -    if ( !vpmu_flags )
>> +    if ( vpmu_mode == XENPMU_MODE_OFF )
>>           return 0;
>>
>>       if ( family == 6 )
>> @@ -875,7 +875,7 @@ int vmx_vpmu_initialise(struct vcpu *v, unsigned int
>> vpmu_flags)
>>           /* future: */
>>           case 0x3d:
>>           case 0x4e:
>> -            ret = core2_vpmu_initialise(v, vpmu_flags);
>> +            ret = core2_vpmu_initialise(v);
>>               if ( !ret )
>>                   vpmu->arch_vpmu_ops = &core2_vpmu_ops;
>>               return ret;
>> diff --git a/xen/arch/x86/hvm/vpmu.c b/xen/arch/x86/hvm/vpmu.c
>> index 89bead4..39d13f6 100644
>> --- a/xen/arch/x86/hvm/vpmu.c
>> +++ b/xen/arch/x86/hvm/vpmu.c
>> @@ -21,6 +21,8 @@
>>   #include <xen/config.h>
>>   #include <xen/sched.h>
>>   #include <xen/xenoprof.h>
>> +#include <xen/event.h>
>> +#include <xen/guest_access.h>
>>   #include <asm/regs.h>
>>   #include <asm/types.h>
>>   #include <asm/msr.h>
>> @@ -32,13 +34,21 @@
>>   #include <asm/hvm/svm/vmcb.h>
>>   #include <asm/apic.h>
>>   #include <public/pmu.h>
>> +#include <xen/tasklet.h>
>> +
>> +#include <compat/pmu.h>
>> +CHECK_pmu_params;
>> +CHECK_pmu_intel_ctxt;
>> +CHECK_pmu_amd_ctxt;
>> +CHECK_pmu_cntr_pair;
>>
>>   /*
>>    * "vpmu" :     vpmu generally enabled
>>    * "vpmu=off" : vpmu generally disabled
>>    * "vpmu=bts" : vpmu enabled and Intel BTS feature switched on.
>>    */
> this comment doesn't describe "self" mode. better to have it consistent
> with later param parse.
>
>> -static unsigned int __read_mostly opt_vpmu_enabled;
>> +uint64_t __read_mostly vpmu_mode = XENPMU_MODE_OFF;
>> +uint64_t __read_mostly vpmu_features = 0;
>>   static void parse_vpmu_param(char *s);
>>   custom_param("vpmu", parse_vpmu_param);
>>
>> @@ -52,7 +62,7 @@ static void __init parse_vpmu_param(char *s)
>>           break;
>>       default:
>>           if ( !strcmp(s, "bts") )
>> -            opt_vpmu_enabled |= VPMU_BOOT_BTS;
>> +            vpmu_features |= XENPMU_FEATURE_INTEL_BTS;
>>           else if ( *s )
>>           {
>>               printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
>> @@ -60,7 +70,7 @@ static void __init parse_vpmu_param(char *s)
>>           }
>>           /* fall through */
>>       case 1:
>> -        opt_vpmu_enabled |= VPMU_BOOT_ENABLED;
>> +        vpmu_mode = XENPMU_MODE_SELF;
>>           break;
>>       }
>>   }
>> @@ -77,6 +87,9 @@ int vpmu_do_wrmsr(unsigned int msr, uint64_t
>> msr_content, uint64_t supported)
>>   {
>>       struct vpmu_struct *vpmu = vcpu_vpmu(current);
>>
>> +    if ( !(vpmu_mode & XENPMU_MODE_SELF) )
>> +        return 0;
>> +
>>       if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_wrmsr )
>>           return vpmu->arch_vpmu_ops->do_wrmsr(msr, msr_content,
>> supported);
>>       return 0;
>> @@ -86,6 +99,9 @@ int vpmu_do_rdmsr(unsigned int msr, uint64_t
>> *msr_content)
>>   {
>>       struct vpmu_struct *vpmu = vcpu_vpmu(current);
>>
>> +    if ( !(vpmu_mode & XENPMU_MODE_SELF) )
>> +        return 0;
>> +
>>       if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_rdmsr )
>>           return vpmu->arch_vpmu_ops->do_rdmsr(msr, msr_content);
>>       return 0;
>> @@ -240,19 +256,19 @@ void vpmu_initialise(struct vcpu *v)
>>       switch ( vendor )
>>       {
>>       case X86_VENDOR_AMD:
>> -        if ( svm_vpmu_initialise(v, opt_vpmu_enabled) != 0 )
>> -            opt_vpmu_enabled = 0;
>> +        if ( svm_vpmu_initialise(v) != 0 )
>> +            vpmu_mode = XENPMU_MODE_OFF;
>>           break;
>>
>>       case X86_VENDOR_INTEL:
>> -        if ( vmx_vpmu_initialise(v, opt_vpmu_enabled) != 0 )
>> -            opt_vpmu_enabled = 0;
>> +        if ( vmx_vpmu_initialise(v) != 0 )
>> +            vpmu_mode = XENPMU_MODE_OFF;
>>           break;
>>
>>       default:
>>           printk("VPMU: Initialization failed. "
>>                  "Unknown CPU vendor %d\n", vendor);
>> -        opt_vpmu_enabled = 0;
>> +        vpmu_mode = XENPMU_MODE_OFF;
>>           break;
>>       }
>>   }
>> @@ -274,3 +290,185 @@ void vpmu_dump(struct vcpu *v)
>>           vpmu->arch_vpmu_ops->arch_vpmu_dump(v);
>>   }
>>
>> +static atomic_t vpmu_sched_counter;
>> +
>> +static void vpmu_sched_checkin(unsigned long unused)
>> +{
>> +    atomic_inc(&vpmu_sched_counter);
>> +}
>> +
>> +static int
>> +vpmu_force_context_switch(XEN_GUEST_HANDLE_PARAM(xen_pmu_params
>> _t) arg)
> looks a tricky implementation...
>
>> +{
>> +    unsigned i, j, allbutself_num, tasknum, mycpu;
>> +    static s_time_t start;
>> +    static struct tasklet **sync_task;
>> +    struct vcpu *curr_vcpu = current;
>> +    static struct vcpu *sync_vcpu;
>> +    int ret = 0;
>> +
>> +    tasknum = allbutself_num = num_online_cpus() - 1;
>> +
>> +    if ( sync_task ) /* if set, we are in hypercall continuation */
>> +    {
>> +        if ( (sync_vcpu != NULL) && (sync_vcpu != curr_vcpu) )
>> +            /* We are not the original caller */
>> +            return -EAGAIN;
> I assume hypercall continuation will happen on original vcpu context. Under
> which situation the hypercall will be continued on a different vcpu? If yes,
> would it be an unbounded situation where you may wait unexpected time
> to have sync_vcpu==curr_vcpu?

The continuation call is gone in v11. If we are stuck waiting for more 
than 5 seconds we'll simply return -EAGAIN and have the caller retry.

>
>> +        goto cont_wait;
>> +    }
>> +
>> +    sync_task = xmalloc_array(struct tasklet *, allbutself_num);
>> +    if ( !sync_task )
>> +    {
>> +        printk(XENLOG_WARNING "vpmu_force_context_switch: out of
>> memory\n");
>> +        return -ENOMEM;
>> +    }
>> +
>> +    for ( tasknum = 0; tasknum < allbutself_num; tasknum++ )
>> +    {
>> +        sync_task[tasknum] = xmalloc(struct tasklet);
>> +        if ( sync_task[tasknum] == NULL )
>> +        {
>> +            printk(XENLOG_WARNING "vpmu_force_context_switch: out of
>> memory\n");
>> +            ret = -ENOMEM;
>> +            goto out;
>> +        }
>> +        tasklet_init(sync_task[tasknum], vpmu_sched_checkin, 0);
>> +    }
>> +
>> +    atomic_set(&vpmu_sched_counter, 0);
>> +    sync_vcpu = curr_vcpu;
>> +
>> +    j = 0;
>> +    mycpu = smp_processor_id();
>> +    for_each_online_cpu( i )
>> +    {
>> +        if ( i != mycpu )
>> +            tasklet_schedule_on_cpu(sync_task[j++], i);
>> +    }
> are you sure tasklet can always cause context switch?

Yes, tasklet is run as result of SCHEDULE_SOFTIRQ.

> and how do you ensure
> vpmu won't become dirty again if the target vcpu switches out, and then switches
> back again to use vpmu, but before your whole hypercall completes? Here some
> way is required to really stop vpmu on the target vcpu imho, but I may be wrong
> on understanding the whole background here... :-)

I change vpmu_mode *before* calling this sync routine so even if remote 
VCPU comes on again it won't load that VCPU's VPMU. That's because the 
only time I do this sync is when I:
* Turn VPMU mode off. In which case no VPMU will ever get loaded
* Switch to XENPMU_MODE_ALL mode (in v11 version) in which case the only 
time VPMU will get loaded is if it's dom0's VPMU. Which is fine.

>
>> +
>> +    vpmu_save(curr_vcpu);
>> +
>> +    start = NOW();
>> +
>> + cont_wait:
>> +    /*
>> +     * Note that we may fail here if a CPU is hot-(un)plugged while we are
>> +     * waiting. We will then time out.
>> +     */
>> +    while ( atomic_read(&vpmu_sched_counter) != allbutself_num )
>> +    {
>> +        /* Give up after 5 seconds */
>> +        if ( NOW() > start + SECONDS(5) )
>> +        {
>> +            printk(XENLOG_WARNING
>> +                   "vpmu_force_context_switch: failed to sync\n");
>> +            ret = -EBUSY;
>> +            break;
>> +        }
>> +        cpu_relax();
>> +        if ( hypercall_preempt_check() )
>> +            return hypercall_create_continuation(
>> +                __HYPERVISOR_xenpmu_op, "ih", XENPMU_mode_set,
>> arg);
>> +    }
>> +
>> + out:
>> +
>> +    for ( i = 0; i < tasknum; i++ )
>> +    {
>> +        tasklet_kill(sync_task[i]);
>> +        xfree(sync_task[i]);
>> +    }
>> +    xfree(sync_task);
>> +    sync_task = NULL;
>> +    sync_vcpu = NULL;
>> +
>> +    return ret;
>> +}
>> +
>> +long do_xenpmu_op(int op,
>> XEN_GUEST_HANDLE_PARAM(xen_pmu_params_t) arg)
>> +{
>> +    int ret = -EINVAL;
>> +    xen_pmu_params_t pmu_params;
>> +
>> +    switch ( op )
>> +    {
>> +    case XENPMU_mode_set:
>> +    {
>> +        static DEFINE_SPINLOCK(xenpmu_mode_lock);
> why not a global lock? should it cover feature_set too?

This lock is not protecting setting the mode but rather it tries to 
prevent multiple CPUs doing the sync (tasklet thing) at the same time.


>
>> +        uint32_t current_mode;
>> +
>> +        if ( !is_control_domain(current->domain) )
>> +            return -EPERM;
>> +
>> +        if ( copy_from_guest(&pmu_params, arg, 1) )
>> +            return -EFAULT;
>> +
>> +        if ( pmu_params.val & ~XENPMU_MODE_SELF )
>> +            return -EINVAL;
>> +
>> +        /*
>> +         * Return error is someone else is in the middle of changing mode
>> ---
>> +         * this is most likely indication of two system administrators
>> +         * working against each other
>> +         */
>> +        if ( !spin_trylock(&xenpmu_mode_lock) )
>> +            return -EAGAIN;
>> +
>> +        current_mode = vpmu_mode;
>> +        vpmu_mode = pmu_params.val;
>> +
>> +        if ( vpmu_mode == XENPMU_MODE_OFF )
>> +        {
>> +            /*
>> +             * Make sure all (non-dom0) VCPUs have unloaded their
>> VPMUs. This
>> +             * can be achieved by having all physical processors go
>> through
>> +             * context_switch().
> I didn't see any condition check on 'non-dom0' to reflect above comment...

The comment is not quite correct. I wrote it for XENPMU_MODE_ALL mode, 
which is not what I do here. I'll update it.

>
>> +             */
>> +            ret = vpmu_force_context_switch(arg);
>> +            if ( ret )
>> +                vpmu_mode = current_mode;
>> +        }
>> +        else
>> +            ret = 0;
>> +
>> +        spin_unlock(&xenpmu_mode_lock);
>> +        break;
>> +    }
>> +
>> +    case XENPMU_mode_get:
>> +        memset(&pmu_params, 0, sizeof(pmu_params));
>> +        pmu_params.val = vpmu_mode;
>> +        pmu_params.version.maj = XENPMU_VER_MAJ;
>> +        pmu_params.version.min = XENPMU_VER_MIN;
>> +        if ( copy_to_guest(arg, &pmu_params, 1) )
>> +            return -EFAULT;
>> +        ret = 0;
>> +        break;
>> +
>> +    case XENPMU_feature_set:
>> +        if ( !is_control_domain(current->domain) )
>> +            return -EPERM;
>> +
>> +        if ( copy_from_guest(&pmu_params, arg, 1) )
>> +            return -EFAULT;
>> +
>> +        if ( pmu_params.val & ~XENPMU_FEATURE_INTEL_BTS )
>> +            return -EINVAL;
>> +
>> +        vpmu_features = pmu_params.val;
>> +
>> +        ret = 0;
>> +        break;
>> +
>> +    case XENPMU_feature_get:
>> +        memset(&pmu_params, 0, sizeof(pmu_params));
>> +        pmu_params.val = vpmu_mode;
>> +        if ( copy_to_guest(arg, &pmu_params, 1) )
>> +            return -EFAULT;
>> +        ret = 0;
>> +        break;
>> +     }
>> +
>> +    return ret;
>> +}
>> diff --git a/xen/arch/x86/x86_64/compat/entry.S
>> b/xen/arch/x86/x86_64/compat/entry.S
>> index ac594c9..8587c46 100644
>> --- a/xen/arch/x86/x86_64/compat/entry.S
>> +++ b/xen/arch/x86/x86_64/compat/entry.S
>> @@ -417,6 +417,8 @@ ENTRY(compat_hypercall_table)
>>           .quad do_domctl
>>           .quad compat_kexec_op
>>           .quad do_tmem_op
>> +        .quad do_ni_hypercall           /* reserved for XenClient */
> why XenClient here?

See include/public/xen.h, hypercall 39.

-boris


>
>> +        .quad do_xenpmu_op              /* 40 */
>>           .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8)
>>           .quad compat_ni_hypercall
>>           .endr
>> @@ -465,6 +467,8 @@ ENTRY(compat_hypercall_args_table)
>>           .byte 1 /* do_domctl                */
>>           .byte 2 /* compat_kexec_op          */
>>           .byte 1 /* do_tmem_op               */
>> +        .byte 0 /* reserved for XenClient   */
>> +        .byte 2 /* do_xenpmu_op             */  /* 40 */
>>           .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table)
>>           .byte 0 /* compat_ni_hypercall      */
>>           .endr
>> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
>> index a3ed216..704f4d1 100644
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -762,6 +762,8 @@ ENTRY(hypercall_table)
>>           .quad do_domctl
>>           .quad do_kexec_op
>>           .quad do_tmem_op
>> +        .quad do_ni_hypercall       /* reserved for XenClient */
>> +        .quad do_xenpmu_op          /* 40 */
>>           .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8)
>>           .quad do_ni_hypercall
>>           .endr
>> @@ -810,6 +812,8 @@ ENTRY(hypercall_args_table)
>>           .byte 1 /* do_domctl            */
>>           .byte 2 /* do_kexec             */
>>           .byte 1 /* do_tmem_op           */
>> +        .byte 0 /* reserved for XenClient */
>> +        .byte 2 /* do_xenpmu_op         */  /* 40 */
>>           .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
>>           .byte 0 /* do_ni_hypercall      */
>>           .endr
>> diff --git a/xen/include/Makefile b/xen/include/Makefile
>> index f7ccbc9..f97733a 100644
>> --- a/xen/include/Makefile
>> +++ b/xen/include/Makefile
>> @@ -26,7 +26,9 @@ headers-y := \
>>   headers-$(CONFIG_X86)     += compat/arch-x86/xen-mca.h
>>   headers-$(CONFIG_X86)     += compat/arch-x86/xen.h
>>   headers-$(CONFIG_X86)     += compat/arch-x86/xen-$(compat-arch-y).h
>> +headers-$(CONFIG_X86)     += compat/arch-x86/pmu.h
>>   headers-y                 += compat/arch-$(compat-arch-y).h
>> compat/xlat.h
>> +headers-y                 += compat/pmu.h
>>   headers-$(FLASK_ENABLE)   += compat/xsm/flask_op.h
>>
>>   cppflags-y                := -include public/xen-compat.h
>> diff --git a/xen/include/asm-x86/hvm/vpmu.h
>> b/xen/include/asm-x86/hvm/vpmu.h
>> index 6fa0def..8572835 100644
>> --- a/xen/include/asm-x86/hvm/vpmu.h
>> +++ b/xen/include/asm-x86/hvm/vpmu.h
>> @@ -24,13 +24,6 @@
>>
>>   #include <public/pmu.h>
>>
>> -/*
>> - * Flag bits given as a string on the hypervisor boot parameter 'vpmu'.
>> - * See arch/x86/hvm/vpmu.c.
>> - */
>> -#define VPMU_BOOT_ENABLED 0x1    /* vpmu generally enabled. */
>> -#define VPMU_BOOT_BTS     0x2    /* Intel BTS feature wanted. */
>> -
>>   #define vcpu_vpmu(vcpu)   (&(vcpu)->arch.vpmu)
>>   #define vpmu_vcpu(vpmu)   container_of((vpmu), struct vcpu, arch.vpmu)
>>
>> @@ -59,8 +52,8 @@ struct arch_vpmu_ops {
>>       void (*arch_vpmu_dump)(const struct vcpu *);
>>   };
>>
>> -int vmx_vpmu_initialise(struct vcpu *, unsigned int flags);
>> -int svm_vpmu_initialise(struct vcpu *, unsigned int flags);
>> +int vmx_vpmu_initialise(struct vcpu *);
>> +int svm_vpmu_initialise(struct vcpu *);
>>
>>   struct vpmu_struct {
>>       u32 flags;
>> @@ -116,5 +109,21 @@ void vpmu_dump(struct vcpu *v);
>>   extern int acquire_pmu_ownership(int pmu_ownership);
>>   extern void release_pmu_ownership(int pmu_ownership);
>>
>> +extern uint64_t vpmu_mode;
>> +extern uint64_t vpmu_features;
>> +
>> +/* Context switch */
>> +inline void vpmu_switch_from(struct vcpu *prev, struct vcpu *next)
>> +{
>> +    if ( (prev != next) && (vpmu_mode & XENPMU_MODE_SELF) )
>> +        vpmu_save(prev);
>> +}
>> +
>> +inline void vpmu_switch_to(struct vcpu *prev, struct vcpu *next)
>> +{
>> +    if ( (prev != next) && (vpmu_mode & XENPMU_MODE_SELF) )
>> +        vpmu_load(next);
>> +}
>> +
>>   #endif /* __ASM_X86_HVM_VPMU_H_*/
>>
>> diff --git a/xen/include/public/pmu.h b/xen/include/public/pmu.h
>> index e6f45ee..0855005 100644
>> --- a/xen/include/public/pmu.h
>> +++ b/xen/include/public/pmu.h
>> @@ -13,6 +13,48 @@
>>   #define XENPMU_VER_MAJ    0
>>   #define XENPMU_VER_MIN    1
>>
>> +/*
>> + * ` enum neg_errnoval
>> + * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params
>> *args);
>> + *
>> + * @cmd  == XENPMU_* (PMU operation)
>> + * @args == struct xenpmu_params
>> + */
>> +/* ` enum xenpmu_op { */
>> +#define XENPMU_mode_get        0 /* Also used for getting PMU version
>> */
>> +#define XENPMU_mode_set        1
>> +#define XENPMU_feature_get     2
>> +#define XENPMU_feature_set     3
>> +/* ` } */
>> +
>> +/* Parameters structure for HYPERVISOR_xenpmu_op call */
>> +struct xen_pmu_params {
>> +    /* IN/OUT parameters */
>> +    struct {
>> +        uint32_t maj;
>> +        uint32_t min;
>> +    } version;
>> +    uint64_t val;
>> +
>> +    /* IN parameters */
>> +    uint64_t vcpu;
>> +};
>> +typedef struct xen_pmu_params xen_pmu_params_t;
>> +DEFINE_XEN_GUEST_HANDLE(xen_pmu_params_t);
>> +
>> +/* PMU modes:
>> + * - XENPMU_MODE_OFF:   No PMU virtualization
>> + * - XENPMU_MODE_SELF:  Guests can profile themselves, dom0 profiles
>> + *                      itself and Xen
>> + */
>> +#define XENPMU_MODE_OFF           0
>> +#define XENPMU_MODE_SELF          (1<<0)
>> +
>> +/*
>> + * PMU features:
>> + * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD)
>> + */
>> +#define XENPMU_FEATURE_INTEL_BTS  1
>>
>>   /* Shared between hypervisor and PV domain */
>>   struct xen_pmu_data {
>> diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
>> index a6a2092..0766790 100644
>> --- a/xen/include/public/xen.h
>> +++ b/xen/include/public/xen.h
>> @@ -101,6 +101,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
>>   #define __HYPERVISOR_kexec_op             37
>>   #define __HYPERVISOR_tmem_op              38
>>   #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient
>> */
>> +#define __HYPERVISOR_xenpmu_op            40
>>
>>   /* Architecture-specific hypercall definitions. */
>>   #define __HYPERVISOR_arch_0               48
>> diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h
>> index a9e5229..cf34547 100644
>> --- a/xen/include/xen/hypercall.h
>> +++ b/xen/include/xen/hypercall.h
>> @@ -14,6 +14,7 @@
>>   #include <public/event_channel.h>
>>   #include <public/tmem.h>
>>   #include <public/version.h>
>> +#include <public/pmu.h>
>>   #include <asm/hypercall.h>
>>   #include <xsm/xsm.h>
>>
>> @@ -139,6 +140,9 @@ do_tmem_op(
>>   extern long
>>   do_xenoprof_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg);
>>
>> +extern long
>> +do_xenpmu_op(int op, XEN_GUEST_HANDLE_PARAM(xen_pmu_params_t)
>> arg);
>> +
>>   #ifdef CONFIG_COMPAT
>>
>>   extern int
>> diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst
>> index c8fafef..5809c60 100644
>> --- a/xen/include/xlat.lst
>> +++ b/xen/include/xlat.lst
>> @@ -101,6 +101,10 @@
>>   !	vcpu_set_singleshot_timer	vcpu.h
>>   ?	xenoprof_init			xenoprof.h
>>   ?	xenoprof_passive		xenoprof.h
>> +?	pmu_params			pmu.h
>> +?	pmu_intel_ctxt			arch-x86/pmu.h
>> +?	pmu_amd_ctxt			arch-x86/pmu.h
>> +?	pmu_cntr_pair			arch-x86/pmu.h
>>   ?	flask_access			xsm/flask_op.h
>>   !	flask_boolean			xsm/flask_op.h
>>   ?	flask_cache_stats		xsm/flask_op.h
>> --
>> 1.8.1.4