From: Andrew Cooper <andrew.cooper3@citrix.com>
To: "Xu, Dongxiao" <dongxiao.xu@intel.com>
Cc: "xen-devel@lists.xen.org" <xen-devel@lists.xen.org>
Subject: Re: [PATCH v2 6/8] x86: get per domain CQM information
Date: Mon, 25 Nov 2013 16:28:43 +0000
Message-ID: <52937ABB.7040704@citrix.com>
In-Reply-To: <40776A41FC278F40B59438AD47D147A9118BB544@SHSMSX104.ccr.corp.intel.com>

On 25/11/13 06:20, Xu, Dongxiao wrote:
>> -----Original Message-----
>> From: Andrew Cooper [mailto:andrew.cooper3@citrix.com]
>> Sent: Thursday, November 21, 2013 10:09 PM
>> To: Xu, Dongxiao
>> Cc: xen-devel@lists.xen.org
>> Subject: Re: [Xen-devel] [PATCH v2 6/8] x86: get per domain CQM information
>>
>> On 21/11/13 07:20, dongxiao.xu@intel.com wrote:
>>> From: Dongxiao Xu <dongxiao.xu@intel.com>
>>>
>>> Retrieve CQM information for a certain domain, which reflects the L3 cache
>>> occupancy for a socket.
>>>
>>> Signed-off-by: Jiongxi Li <jiongxi.li@intel.com>
>>> Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
>>> ---
>>>  xen/arch/x86/pqos.c             |   57 ++++++++++++++++++++++++++++++++++
>>>  xen/arch/x86/sysctl.c           |   64 +++++++++++++++++++++++++++++++++++++++
>>>  xen/include/asm-x86/msr-index.h |    4 +++
>>>  xen/include/asm-x86/pqos.h      |   14 +++++++++
>>>  xen/include/public/domctl.h     |   14 +++++++++
>>>  xen/include/public/sysctl.h     |   15 +++++++++
>>>  6 files changed, 168 insertions(+)
>>>
>>> diff --git a/xen/arch/x86/pqos.c b/xen/arch/x86/pqos.c
>>> index e294799..d95784c 100644
>>> --- a/xen/arch/x86/pqos.c
>>> +++ b/xen/arch/x86/pqos.c
>>> @@ -19,14 +19,30 @@
>>>   * Place - Suite 330, Boston, MA 02111-1307 USA.
>>>   */
>>>  #include <asm/processor.h>
>>> +#include <asm/msr.h>
>>>  #include <xen/init.h>
>>>  #include <xen/spinlock.h>
>>>  #include <xen/sched.h>
>>> +#include <public/domctl.h>
>>>  #include <asm/pqos.h>
>>>
>>>  static bool_t pqos_enabled = 1;
>>>  boolean_param("pqos", pqos_enabled);
>>>
>>> +static void read_qm_data(void *arg)
>>> +{
>>> +    struct qm_element *qm_element = arg;
>>> +
>>> +    wrmsr(MSR_IA32_QOSEVTSEL, qm_element->evtid, qm_element->rmid);
>>> +    rdmsrl(MSR_IA32_QMC, qm_element->qm_data);
>>> +}
>>> +
>>> +static void get_generic_qm_info(struct qm_element *qm_element)
>>> +{
>>> +    unsigned int cpu = qm_element->cpu;
>>> +    on_selected_cpus(cpumask_of(cpu), read_qm_data, qm_element, 1);
>>> +}
>>> +
>>>  unsigned int cqm_res_count = 0;
>>>  unsigned int cqm_upscaling_factor = 0;
>>>  bool_t cqm_enabled = 0;
>>> @@ -86,6 +102,23 @@ bool_t system_supports_cqm(void)
>>>      return cqm_enabled;
>>>  }
>>>
>>> +unsigned int get_cqm_count(void)
>>> +{
>>> +    return cqm_res_count;
>>> +}
>>> +
>>> +unsigned int get_cqm_avail(void)
>>> +{
>>> +    unsigned int cqm_avail = 0;
>>> +    int i;
>> unsigned int please.  If cqm_res_count has its top bit set, the
>> following loop may never terminate.
> OK.
>
>>> +
>>> +    for ( i = 0; i < cqm_res_count; i++ )
>>> +        if ( !cqm_res_array[i].inuse )
>>> +            cqm_avail++;
>>> +
>>> +    return cqm_avail;
>>> +}
>>> +
>>>  int alloc_cqm_rmid(struct domain *d)
>>>  {
>>>      int rmid, rc = 0;
>>> @@ -137,6 +170,30 @@ void free_cqm_rmid(struct domain *d)
>>>      d->arch.pqos_cqm_rmid = 0;
>>>  }
>>>
>>> +void get_cqm_info(uint32_t rmid, cpumask_t cpu_cqmdata_map,
>> A cpumask_t is already quite large, and will get larger in the future.
>> Pass by pointer please.
> OK.
>
>>> +                  struct xen_domctl_getdomcqminfo *info)
>>> +{
>>> +    struct qm_element element;
>>> +    unsigned int cpu, i;
>>> +
>>> +    for_each_cpu ( cpu, &cpu_cqmdata_map )
>>> +    {
>>> +        element.cpu = cpu;
>>> +        element.rmid = rmid;
>>> +        element.evtid = QOS_MONITOR_EVTID_L3;
>>> +
>>> +        get_generic_qm_info(&element);
>>> +
>>> +        i = cpu_to_socket(cpu);
>> cpu_to_socket() can return BAD_APICID.
> OK, will check the return value correctness.
>
>>> +        info->socket_cqmdata[i].valid =
>>> +            (element.qm_data & IA32_QM_CTR_ERROR_MASK) ? 0 : 1;
>> info->socket_cqmdata[i].valid = !(element.qm_data &
>> IA32_QM_CTR_ERROR_MASK);
> OK.
>
>>> +        if ( info->socket_cqmdata[i].valid )
>>> +            info->socket_cqmdata[i].l3c_occupancy = element.qm_data * cqm_upscaling_factor;
>>> +        else
>>> +            info->socket_cqmdata[i].l3c_occupancy = 0;
>>> +    }
>>> +}
>>> +
>>>  /*
>>>   * Local variables:
>>>   * mode: C
>>> diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
>>> index 15d4b91..d631769 100644
>>> --- a/xen/arch/x86/sysctl.c
>>> +++ b/xen/arch/x86/sysctl.c
>>> @@ -28,6 +28,7 @@
>>>  #include <xen/nodemask.h>
>>>  #include <xen/cpu.h>
>>>  #include <xsm/xsm.h>
>>> +#include <asm/pqos.h>
>>>
>>>  #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
>>>
>>> @@ -101,6 +102,69 @@ long arch_do_sysctl(
>>>      }
>>>      break;
>>>
>>> +    case XEN_SYSCTL_getdomcqminfolist:
>>> +    {
>> This whole hypercall makes me somewhat uneasy.
>>
>>> +        struct domain *d;
>>> +        struct xen_domctl_getdomcqminfo info;
>>> +        uint32_t resource_count;
>>> +        uint32_t resource_avail;
>>> +        uint32_t num_domains = 0;
>>> +        cpumask_t cpu_cqmdata_map;
>>> +        DECLARE_BITMAP(sockets, QOS_MAX_SOCKETS);
>>> +        unsigned int cpu;
>>> +
>>> +        if ( !system_supports_cqm() )
>>> +        {
>>> +            ret = -ENODEV;
>>> +            break;
>>> +        }
>>> +
>>> +        resource_count = get_cqm_count();
>>> +        resource_avail = get_cqm_avail();
>>> +
>>> +        cpumask_clear(&cpu_cqmdata_map);
>>> +        bitmap_zero(sockets, QOS_MAX_SOCKETS);
>>> +        for_each_online_cpu(cpu)
>>> +        {
>>> +            int i = cpu_to_socket(cpu);
>>> +            if ( test_and_set_bit(i, sockets) )
>>> +                continue;
>>> +            cpumask_set_cpu(cpu, &cpu_cqmdata_map);
>>> +        }
>> What is this doing? It appears to be finding the first cpu on each socket.
> Yes, the CQM info is per-socket, so just one CPU within the socket is enough to get/set the data.
>
>>> +
>>> +        rcu_read_lock(&domlist_read_lock);
>>> +        for_each_domain ( d )
>>> +        {
>>> +            if ( d->domain_id < sysctl->u.getdomaininfolist.first_domain )
>>> +                continue;
>>> +            if ( num_domains == sysctl->u.getdomaininfolist.max_domains )
>>> +                break;
>>> +            if ( d->arch.pqos_cqm_rmid <= 0 )
>>> +                continue;
>> Is there any case where pqos_cqm_rmid can be negative? alloc_cqm_rmid()
>> never assigns a negative number now in v2, in which case
>> d->arch.pqos_cqm_rmid can probably be unsigned (and related int rmid's
>> can be similarly promoted to unsigned)
> Yes, it should be changed in the v2 patch, sorry.
>
>>> +            memset(&info, 0, sizeof(struct xen_domctl_getdomcqminfo));
>>> +            info.domain = d->domain_id;
>>> +            get_cqm_info(d->arch.pqos_cqm_rmid, cpu_cqmdata_map, &info);
>>
>> So for each domain the hypercall caller is interested in, we get its rmid,
>> and ask get_cqm_info() to individually IPI one cpu from each socket to
>> fill in the info field?
>>
>> The IPIs are quite expensive, and this system will currently monopolise
>> the first cpu on each socket.
> Since the L3 cache usage is per-socket, the current solution is:
>  - Find the first CPU of each socket, and mark it in cpu_cqmdata_map.
>  - get_cqm_info() issues IPIs to the CPUs marked in cpu_cqmdata_map to get the per-socket data.
>
> Do you have a proposed solution for this scenario?

At the very least, capture all the RMID statistics in a single IPI.  If
the target node is running an HVM guest, the resulting VMexits would be
devastating for performance.

Beyond that, picking a random cpu on the target socket would be better
in cases where VMs are restricted to a subset of possible cpus.


A better method might be to have a per-socket buffer, sized by the
maximum number of RMIDs, which the IPI fills entirely with the current
statistics.  This allows you to capture the statistics across sockets in
parallel, rather than serialising the collection while holding the lock.
Then, the hypercall handler just needs to read the appropriate stats out
of the per-socket buffers when constructing the per-domain information.
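
Purely as an illustration of that idea (untested, and cqm_snapshots[],
MAX_RMID and the helper names below are invented for the example, not
part of this patch), a per-socket snapshot might look roughly like:

    /* Hypothetical per-socket buffer, one element per socket, sized by RMID. */
    struct cqm_socket_snapshot {
        uint64_t l3c_occupancy[MAX_RMID + 1];   /* MAX_RMID: assumed bound */
    };
    static struct cqm_socket_snapshot *cqm_snapshots;

    /* IPI handler: runs on one CPU of a socket and snapshots every RMID in
     * one pass, so each socket is interrupted once per collection. */
    static void cqm_snapshot_socket(void *arg)
    {
        struct cqm_socket_snapshot *snap = arg;
        unsigned int rmid;
        uint64_t data;

        for ( rmid = 1; rmid <= MAX_RMID; rmid++ )
        {
            wrmsr(MSR_IA32_QOSEVTSEL, QOS_MONITOR_EVTID_L3, rmid);
            rdmsrl(MSR_IA32_QMC, data);
            snap->l3c_occupancy[rmid] =
                (data & IA32_QM_CTR_ERROR_MASK) ? 0
                                                : data * cqm_upscaling_factor;
        }
    }

    /* Kick one CPU per socket without waiting (wait == 0), so the sockets
     * sample in parallel; a real implementation would still need to
     * synchronise before reading the buffers back. */
    static void cqm_snapshot_all_sockets(const cpumask_t *one_cpu_per_socket)
    {
        unsigned int cpu;

        for_each_cpu ( cpu, one_cpu_per_socket )
            on_selected_cpus(cpumask_of(cpu), cqm_snapshot_socket,
                             &cqm_snapshots[cpu_to_socket(cpu)], 0);
    }

The hypercall handler would then copy whatever it needs out of
cqm_snapshots[] rather than issuing a fresh IPI per domain.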

>
>>> +
>>> +            if ( copy_to_guest_offset(sysctl->u.getdomcqminfolist.buffer,
>>> +                                      num_domains, &info, 1) )
>>> +            {
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +
>>> +            num_domains++;
>> So this loop is primarily bounded by the number of domains, where each
>> domain with a valid rmid will result in a spate of IPIs?
>>
>> This looks like it needs hypercall continuation logic.
> OK, what about a preemption check after every domain?

This is awkward, as you can't perform the continuation with the lock
held, and releasing it causes the stats to become stale, especially if
an RMID gets altered in-between.

A completely different option would be to allow the toolstack to have RO
mappings of the per-socket buffers.  Then, the hypercall itself reduces
to just the IPIs needed to force an update of the per-socket information.

However, this then exposes a memory-based ABI to the toolstack.  One
solution would be to explicitly define the contents of the pages as
reserved, and provide libxc accessors.  This allows the ABI to be
updated with a commit altering both Xen and libxc at once, while
avoiding the risk that some third party starts relying on the layout of
the magic pages.

This has the advantage that all real processing is deferred to toolstack
userspace, and Xen is just responsible for dumping the data with enough
structure to be parsed.
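
To sketch that (none of this exists today; the structure, field and
function names are purely illustrative), the libxc side could keep the
page layout private behind an accessor:

    /* Reserved layout of the RO-mapped per-socket pages (illustrative). */
    struct xc_cqm_socket_data {
        uint64_t last_refresh;               /* e.g. when Xen last updated it */
        uint64_t l3c_occupancy[];            /* indexed by RMID */
    };

    typedef struct xc_cqm_handle {
        unsigned int nr_sockets;
        unsigned int nr_rmids;
        const struct xc_cqm_socket_data **sockets;  /* one mapping per socket */
    } xc_cqm_handle;

    /* Only Xen and libxc need to agree on the layout; callers go through
     * this accessor and never touch the mapped pages directly. */
    static inline uint64_t
    xc_cqm_l3c_occupancy(const xc_cqm_handle *h, unsigned int socket,
                         unsigned int rmid)
    {
        if ( socket >= h->nr_sockets || rmid >= h->nr_rmids )
            return 0;
        return h->sockets[socket]->l3c_occupancy[rmid];
    }

Changing the layout would then only require a matched Xen + libxc
commit, as described above.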

~Andrew

Thread overview: 32+ messages
2013-11-21  7:20 [PATCH v2 0/8] enable Cache QoS Monitoring (CQM) feature dongxiao.xu
2013-11-21  7:20 ` [PATCH v2 1/8] x86: detect and initialize Cache QoS Monitoring feature dongxiao.xu
2013-11-21 12:14   ` Andrew Cooper
2013-11-21 12:19     ` Andrew Cooper
2013-11-25  3:06     ` Xu, Dongxiao
2013-11-25 15:40       ` Andrew Cooper
2013-11-25  8:57     ` Xu, Dongxiao
2013-11-25 15:58       ` Andrew Cooper
2013-11-21  7:20 ` [PATCH v2 2/8] x86: handle CQM resource when creating/destroying guests dongxiao.xu
2013-11-21 12:33   ` Andrew Cooper
2013-11-25  3:21     ` Xu, Dongxiao
2013-11-25 16:02       ` Andrew Cooper
2013-11-21  7:20 ` [PATCH v2 3/8] tools: " dongxiao.xu
2013-11-21  7:20 ` [PATCH v2 4/8] x86: dynamically attach/detach CQM service for a guest dongxiao.xu
2013-11-21 12:50   ` Andrew Cooper
2013-11-25  3:26     ` Xu, Dongxiao
2013-11-25 16:05       ` Andrew Cooper
2013-11-25 21:06   ` Konrad Rzeszutek Wilk
2013-11-21  7:20 ` [PATCH v2 5/8] tools: " dongxiao.xu
2013-11-25 21:00   ` Konrad Rzeszutek Wilk
2013-11-25 21:01     ` Konrad Rzeszutek Wilk
2013-11-21  7:20 ` [PATCH v2 6/8] x86: get per domain CQM information dongxiao.xu
2013-11-21 14:09   ` Andrew Cooper
2013-11-25  6:20     ` Xu, Dongxiao
2013-11-25 16:28       ` Andrew Cooper [this message]
2013-11-21  7:20 ` [PATCH v2 7/8] tools: " dongxiao.xu
2013-11-21  7:20 ` [PATCH v2 8/8] x86: enable CQM monitoring for each domain RMID dongxiao.xu
2013-11-21 14:19   ` Andrew Cooper
2013-11-25  7:22     ` Xu, Dongxiao
2013-11-25 16:32       ` Andrew Cooper
2013-11-21 14:36 ` [PATCH v2 0/8] enable Cache QoS Monitoring (CQM) feature Andrew Cooper
2013-11-25  7:24   ` Xu, Dongxiao
