Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Daniel P. Berrangé" <berrange@redhat.com>
To: Anthony Harivel <aharivel@redhat.com>
Cc: pbonzini@redhat.com, mtosatti@redhat.com, qemu-devel@nongnu.org,
	vchundur@redhat.com, rjarry@redhat.com
Subject: Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
Date: Wed, 17 Apr 2024 18:55:59 +0100	[thread overview]
Message-ID: <ZiANL6KkfZy3APJa@redhat.com> (raw)
In-Reply-To: <20240411121434.253353-4-aharivel@redhat.com>

On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:


> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index fad9a7e8ff30..37f68c496807 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -544,4 +544,6 @@ uint32_t kvm_dirty_ring_size(void);
>   * reported for the VM.
>   */
>  bool kvm_hwpoisoned_mem(void);
> +
> +bool kvm_is_rapl_feat_enable(CPUState *cs);
>  #endif
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index 882e37e12c5b..8dbeda473c6c 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -14,6 +14,9 @@
>  #include "qemu/accel.h"
>  #include "qemu/queue.h"
>  #include "sysemu/kvm.h"
> +#include "hw/boards.h"
> +#include "hw/i386/topology.h"
> +#include "io/channel-socket.h"
>  
>  typedef struct KVMSlot
>  {
> @@ -48,6 +51,34 @@ typedef struct KVMMemoryListener {
>  
>  #define KVM_MSI_HASHTAB_SIZE    256
>  
> +typedef struct KVMHostTopoInfo {
> +    /* Number of package on the Host */
> +    unsigned int maxpkgs;
> +    /* Number of cpus on the Host */
> +    unsigned int maxcpus;
> +    /* Number of cpus on each different package */
> +    unsigned int *pkg_cpu_count;
> +    /* Each package can have different maxticks */
> +    unsigned int *maxticks;
> +} KVMHostTopoInfo;
> +
> +struct KVMMsrEnergy {
> +    pid_t pid;
> +    bool enable;
> +    char *socket_path;
> +    QIOChannelSocket *sioc;
> +    QemuThread msr_thr;
> +    unsigned int vcpus;
> +    unsigned int vsockets;

Add 'guest_' prefix on to these two, to make it clearer

> +    X86CPUTopoInfo topo_info;

Lets call this 'guest_topo' too, to make it more explicitly
distinguished from the next 'host_topo' field.

> +    KVMHostTopoInfo host_topo;
> +    const CPUArchIdList *cpu_list;

This name choice has an unfortunate side effect.

cpu.h has a '#define cpu_list x86_cpu_list' for the purposes
of selecting the target specific  cpu_list function.

Unfortunately that macro affects your field name here, which
is why the code is wierdly able to set an 'x86_cpu_list' field
despite you having declared it 'cpu_list'.

I'd suggest perhaps calling this field 'guest_cpus' to avoid
this confusing side-effect, and again also making clear that
this is tracking guest, not host, CPUs.

> +    uint64_t *msr_value;
> +    uint64_t msr_unit;
> +    uint64_t msr_limit;
> +    uint64_t msr_info;
> +};
> +
>  enum KVMDirtyRingReaperState {
>      KVM_DIRTY_RING_REAPER_NONE = 0,
>      /* The reaper is sleeping */
> @@ -114,6 +145,7 @@ struct KVMState
>      bool kvm_dirty_ring_with_bitmap;
>      uint64_t kvm_eager_split_size;  /* Eager Page Splitting chunk size */
>      struct KVMDirtyRingReaper reaper;
> +    struct KVMMsrEnergy msr_energy;
>      NotifyVmexitOption notify_vmexit;
>      uint32_t notify_window;
>      uint32_t xen_version;

> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index e68cbe929302..3de69caa229e 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c

> @@ -2513,6 +2565,339 @@ static void register_smram_listener(Notifier *n, void *unused)
>                                   &smram_address_space, 1, "kvm-smram");
>  }
>  
> +static void *kvm_msr_energy_thread(void *data)
> +{
> +    KVMState *s = data;
> +    struct KVMMsrEnergy *vmsr = &s->msr_energy;
> +
> +    g_autofree package_energy_stat *pkg_stat = NULL;
> +    g_autofree thread_stat *thd_stat = NULL;
> +    g_autofree pid_t *thread_ids = NULL;
> +    g_autofree CPUState *cpu = NULL;
> +    g_autofree unsigned int *vpkgs_energy_stat = NULL;
> +    unsigned int num_threads = 0;
> +    unsigned int tmp_num_threads = 0;
> +
> +    X86CPUTopoIDs topo_ids;
> +
> +    rcu_register_thread();
> +
> +    /* Allocate memory for each package energy status */
> +    pkg_stat = g_new0(package_energy_stat, vmsr->host_topo.maxpkgs);
> +
> +    /* Allocate memory for thread stats */
> +    thd_stat = g_new0(thread_stat, 1);
> +
> +    /* Allocate memory for holding virtual package energy counter */
> +    vpkgs_energy_stat = g_new0(unsigned int, vmsr->vsockets);
> +
> +    /* Populate the max tick of each packages */
> +    for (int i = 0; i <= vmsr->host_topo.maxpkgs; i++) {

The 'maxticks' array is allocated as 'maxpkgs' in size, so this
limit condition needs to be '<' not '<=', to avoid an out of
bounds write.

> +        /*
> +         * Max numbers of ticks per package
> +         * Time in second * Number of ticks/second * Number of cores/package
> +         * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
> +         */
> +        vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
> +                        * sysconf(_SC_CLK_TCK)
> +                        * vmsr->host_topo.pkg_cpu_count[i];
> +    }
> +
> +    while (true) {
> +        /* Get all qemu threads id */
> +        thread_ids = vmsr_get_thread_ids(vmsr->pid, &num_threads);

Since you're re-allocating thread_ids on each loop iteration, you
need to free it, not overwrite the pointer. This is easily achieved
by moving the earlier declaration inside the loop body. ie 


   g_autofree pid_t *thread_ids =
        thread_ids = vmsr_get_thread_ids(vmsr->pid, &num_threads);

> +
> +        if (thread_ids == NULL) {
> +            goto clean;
> +        }
> +
> +        if (tmp_num_threads < num_threads) {
> +            thd_stat = g_renew(thread_stat, thd_stat, num_threads);
> +        }
> +
> +        tmp_num_threads = num_threads;

You could eliminate 'tmp_num_threads' and just unconditionally
call g_renew on each loop iteration. It'll effectively be a
no-op if the size hasn't changed since the last iteration,
and won't hurt performance in any measurable way.

> +
> +        /* Populate all the thread stats */
> +        for (int i = 0; i < num_threads; i++) {
> +            thd_stat[i].utime = g_new0(unsigned long long, 2);
> +            thd_stat[i].stime = g_new0(unsigned long long, 2);
> +            thd_stat[i].thread_id = thread_ids[i];
> +            vmsr_read_thread_stat(vmsr->pid,
> +                                  thd_stat[i].thread_id,
> +                                  thd_stat[i].utime,
> +                                  thd_stat[i].stime,
> +                                  &thd_stat[i].cpu_id);
> +            thd_stat[i].pkg_id =
> +                vmsr_get_physical_package_id(thd_stat[i].cpu_id);
> +        }
> +
> +        /* Retrieve all packages power plane energy counter */
> +        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
> +            for (int j = 0; j < num_threads; j++) {
> +                /*
> +                 * Use the first thread we found that ran on the CPU
> +                 * of the package to read the packages energy counter
> +                 */
> +                if (thd_stat[j].pkg_id == i) {
> +                    pkg_stat[i].e_start =
> +                    vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
> +                                  thd_stat[j].cpu_id,
> +                                  thd_stat[j].thread_id,
> +                                  s->msr_energy.sioc);
> +                    break;
> +                }
> +            }
> +        }
> +
> +        /* Sleep a short period while the other threads are working */
> +        usleep(MSR_ENERGY_THREAD_SLEEP_US);
> +
> +        /*
> +         * Retrieve all packages power plane energy counter
> +         * Calculate the delta of all packages
> +         */
> +        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
> +            for (int j = 0; j < num_threads; j++) {
> +                /*
> +                 * Use the first thread we found that ran on the CPU
> +                 * of the package to read the packages energy counter
> +                 */
> +                if (thd_stat[j].pkg_id == i) {
> +                    pkg_stat[i].e_end =
> +                    vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
> +                                  thd_stat[j].cpu_id,
> +                                  thd_stat[j].thread_id,
> +                                  s->msr_energy.sioc);
> +                    /*
> +                     * Prevent the case we have migrate the VM
> +                     * during the sleep period or any other cases
> +                     * were energy counter might be lower after
> +                     * the sleep period.
> +                     */
> +                    if (pkg_stat[i].e_end > pkg_stat[i].e_start) {
> +                        pkg_stat[i].e_delta =
> +                            pkg_stat[i].e_end - pkg_stat[i].e_start;
> +                    } else {
> +                        pkg_stat[i].e_delta = 0;
> +                    }
> +                    break;
> +                }
> +            }
> +        }
> +
> +        /* Delta of ticks spend by each thread between the sample */
> +        for (int i = 0; i < num_threads; i++) {
> +            vmsr_read_thread_stat(vmsr->pid,
> +                                  thd_stat[i].thread_id,
> +                                  thd_stat[i].utime,
> +                                  thd_stat[i].stime,
> +                                  &thd_stat[i].cpu_id);
> +
> +            if (vmsr->pid < 0) {
> +                /*
> +                 * We don't count the dead thread
> +                 * i.e threads that existed before the sleep
> +                 * and not anymore
> +                 */
> +                thd_stat[i].delta_ticks = 0;
> +            } else {
> +                vmsr_delta_ticks(thd_stat, i);
> +            }
> +        }
> +
> +        /*
> +         * Identify the vcpu threads
> +         * Calculate the number of vcpu per package
> +         */
> +        CPU_FOREACH(cpu) {
> +            for (int i = 0; i < num_threads; i++) {
> +                if (cpu->thread_id == thd_stat[i].thread_id) {
> +                    thd_stat[i].is_vcpu = true;
> +                    thd_stat[i].vcpu_id = cpu->cpu_index;
> +                    pkg_stat[thd_stat[i].pkg_id].nb_vcpu++;
> +                    thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu);
> +                    break;
> +                }
> +            }
> +        }
> +
> +        /* Retrieve the virtual package number of each vCPU */
> +        for (int i = 0; i < vmsr->x86_cpu_list->len; i++) {
> +            for (int j = 0; j < num_threads; j++) {
> +                if ((thd_stat[j].acpi_id == vmsr->x86_cpu_list->cpus[i].arch_id)
> +                    && (thd_stat[j].is_vcpu == true)) {
> +                    x86_topo_ids_from_apicid(thd_stat[j].acpi_id,
> +                        &vmsr->topo_info, &topo_ids);
> +                    thd_stat[j].vpkg_id = topo_ids.pkg_id;
> +                }
> +            }
> +        }
> +
> +        /* Calculate the total energy of all non-vCPU thread */
> +        for (int i = 0; i < num_threads; i++) {
> +            double temp;

Move this inside the 'if', and assign to it at time of declaration.

> +            if ((thd_stat[i].is_vcpu != true) &&
> +                (thd_stat[i].delta_ticks > 0)) {
> +                temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
> +                    thd_stat[i].delta_ticks,
> +                    vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
> +                pkg_stat[thd_stat[i].pkg_id].e_ratio
> +                    += (uint64_t)lround(temp);
> +            }
> +        }
> +
> +        /* Calculate the ratio per non-vCPU thread of each package */
> +        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
> +            if (pkg_stat[i].nb_vcpu > 0) {
> +                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
> +            }
> +        }
> +
> +        /*
> +         * Calculate the energy for each Package:
> +         * Energy Package = sum of each vCPU energy that belongs to the package
> +         */
> +        for (int i = 0; i < num_threads; i++) {
> +            double temp;

Aain can move inside the 'if'

> +
> +            if ((thd_stat[i].is_vcpu == true) && \
> +                    (thd_stat[i].delta_ticks > 0)) {
> +                temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
> +                    thd_stat[i].delta_ticks,
> +                    vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
> +                vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
> +                    (uint64_t)lround(temp);
> +                vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
> +                    pkg_stat[thd_stat[i].pkg_id].e_ratio;
> +            }
> +        }
> +
> +        /*
> +         * Finally populate the vmsr register of each vCPU with the total
> +         * package value to emulate the real hardware where each CPU return the
> +         * value of the package it belongs.
> +         */
> +        for (int i = 0; i < num_threads; i++) {
> +            if ((thd_stat[i].is_vcpu == true) && \
> +                    (thd_stat[i].delta_ticks > 0)) {
> +                vmsr->msr_value[thd_stat[i].vcpu_id] = \
> +                                        vpkgs_energy_stat[thd_stat[i].vpkg_id];
> +          }
> +        }
> +
> +        /* Freeing memory before zeroing the pointer */
> +        for (int i = 0; i < num_threads; i++) {
> +            g_free(thd_stat[i].utime);
> +            g_free(thd_stat[i].stime);
> +        }
> +        /* Zeroing the reused memory with g_renew */
> +        memset(thd_stat, 0, num_threads * sizeof(thread_stat));

This memset() should be done immediately after the 'g_renew'
call, since if g_renew enlarged the array, it won't have
zero'd the newly added elements.

> +        memset(thread_ids, 0, sizeof(pid_t));

We need to be free'ing thread_ids, not zero'ing it. This
memset can just be removed entirely if you follow my
earlier suggestion to rely on g_autofree and declare
thread_ids inside the loop body.

> +    }
> +
> +clean:
> +    rcu_unregister_thread();
> +    return NULL;
> +}
> +
> +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
> +{
> +    struct KVMMsrEnergy *r = &s->msr_energy;
> +    int ret = 0;
> +
> +    /*
> +     * Sanity check
> +     * 1. Host cpu must be Intel cpu
> +     * 2. RAPL must be enabled on the Host
> +     */
> +    if (is_host_cpu_intel()) {
> +        error_report("The RAPL feature can only be enabled on hosts\
> +                      with Intel CPU models");
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    if (!is_rapl_enabled()) {
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    /* Retrieve the virtual topology */
> +    vmsr_init_topo_info(&r->topo_info, ms);
> +
> +    /* Retrieve the number of vcpu */
> +    r->vcpus = ms->smp.cpus;
> +
> +    /* Retrieve the number of virtual sockets */
> +    r->vsockets = ms->smp.sockets;
> +
> +    /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */
> +    r->msr_value = g_new0(uint64_t, r->vcpus);
> +
> +    /* Retrieve the CPUArchIDlist */
> +    r->x86_cpu_list = x86_possible_cpu_arch_ids(ms);
> +
> +    /* Max number of cpus on the Host */
> +    r->host_topo.maxcpus = vmsr_get_maxcpus();
> +    if (r->host_topo.maxcpus == 0) {
> +        error_report("host max cpus = 0");
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    /* Max number of packages on the host */
> +    r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus);
> +    if (r->host_topo.maxpkgs == 0) {
> +        error_report("host max pkgs = 0");
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    /* Allocate memory for each package on the host */
> +    r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs);
> +    r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs);
> +
> +    vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count,
> +                                r->host_topo.maxpkgs);
> +    for (int i = 0; i < r->host_topo.maxpkgs; i++) {
> +        if (r->host_topo.pkg_cpu_count[i] == 0) {
> +            error_report("cpu per packages = 0 on package_%d", i);
> +            ret = 1;
> +            goto out;
> +        }
> +    }
> +
> +    /* Get QEMU PID*/
> +    r->pid = getpid();
> +
> +    /* Compute the socket path if necessary */
> +    if (s->msr_energy.socket_path == NULL) {
> +        s->msr_energy.socket_path = vmsr_compute_default_paths();
> +    }
> +
> +    /* Open socket with vmsr helper */
> +    s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path);

Need to detect & report error if opening the socket failed.

> +
> +    /* Those MSR values should not change */
> +    r->msr_unit  = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid,
> +                                    s->msr_energy.sioc);
> +    r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid,
> +                                    s->msr_energy.sioc);
> +    r->msr_info  = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid,
> +                                    s->msr_energy.sioc);
> +    if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) {
> +        error_report("can't read any virtual msr");
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    qemu_thread_create(&r->msr_thr, "kvm-msr",
> +                       kvm_msr_energy_thread,
> +                       s, QEMU_THREAD_JOINABLE);
> +out:
> +    return ret;
> +}
> +
>  int kvm_arch_get_default_type(MachineState *ms)
>  {
>      return 0;

> diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
> new file mode 100644
> index 000000000000..934d14c4305f
> --- /dev/null
> +++ b/target/i386/kvm/vmsr_energy.c
> @@ -0,0 +1,335 @@
> +/*
> + * QEMU KVM support -- x86 virtual RAPL msr
> + *
> + * Copyright 2024 Red Hat, Inc. 2024
> + *
> + *  Author:
> + *      Anthony Harivel <aharivel@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/error-report.h"
> +#include "vmsr_energy.h"
> +#include "io/channel.h"
> +#include "io/channel-socket.h"
> +#include "hw/boards.h"
> +#include "cpu.h"
> +#include "host-cpu.h"

> +
> +QIOChannelSocket *vmsr_open_socket(const char *path)
> +{
> +    g_autofree char *socket_path = NULL;
> +
> +    socket_path = g_strdup(path);

Assign this at time of declaring socket_path

> +
> +    SocketAddress saddr = {
> +        .type = SOCKET_ADDRESS_TYPE_UNIX,
> +        .u.q_unix.path = socket_path
> +    };
> +
> +    QIOChannelSocket *sioc = qio_channel_socket_new();
> +    Error *local_err = NULL;
> +
> +    qio_channel_set_name(QIO_CHANNEL(sioc), "vmsr-helper");
> +    qio_channel_socket_connect_sync(sioc,
> +                                    &saddr,
> +                                    &local_err);
> +    if (local_err) {
> +        /* Close socket. */
> +        qio_channel_close(QIO_CHANNEL(sioc), NULL);
> +        object_unref(OBJECT(sioc));

You need a 'return NULL' here, otherwise we're returning
a free'd pointer to the caller. Aso we need to report the
error.

> +    }
> +
> + return sioc;

Indent misaligned.

> +}
> +
> +uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id, uint32_t tid,
> +                       QIOChannelSocket *sioc)
> +{
> +    uint64_t data = 0;
> +    int r = 0;
> +    Error *local_err = NULL;
> +    g_autofree uint32_t buffer[3];

g_autofree is wrong for a stack allocated array, just
remove it.

> +    /*
> +     * Send the required arguments:
> +     * 1. RAPL MSR register to read
> +     * 2. On which CPU ID
> +     * 3. From which vCPU (Thread ID)
> +     */
> +    buffer[0] = reg;
> +    buffer[1] = cpu_id;
> +    buffer[2] = tid;
> +
> +    r = qio_channel_write_all(QIO_CHANNEL(sioc),
> +                              (char *)buffer, sizeof(buffer),
> +                              &local_err);
> +    if (r < 0) {
> +        goto out_close;
> +    }
> +
> +    r = qio_channel_read_all(QIO_CHANNEL(sioc),
> +                             (char *)data, sizeof(data),
> +                             &local_err);
> +    if (r < 0) {
> +        data = 0;
> +        goto out_close;
> +    }
> +
> +out_close:
> +   return data;
> +}

> +/* Retrieve the max number of physical cpu on the host */
> +unsigned int vmsr_get_maxcpus(void)
> +{
> +    GDir *dir;
> +    const gchar *entry_name;
> +    unsigned int cpu_count = 0;
> +    const char *path = "/sys/devices/system/cpu/";
> +
> +    dir = g_dir_open(path, 0, NULL);
> +    if (dir == NULL) {
> +        error_report("Unable to open cpu directory");

Slightly more friendly to include the path in the error
message, and also the error that glib reports.eg

   g_autoptr(GError) gerr = NULL;
   GDir *dir = g_dir_open(path, 0, &gerr);
   if (dir == NULL) {
       error_report("Unable to open cpu directory %s: %s", path, err->message);

> +        return -1;
> +    }
> +
> +    while ((entry_name = g_dir_read_name(dir)) != NULL) {
> +        if (g_ascii_strncasecmp(entry_name, "cpu", 3) == 0 &&
> +            isdigit(entry_name[3])) {
> +            cpu_count++;
> +        }
> +    }
> +
> +    g_dir_close(dir);
> +
> +    return cpu_count;
> +}
> +
> +/* Count the number of physical cpu on each packages */
> +unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
> +                                         unsigned int max_pkgs)
> +{
> +    g_autofree char *file_contents = NULL;
> +    g_autofree char *path = NULL;
> +    gsize length;
> +
> +    /* Iterate over cpus and count cpus in each package */
> +    for (int cpu_id = 0; ; cpu_id++) {
> +        path = g_build_filename(
> +                g_strdup_printf("/sys/devices/system/cpu/cpu%d/"
> +                    "topology/physical_package_id", cpu_id), NULL);

The result of 'g_strdup_printf' needs assigning to a variable
marked with 'g_autofree', before being passed to g_build_filename
to avoid a mem leak.

> +
> +        if (!g_file_get_contents(path, &file_contents, &length, NULL)) {
> +            break; /* No more cpus */
> +        }
> +
> +        /* Get the physical package ID for this CPU */
> +        int package_id = atoi(file_contents);
> +
> +        /* Check if the package ID is within the known number of packages */
> +        if (package_id >= 0 && package_id < max_pkgs) {
> +            /* If yes, count the cpu for this package*/
> +            package_count[package_id]++;
> +        }
> +    }
> +
> +    return 0;
> +}

> +/* Read the scheduled time for a given thread of a give pid */
> +void vmsr_read_thread_stat(pid_t pid,
> +                      unsigned int thread_id,
> +                      unsigned long long *utime,
> +                      unsigned long long *stime,
> +                      unsigned int *cpu_id)
> +{
> +    g_autofree char *path = NULL;
> +
> +    path = g_build_filename(g_strdup_printf("/proc/%u/task/%d/stat", pid, \
> +            thread_id), NULL);

Again need to assign g_strdup_printf result to a g_autofree
marked variable to avoid memleak.

> +
> +    FILE *file = fopen(path, "r");
> +    if (file == NULL) {
> +        pid = -1;
> +        return;
> +    }
> +
> +    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
> +        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
> +        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
> +           utime, stime, cpu_id) != 3)
> +    {
> +        pid = -1;
> +        return;
> +    }
> +
> +    fclose(file);
> +    return;
> +}
> +
> +/* Read QEMU stat task folder to retrieve all QEMU threads ID */
> +pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads)
> +{
> +    g_autofree char *path = g_build_filename("/proc",
> +        g_strdup_printf("%d/task", pid), NULL);

Another memleak of g_strdup_printf()result

> +
> +    DIR *dir = opendir(path);
> +    if (dir == NULL) {
> +        error_report("Error opening /proc/qemu/task");
> +        return NULL;
> +    }
> +
> +    pid_t *thread_ids = NULL;
> +    unsigned int thread_count = 0;
> +
> +    g_autofree struct dirent *ent = NULL;
> +    while ((ent = readdir(dir)) != NULL) {
> +        if (ent->d_name[0] == '.') {
> +            continue;
> +        }
> +        pid_t tid = atoi(ent->d_name);
> +        if (pid != tid) {
> +            thread_ids = g_renew(pid_t, thread_ids, (thread_count + 1));
> +            thread_ids[thread_count] = tid;
> +            thread_count++;
> +        }
> +    }
> +
> +    closedir(dir);
> +
> +    *num_threads = thread_count;
> +    return thread_ids;
> +}

> diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> new file mode 100644
> index 000000000000..a04114179346
> --- /dev/null
> +++ b/target/i386/kvm/vmsr_energy.h
> @@ -0,0 +1,99 @@
> +/*
> + * QEMU KVM support -- x86 virtual energy-related MSR.
> + *
> + * Copyright 2024 Red Hat, Inc. 2024
> + *
> + *  Author:
> + *      Anthony Harivel <aharivel@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef VMSR_ENERGY_H
> +#define VMSR_ENERGY_H
> +
> +#include <stdint.h>
> +#include "qemu/osdep.h"
> +#include "io/channel-socket.h"
> +#include "hw/i386/topology.h"
> +
> +/*
> + * Define the interval time in micro seconds between 2 samples of
> + * energy related MSRs
> + */
> +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> +
> +/*
> + * Thread statistic
> + * @ thread_id: TID (thread ID)
> + * @ is_vcpu: true if TID is vCPU thread
> + * @ cpu_id: CPU number last executed on
> + * @ pkg_id: package number of the CPU
> + * @ vcpu_id: vCPU ID
> + * @ vpkg: virtual package number
> + * @ acpi_id: APIC id of the vCPU
> + * @ utime: amount of clock ticks the thread
> + *          has been scheduled in User mode
> + * @ stime: amount of clock ticks the thread
> + *          has been scheduled in System mode
> + * @ delta_ticks: delta of utime+stime between
> + *          the two samples (before/after sleep)
> + */
> +struct thread_stat {

Suggest 'vmsr_thread_stat' to better namespace 

> +    unsigned int thread_id;
> +    bool is_vcpu;
> +    unsigned int cpu_id;
> +    unsigned int pkg_id;
> +    unsigned int vpkg_id;
> +    unsigned int vcpu_id;
> +    unsigned long acpi_id;
> +    unsigned long long *utime;
> +    unsigned long long *stime;
> +    unsigned long long delta_ticks;
> +};
> +
> +/*
> + * Package statistic
> + * @ e_start: package energy counter before the sleep
> + * @ e_end: package energy counter after the sleep
> + * @ e_delta: delta of package energy counter
> + * @ e_ratio: store the energy ratio of non-vCPU thread
> + * @ nb_vcpu: number of vCPU running on this package
> + */
> +struct package_energy_stat {

And 'vmsr_package_energy_stat' too


> +    uint64_t e_start;
> +    uint64_t e_end;
> +    uint64_t e_delta;
> +    uint64_t e_ratio;
> +    unsigned int nb_vcpu;
> +};
> +
> +typedef struct thread_stat thread_stat;
> +typedef struct package_energy_stat package_energy_stat;
> +
> +char *vmsr_compute_default_paths(void);
> +void vmsr_read_thread_stat(pid_t pid,
> +                      unsigned int thread_id,
> +                      unsigned long long *utime,
> +                      unsigned long long *stime,
> +                      unsigned int *cpu_id);
> +
> +QIOChannelSocket *vmsr_open_socket(const char *path);
> +uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id,
> +                       uint32_t tid, QIOChannelSocket *sioc);
> +void vmsr_delta_ticks(thread_stat *thd_stat, int i);
> +unsigned int vmsr_get_maxcpus(void);
> +unsigned int vmsr_get_max_physical_package(unsigned int max_cpus);
> +unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
> +                                         unsigned int max_pkgs);
> +int vmsr_get_physical_package_id(int cpu_id);
> +pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads);
> +double vmsr_get_ratio(uint64_t e_delta,
> +                      unsigned long long delta_ticks,
> +                      unsigned int maxticks);
> +void vmsr_init_topo_info(X86CPUTopoInfo *topo_info, const MachineState *ms);
> +bool is_host_cpu_intel(void);
> +int is_rapl_enabled(void);
> +#endif /* VMSR_ENERGY_H */

Bearing in mind the other feedback from Intel about future CPUs
not using MSRs for this interface, I wonder if a rename is worth
doing.

eg call the file 'rapl.h', and the APIs / structs all have a
'rapl_' name prefix, instead of 'vmsr'.

Then use 'rapl_vmsr_' as a name prefix on the ones that are
specifically tied to the MSR based impl. In future we can
have 'rapl_tpmi' as the prefix for the new interface.


With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

next prev parent reply	other threads:[~2024-04-17 17:57 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-11 12:14 [PATCH v5 0/3] Add support for the RAPL MSRs series Anthony Harivel
2024-04-11 12:14 ` [PATCH v5 1/3] qio: add support for SO_PEERCRED for socket channel Anthony Harivel
2024-04-12 10:58   ` Paolo Bonzini
2024-04-11 12:14 ` [PATCH v5 2/3] tools: build qemu-vmsr-helper Anthony Harivel
2024-04-11 12:14 ` [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu Anthony Harivel
2024-04-17 10:07   ` Zhao Liu
2024-04-17 12:27     ` Daniel P. Berrangé
2024-04-17 15:13       ` Zhao Liu
2024-04-18  8:33         ` Anthony Harivel
2024-04-18 10:52     ` Anthony Harivel
2024-04-19 15:47       ` Zhao Liu
2024-04-17 17:55   ` Daniel P. Berrangé [this message]
2024-04-18 16:42   ` Daniel P. Berrangé
2024-04-18 16:57     ` Anthony Harivel
2024-04-25 15:34     ` Anthony Harivel
2024-04-25 15:42       ` Daniel P. Berrangé
2024-04-26  8:36         ` Anthony Harivel
2024-05-06  9:41           ` Anthony Harivel
2024-04-12 10:57 ` [PATCH v5 0/3] Add support for the RAPL MSRs series Paolo Bonzini
2024-04-17 17:58   ` Daniel P. Berrangé
2024-04-19 17:30     ` Paolo Bonzini
2024-04-17 17:23 ` Daniel P. Berrangé
2024-04-18  8:08   ` Anthony Harivel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZiANL6KkfZy3APJa@redhat.com \
    --to=berrange@redhat.com \
    --cc=aharivel@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=rjarry@redhat.com \
    --cc=vchundur@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.