Re: [RFC PATCH 1/3] plugins: Added a new cache modelling plugin

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Alex Bennée" <alex.bennee@linaro.org>
To: Mahmoud Mandour <ma.mandourr@gmail.com>
Cc: qemu-devel@nongnu.org
Subject: Re: [RFC PATCH 1/3] plugins: Added a new cache modelling plugin
Date: Tue, 01 Jun 2021 10:53:08 +0100	[thread overview]
Message-ID: <87a6o9deiy.fsf@linaro.org> (raw)
In-Reply-To: <20210529152203.40358-2-ma.mandourr@gmail.com>


Mahmoud Mandour <ma.mandourr@gmail.com> writes:

> Added a cache modelling plugin that uses a static configuration used in
> many of the commercial microprocessors and uses random eviction policy.
>
> Signed-off-by: Mahmoud Mandour <ma.mandourr@gmail.com>
> ---
>  contrib/plugins/Makefile |   1 +
>  contrib/plugins/cache.c  | 398 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 399 insertions(+)
>  create mode 100644 contrib/plugins/cache.c
>
> diff --git a/contrib/plugins/Makefile b/contrib/plugins/Makefile
> index b9d7935e5e..2237b47f8b 100644
> --- a/contrib/plugins/Makefile
> +++ b/contrib/plugins/Makefile
> @@ -18,6 +18,7 @@ NAMES += hotpages
>  NAMES += howvec
>  NAMES += lockstep
>  NAMES += hwprofile
> +NAMES += cache
>  
>  SONAMES := $(addsuffix .so,$(addprefix lib,$(NAMES)))
>  
> diff --git a/contrib/plugins/cache.c b/contrib/plugins/cache.c
> new file mode 100644
> index 0000000000..f8c15ebed2
> --- /dev/null
> +++ b/contrib/plugins/cache.c
> @@ -0,0 +1,398 @@
> +/*
> + * Copyright (C) 2021, Mahmoud Mandour <ma.mandourr@gmail.com>
> + *
> + * License: GNU GPL, version 2 or later.
> + *   See the COPYING file in the top-level directory.
> + */
> +
> +#include <inttypes.h>
> +#include <assert.h>
> +#include <stdlib.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <glib.h>
> +
> +#include <qemu-plugin.h>
> +
> +QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
> +
> +static GRand *rng;
> +static GHashTable *dmiss_ht;
> +static GHashTable *imiss_ht;
> +
> +static GMutex dmtx, imtx;
> +
> +static int limit;
> +static bool sys;
> +
> +static uint64_t dmem_accesses;
> +static uint64_t dmisses;
> +
> +static uint64_t imem_accesses;
> +static uint64_t imisses;
> +
> +static enum qemu_plugin_mem_rw rw = QEMU_PLUGIN_MEM_RW;
> +
> +enum AccessResult {
> +    HIT = 0,
> +    MISS = 1
> +};
> +
> +struct InsnData {
> +    char *disas_str;
> +    uint64_t addr;
> +    uint64_t misses;
> +};

A little commentary to the relationship between CacheSet and CacheBlock
would be useful here for those trying to follow the code. Maybe a little
ascii art if your up to it?

> +
> +struct CacheBlock {
> +    uint64_t tag;
> +    bool valid;
> +};
> +
> +struct CacheSet {
> +    struct CacheBlock *blocks;
> +};
> +
> +struct Cache {
> +    struct CacheSet *sets;
> +    int num_sets;
> +
> +    int cachesize;
> +    int blksize;
> +    int assoc;
> +
> +    uint64_t blk_mask;
> +    uint64_t set_mask;
> +    uint64_t tag_mask;
> +};
> +
> +struct Cache *dcache, *icache;
> +
> +static int pow_of_two(int num)
> +{
> +    g_assert((num & (num - 1)) == 0);
> +    int ret = 0;
> +    while (num /= 2) {
> +        ret++;
> +    }
> +    return ret;
> +}

You could probably eliminate this by:

  a) pre-calculating masks and shifts at start-up
  b) expressing cache-size as a power of 2 (are caches ever not?)

Currently it is by far the biggest hit on the CPU:

  46.42%  qemu-aarch64  libcache.so              [.] pow_of_two
  16.71%  qemu-aarch64  libcache.so              [.] lru_update_on_hit
  14.12%  qemu-aarch64  libcache.so              [.] in_cache
   6.73%  qemu-aarch64  libcache.so              [.] extract_tag
   4.52%  qemu-aarch64  libcache.so              [.] extract_set
   4.48%  qemu-aarch64  libcache.so              [.] access_cache
   2.34%  qemu-aarch64  libcache.so              [.] vcpu_insn_exec
   1.63%  qemu-aarch64  libcache.so              [.] vcpu_mem_access
   0.72%  qemu-aarch64  libglib-2.0.so.0.5800.3  [.] g_mutex_lock

> +static inline uint64_t extract_tag(struct Cache *cache, uint64_t addr)
> +{
> +    return (addr & cache->tag_mask) >>
> +        (pow_of_two(cache->num_sets) + pow_of_two(cache->blksize));
> +}
> +
> +static inline uint64_t extract_set(struct Cache *cache, uint64_t addr)
> +{
> +    return (addr & cache->set_mask) >> (pow_of_two(cache->blksize));
> +}

It would make sense to enforce pow_of_two for num_sets and blksize on
initialisation to avoid doing it for every tag. Maybe rename them to
set_shift and blksize_shift to better indicate their usage.

> +
> +static struct Cache *cache_init(int blksize, int assoc, int cachesize)
> +{
> +    struct Cache *cache;
> +    int i;
> +
> +    cache = g_new(struct Cache, 1);
> +    cache->blksize = blksize;
> +    cache->assoc = assoc;
> +    cache->cachesize = cachesize;
> +    cache->num_sets = cachesize / (blksize * assoc);
> +    cache->sets = g_new(struct CacheSet, cache->num_sets);
> +
> +    for (i = 0; i < cache->num_sets; i++) {
> +        cache->sets[i].blocks = g_new0(struct CacheBlock, assoc);
> +    }
> +
> +    cache->blk_mask = blksize - 1;
> +    cache->set_mask = ((cache->num_sets - 1) << (pow_of_two(cache->blksize)));
> +    cache->tag_mask = ~(cache->set_mask | cache->blk_mask);
> +
> +    return cache;
> +}
> +
> +static int get_invalid_block(struct Cache *cache, uint64_t set)
> +{
> +    int i;
> +
> +    for (i = 0; i < cache->assoc; i++) {
> +        if (!cache->sets[set].blocks[i].valid) {
> +            /* conflict miss */
> +            return i;
> +        }
> +    }
> +
> +    /* compulsary miss */
> +    return -1;
> +}
> +
> +static int get_replaced_block(struct Cache *cache)
> +{
> +    return g_rand_int_range(rng, 0, cache->assoc);
> +}
> +
> +static bool in_cache(struct Cache *cache, uint64_t addr)
> +{
> +    int i;
> +    uint64_t tag, set;
> +
> +    tag = extract_tag(cache, addr);
> +    set = extract_set(cache, addr);
> +
> +    for (i = 0; i < cache->assoc; i++) {
> +        if (cache->sets[set].blocks[i].tag == tag &&
> +                cache->sets[set].blocks[i].valid) {
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +static enum AccessResult access_cache(struct Cache *cache, uint64_t
> addr)

Does the enum really make things easier compared to a straight bool? You
could just explain things with a comment:

  /**
   * access_cache() - simulate a cache access
   * @cache: reference to the cache being used
   * @addr: address of cached entity
   *
   * Returns true if the cache hit, false otherwise and the cache is
   * updated for next time.
   */
  static bool access_cache(struct Cache *cache, uint64_t addr)
  {

And then have:

  if (!access_cache(icache, addr)) {
      struct InsnData *insn = get_or_create(imiss_ht, userdata, addr);
      insn->misses++;
      imisses++;
  }


> +{
> +    uint64_t tag, set;
> +    int replaced_blk;
> +
> +    if (in_cache(cache, addr)) {
> +        return HIT;
> +    }
> +
> +    tag = extract_tag(cache, addr);
> +    set = extract_set(cache, addr);
> +
> +    replaced_blk = get_invalid_block(cache, set);
> +
> +    if (replaced_blk == -1) {
> +        replaced_blk = get_replaced_block(cache);
> +    }
> +
> +    cache->sets[set].blocks[replaced_blk].tag = tag;
> +    cache->sets[set].blocks[replaced_blk].valid = true;
> +
> +    return MISS;
> +}
> +
> +struct InsnData *get_or_create(GHashTable *ht, struct InsnData *insn_data,
> +                               uint64_t addr)
> +{
> +    struct InsnData *insn = g_hash_table_lookup(ht, GUINT_TO_POINTER(addr));
> +    if (!insn) {
> +        g_hash_table_insert(ht, GUINT_TO_POINTER(addr), (gpointer) insn_data);
> +        insn = insn_data;
> +    }
> +
> +    return insn;
> +}
> +
> +static void vcpu_mem_access(unsigned int cpu_index, qemu_plugin_meminfo_t info,
> +                            uint64_t vaddr, void *userdata)
> +{
> +    uint64_t insn_addr;
> +    uint64_t effective_addr;
> +    struct qemu_plugin_hwaddr *hwaddr;
> +
> +    g_mutex_lock(&dmtx);
> +    hwaddr = qemu_plugin_get_hwaddr(info, vaddr);
> +    if (hwaddr && qemu_plugin_hwaddr_is_io(hwaddr)) {
> +        g_free(userdata);
> +        return;
> +    }
> +
> +    insn_addr = ((struct InsnData *) userdata)->addr;
> +    effective_addr = hwaddr ? qemu_plugin_hwaddr_phys_addr(hwaddr) : vaddr;
> +
> +    if (access_cache(dcache, effective_addr) == MISS) {
> +        struct InsnData *insn = get_or_create(dmiss_ht, userdata, insn_addr);
> +        insn->misses++;
> +        dmisses++;
> +    }
> +    dmem_accesses++;
> +    g_mutex_unlock(&dmtx);
> +}
> +
> +static void vcpu_insn_exec(unsigned int vcpu_index, void *userdata)
> +{
> +    uint64_t addr;
> +
> +    g_mutex_lock(&imtx);
> +    addr = ((struct InsnData *) userdata)->addr;
> +
> +    if (access_cache(icache, addr) == MISS) {
> +        struct InsnData *insn = get_or_create(imiss_ht, userdata, addr);
> +        insn->misses++;
> +        imisses++;
> +    }
> +    imem_accesses++;
> +    g_mutex_unlock(&imtx);
> +}
> +
> +static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
> +{
> +    size_t n_insns;
> +    size_t i;
> +
> +    n_insns = qemu_plugin_tb_n_insns(tb);
> +    for (i = 0; i < n_insns; i++) {
> +        struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);
> +        uint64_t effective_addr;
> +
> +        if (sys) {
> +            effective_addr = (uint64_t) qemu_plugin_insn_haddr(insn);
> +        } else {
> +            effective_addr = (uint64_t) qemu_plugin_insn_vaddr(insn);
> +        }
> +
> +        struct InsnData *ddata = g_new(struct InsnData, 1);
> +        struct InsnData *idata = g_new(struct InsnData, 1);

OK I think I see what you where saying on the sync up earlier. You need
to take into account any given instruction may get translated multiple
times so I think for any given instruction you are tracking you want to
get or create an entry here.

> +
> +        ddata->disas_str = qemu_plugin_insn_disas(insn);
> +        ddata->misses = 0;
> +        ddata->addr = effective_addr;
> +
> +        idata->disas_str = g_strdup(ddata->disas_str);
> +        idata->misses = 0;
> +        idata->addr = effective_addr;

And you might as well combine the InsnData to track both data and icache
misses in one structure to avoid the duplication of records and strings.

> +
> +        qemu_plugin_register_vcpu_mem_cb(insn, vcpu_mem_access,
> +                                         QEMU_PLUGIN_CB_NO_REGS,
> +                                         rw, ddata);
> +
> +        qemu_plugin_register_vcpu_insn_exec_cb(insn, vcpu_insn_exec,
> +                                               QEMU_PLUGIN_CB_NO_REGS, idata);
> +    }
> +}
> +
> +static void print_entry(gpointer data)
> +{
> +    struct InsnData *insn = (struct InsnData *) data;
> +    g_autoptr(GString) xx = g_string_new("");
> +    g_string_append_printf(xx, "0x%" PRIx64 ": %s - misses: %lu\n",
> +            insn->addr, insn->disas_str, insn->misses);

As you are likely going to want to post-process this data I would
suggest a slightly more machine readable format:

  address, misses, instruction
  0x419298, 2, mov x0, x21
  0x41aa40, 2, add x0, x0, #0x17
  0x419640, 2,  add x5, x4, #0x218
  0x41aa10, 2,  adrp x1, #0x48b000
  0x4002d0, 2,  adrp x16, #0x48b000

> +    qemu_plugin_outs(xx->str);
> +}
> +
> +static void free_insn(gpointer data)
> +{
> +    struct InsnData *insn = (struct InsnData *) data;
> +    g_free(insn->disas_str);
> +    g_free(insn);
> +}
> +
> +static void free_cache(struct Cache *cache)
> +{
> +    for (int i = 0; i < cache->num_sets; i++) {
> +        g_free(cache->sets[i].blocks);
> +    }
> +
> +    g_free(cache->sets);
> +}
> +
> +static int cmp(gconstpointer a, gconstpointer b)

This will likely need renaming if you ever want to sort by different
things.

> +{
> +    struct InsnData *insn_a = (struct InsnData *) a;
> +    struct InsnData *insn_b = (struct InsnData *) b;
> +
> +    return insn_a->misses < insn_b->misses ? 1 : -1;
> +}
> +
> +static void print_stats()
> +{
> +    g_autoptr(GString) rep = g_string_new("");
> +    g_string_append_printf(rep,
> +            "Data accesses: %lu, Misses: %lu\nMiss rate: %lf%%\n\n",
> +            dmem_accesses,
> +            dmisses,
> +            ((double)dmisses / dmem_accesses) * 100.0);
> +
> +    g_string_append_printf(rep,
> +            "Instruction accesses: %lu, Misses: %lu\nMiss rate: %lf%%\n\n",
> +            imem_accesses,
> +            imisses,
> +            ((double)imisses / imem_accesses) * 100.0);
> +
> +    qemu_plugin_outs(rep->str);
> +}
> +
> +static void plugin_exit()
> +{
> +    GList *curr;
> +    int i;
> +
> +    g_mutex_lock(&imtx);
> +    g_mutex_lock(&dmtx);
> +    GList *dmiss_insns = g_hash_table_get_values(dmiss_ht);
> +    GList *imiss_insns = g_hash_table_get_values(imiss_ht);
> +    dmiss_insns = g_list_sort(dmiss_insns, cmp);
> +    imiss_insns = g_list_sort(imiss_insns, cmp);
> +
> +    print_stats();
> +
> +    qemu_plugin_outs("Most data-missing instructions\n");
> +    for (curr = dmiss_insns, i = 0; curr && i < limit; i++, curr = curr->next) {
> +        print_entry(curr->data);
> +    }
> +
> +    qemu_plugin_outs("\nMost fetch-missing instructions\n");
> +    for (curr = imiss_insns, i = 0; curr && i < limit; i++, curr = curr->next) {
> +        print_entry(curr->data);
> +    }
> +
> +    free_cache(dcache);
> +    free_cache(icache);
> +
> +    g_list_free(dmiss_insns);
> +    g_list_free(imiss_insns);
> +
> +    g_hash_table_destroy(dmiss_ht);
> +    g_hash_table_destroy(imiss_ht);
> +
> +    g_mutex_unlock(&dmtx);
> +    g_mutex_unlock(&imtx);
> +}
> +
> +QEMU_PLUGIN_EXPORT
> +int qemu_plugin_install(qemu_plugin_id_t id, const qemu_info_t *info,
> +                        int argc, char **argv)
> +{
> +    int i;
> +    int iassoc, iblksize, icachesize;
> +    int dassoc, dblksize, dcachesize;
> +
> +    limit = 32;
> +    sys = info->system_emulation;
> +
> +    dassoc = 8;
> +    dblksize = 64;
> +    dcachesize = dblksize * dassoc * 32;
> +
> +    iassoc = 8;
> +    iblksize = 64;
> +    icachesize = iblksize * iassoc * 32;
> +
> +    rng = g_rand_new();
> +
> +    for (i = 0; i < argc; i++) {
> +        char *opt = argv[i];
> +        if (g_str_has_prefix(opt, "limit=")) {
> +            limit = g_ascii_strtoull(opt + 6, NULL, 10);
> +        } else {
> +            fprintf(stderr, "option parsing failed: %s\n", opt);
> +            return -1;
> +        }
> +    }
> +
> +    dcache = cache_init(dblksize, dassoc, dcachesize);
> +    icache = cache_init(iblksize, iassoc, icachesize);
> +
> +    qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
> +    qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
> +
> +    dmiss_ht = g_hash_table_new_full(NULL, g_direct_equal, NULL, free_insn);
> +    imiss_ht = g_hash_table_new_full(NULL, g_direct_equal, NULL, free_insn);
> +
> +    return 0;
> +}


-- 
Alex Bennée

next prev parent reply	other threads:[~2021-06-01 11:14 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-29 15:22 [RFC PATCH 0/3] Cache modelling TCG plugin Mahmoud Mandour
2021-05-29 15:22 ` [RFC PATCH 1/3] plugins: Added a new cache modelling plugin Mahmoud Mandour
2021-06-01  9:53   ` Alex Bennée [this message]
2021-06-02  3:51     ` Mahmoud Mandour
2021-05-29 15:22 ` [RFC PATCH 2/3] plugins: cache: Enabled parameterization and added trace printing Mahmoud Mandour
2021-05-29 15:22 ` [RFC PATCH 3/3] plugins: cache: Added FIFO and LRU eviction policies Mahmoud Mandour
2021-05-30  6:36 ` [RFC PATCH 0/3] Cache modelling TCG plugin Philippe Mathieu-Daudé
2021-05-30  6:41   ` Mahmoud Mandour

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87a6o9deiy.fsf@linaro.org \
    --to=alex.bennee@linaro.org \
    --cc=ma.mandourr@gmail.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.