From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:54826) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1cH2Fb-0001Dg-QV for qemu-devel@nongnu.org; Wed, 14 Dec 2016 00:37:15 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1cH2FZ-0007sh-B7 for qemu-devel@nongnu.org; Wed, 14 Dec 2016 00:37:11 -0500 Message-ID: <1481693756.1555.30.camel@gmail.com> From: Suraj Jitindar Singh Date: Wed, 14 Dec 2016 16:35:56 +1100 In-Reply-To: <20161212040603.27295-4-david@gibson.dropbear.id.au> References: <20161212040603.27295-1-david@gibson.dropbear.id.au> <20161212040603.27295-4-david@gibson.dropbear.id.au> Content-Type: text/plain; charset="UTF-8" Mime-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: Re: [Qemu-devel] [PATCHv3 3/5] pseries: Implement HPT resizing List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: David Gibson , paulus@samba.org Cc: agraf@suse.de, mdroth@linux.vnet.ibm.com, thuth@redhat.com, lvivier@redhat.com, qemu-ppc@nongnu.org, qemu-devel@nongnu.org On Mon, 2016-12-12 at 15:06 +1100, David Gibson wrote: > This patch implements hypercalls allowing a PAPR guest to resize its > own > hash page table.  This will eventually allow for more flexible memory > hotplug. > > The implementation is partially asynchronous, handled in a special > thread > running the hpt_prepare_thread() function.  The state of a pending > resize > is stored in SPAPR_MACHINE->pending_hpt. > > The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new > HPT, or, > if one is already in progress, monitor it for completion.  If there > is an > existing HPT resize in progress that doesn't match the size specified > in > the call, it will cancel it, replacing it with a new one matching the > given size. > > The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and > can only > be called successfully once H_RESIZE_HPT_PREPARE has successfully > completed initialization of a new HPT.  The guest must ensure that > there > are no concurrent accesses to the existing HPT while this is called > (this > effectively means stop_machine() for Linux guests). > > For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing > each > HPTE into the new HPT.  This can have quite high latency, but it > seems to > be of the order of typical migration downtime latencies for HPTs of > size > up to ~2GiB (which would be used in a 256GiB guest). > > In future we probably want to move more of the rehashing to the > "prepare" > phase, by having H_ENTER and other hcalls update both current and > pending HPTs.  That's a project for another day, but should be > possible > without any changes to the guest interface. > > Signed-off-by: David Gibson > --- >  hw/ppc/spapr.c          |   4 +- >  hw/ppc/spapr_hcall.c    | 345 > +++++++++++++++++++++++++++++++++++++++++++++++- >  include/hw/ppc/spapr.h  |   6 + >  target-ppc/mmu-hash64.h |   4 + >  4 files changed, 353 insertions(+), 6 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 846ce51..d057031 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -93,8 +93,6 @@ >   >  #define PHANDLE_XICP            0x00001111 >   > -#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift)) > - >  static XICSState *try_create_xics(const char *type, int nr_servers, >                                    int nr_irqs, Error **errp) >  { > @@ -1055,7 +1053,7 @@ static void close_htab_fd(sPAPRMachineState > *spapr) >      spapr->htab_fd = -1; >  } >   > -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize) > +int spapr_hpt_shift_for_ramsize(uint64_t ramsize) >  { >      int shift; >   > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index 72a9c4d..1e89061 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -2,6 +2,7 @@ >  #include "qapi/error.h" >  #include "sysemu/sysemu.h" >  #include "qemu/log.h" > +#include "qemu/error-report.h" >  #include "cpu.h" >  #include "exec/exec-all.h" >  #include "helper_regs.h" > @@ -355,20 +356,319 @@ static target_ulong h_read(PowerPCCPU *cpu, > sPAPRMachineState *spapr, >      return H_SUCCESS; >  } >   > +struct sPAPRPendingHPT { > +    /* These fields are read-only after initialization */ > +    int shift; > +    QemuThread thread; > + > +    /* These fields are protected by the BQL */ > +    bool complete; > + > +    /* These fields are private to the preparation thread if > +     * !complete, otherwise protected by the BQL */ > +    int ret; > +    void *hpt; > +}; > + > +static void free_pending_hpt(sPAPRPendingHPT *pending) > +{ > +    if (pending->hpt) { > +        qemu_vfree(pending->hpt); > +    } > + > +    g_free(pending); > +} > + > +static void *hpt_prepare_thread(void *opaque) > +{ > +    sPAPRPendingHPT *pending = opaque; > +    size_t size = 1ULL << pending->shift; > + > +    pending->hpt = qemu_memalign(size, size); > +    if (pending->hpt) { > +        memset(pending->hpt, 0, size); > +        pending->ret = H_SUCCESS; > +    } else { > +        pending->ret = H_NO_MEM; > +    } > + > +    qemu_mutex_lock_iothread(); > + > +    if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) { > +        /* Ready to go */ > +        pending->complete = true; > +    } else { > +        /* We've been cancelled, clean ourselves up */ > +        free_pending_hpt(pending); > +    } > + > +    qemu_mutex_unlock_iothread(); > +    return NULL; > +} > + > +/* Must be called with BQL held */ > +static void cancel_hpt_prepare(sPAPRMachineState *spapr) > +{ > +    sPAPRPendingHPT *pending = spapr->pending_hpt; > + > +    /* Let the thread know it's cancelled */ > +    spapr->pending_hpt = NULL; > + > +    if (!pending) { > +        /* Nothing to do */ > +        return; > +    } > + > +    if (!pending->complete) { > +        /* thread will clean itself up */ > +        return; > +    } > + > +    free_pending_hpt(pending); > +} > + > +static int build_dimm_list(Object *obj, void *opaque) > +{ > +    GSList **list = opaque; > + > +    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { > +        DeviceState *dev = DEVICE(obj); > +        if (dev->realized) { /* only realized DIMMs matter */ > +            *list = g_slist_prepend(*list, dev); > +        } > +    } > + > +    object_child_foreach(obj, build_dimm_list, opaque); > +    return 0; > +} > + > +static ram_addr_t get_current_ram_size(void) > +{ > +    GSList *list = NULL, *item; > +    ram_addr_t size = ram_size; > + > +    build_dimm_list(qdev_get_machine(), &list); > +    for (item = list; item; item = g_slist_next(item)) { > +        Object *obj = OBJECT(item->data); > +        if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) { > +            size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, > +                                            &error_abort); > +        } > +    } > +    g_slist_free(list); > + > +    return size; > +} > + >  static target_ulong h_resize_hpt_prepare(PowerPCCPU *cpu, >                                           sPAPRMachineState *spapr, >                                           target_ulong opcode, >                                           target_ulong *args) >  { >      target_ulong flags = args[0]; > -    target_ulong shift = args[1]; > +    int shift = args[1]; > +    sPAPRPendingHPT *pending = spapr->pending_hpt; >   >      if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) { >          return H_AUTHORITY; >      } >   >      trace_spapr_h_resize_hpt_prepare(flags, shift); > -    return H_HARDWARE; > + > +    if (flags != 0) { > +        return H_PARAMETER; > +    } > + > +    if (shift && ((shift < 18) || (shift > 46))) { > +        return H_PARAMETER; > +    } > + > +    if (pending) { > +        /* something already in progress */ > +        if (pending->shift == shift) { > +            /* and it's suitable */ > +            if (pending->complete) { > +                return pending->ret; > +            } else { > +                return H_LONG_BUSY_ORDER_100_MSEC; > +            } > +        } > + > +        /* not suitable, cancel and replace */ > +        cancel_hpt_prepare(spapr); > +    } > + > +    if (!shift) { > +        /* nothing to do */ > +        return H_SUCCESS; > +    } > + > +    /* start new prepare */ > + > +    /* We only allow the guest to allocate an HPT one order above > what > +     * we'd normally give them (to stop a small guest claiming a > huge > +     * chunk of resources in the HPT */ > +    if (shift > (spapr_hpt_shift_for_ramsize(get_current_ram_size()) > + 1)) Given we already allocate one lower than recommended in the ISA by default, should we set this to 2 to allow an allocation of 1 greater than the recommended minimum? > +        return H_RESOURCE; > +    } > + > +    pending = g_new0(sPAPRPendingHPT, 1); > +    pending->shift = shift; > +    pending->ret = H_HARDWARE; > + > +    qemu_thread_create(&pending->thread, "sPAPR HPT prepare", > +                       hpt_prepare_thread, pending, > QEMU_THREAD_DETACHED); > + > +    spapr->pending_hpt = pending; > + > +    /* In theory we could estimate the time more accurately based on > +     * the new size, but there's not much point */ > +    return H_LONG_BUSY_ORDER_100_MSEC; > +} > + > +static uint64_t new_hpte_load0(void *htab, uint64_t pteg, int slot) > +{ > +    uint8_t *addr = htab; > + > +    addr += pteg * HASH_PTEG_SIZE_64; > +    addr += slot * HASH_PTE_SIZE_64; > +    return  ldq_p(addr); > +} > + > +static void new_hpte_store(void *htab, uint64_t pteg, int slot, > +                           uint64_t pte0, uint64_t pte1) > +{ > +    uint8_t *addr = htab; > + > +    addr += pteg * HASH_PTEG_SIZE_64; > +    addr += slot * HASH_PTE_SIZE_64; > + > +    stq_p(addr, pte0); > +    stq_p(addr + HASH_PTE_SIZE_64/2, pte1); > +} > + > +static int rehash_hpte(PowerPCCPU *cpu, uint64_t token, > +                       void *old, uint64_t oldsize, > +                       void *new, uint64_t newsize, > +                       uint64_t pteg, int slot) > +{ > +    uint64_t old_hash_mask = (oldsize >> 7) - 1; > +    uint64_t new_hash_mask = (newsize >> 7) - 1; > +    target_ulong pte0 = ppc_hash64_load_hpte0(cpu, token, slot); > +    target_ulong pte1; > +    uint64_t avpn; > +    unsigned shift; > +    uint64_t hash, new_pteg, replace_pte0; > + > +    if (!(pte0 & HPTE64_V_VALID) || !(pte0 & HPTE64_V_BOLTED)) { > +        return H_SUCCESS; > +    } > + > +    pte1 = ppc_hash64_load_hpte1(cpu, token, slot); > + > +    shift = ppc_hash64_hpte_page_shift_noslb(cpu, pte0, pte1); > +    assert(shift); /* H_ENTER should never have allowed a bad > encoding */ > +    avpn = HPTE64_V_AVPN_VAL(pte0) & ~(((1ULL << shift) - 1) >> 23); > + > +    if (pte0 & HPTE64_V_SECONDARY) { > +        pteg = ~pteg; > +    } > + > +    if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_256M) { > +        uint64_t offset, vsid; > + > +        /* We only have 28 - 23 bits of offset in avpn */ > +        offset = (avpn & 0x1f) << 23; > +        vsid = avpn >> 5; > +        /* We can find more bits from the pteg value */ > +        if (shift < 23) { > +            offset |= ((vsid ^ pteg) & old_hash_mask) << shift; > +        } > + > +        hash = vsid ^ (offset >> shift); > +    } else if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_1T) { > +        uint64_t offset, vsid; > + > +        /* We only have 40 - 23 bits of seg_off in avpn */ > +        offset = (avpn & 0x1ffff) << 23; > +        vsid = avpn >> 17; > +        if (shift < 23) { > +            offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) > << shift; > +        } > + > +        hash = vsid ^ (vsid << 25) ^ (offset >> shift); > +    } else { > +        error_report("rehash_pte: Bad segment size in HPTE"); > +        return H_HARDWARE; > +    } > + > +    new_pteg = hash & new_hash_mask; > +    if (pte0 & HPTE64_V_SECONDARY) { > +        assert(~pteg == (hash & old_hash_mask)); > +        new_pteg = ~new_pteg; > +    } else { > +        assert(pteg == (hash & old_hash_mask)); > +    } > +    assert((oldsize != newsize) || (pteg == new_pteg)); > +    replace_pte0 = new_hpte_load0(new, new_pteg, slot); > +    if (replace_pte0 & HPTE64_V_VALID) { > +        assert(newsize < oldsize); > +        if (replace_pte0 & HPTE64_V_BOLTED) { > +            if (pte0 & HPTE64_V_BOLTED) { > +                /* Bolted collision, nothing we can do */ > +                return H_PTEG_FULL; > +            } else { > +                /* Discard this hpte */ > +                return H_SUCCESS; > +            } > +        } > +    } > + > +    new_hpte_store(new, new_pteg, slot, pte0, pte1); > +    return H_SUCCESS; > +} > + > +static int rehash_hpt(PowerPCCPU *cpu, > +                      void *old, uint64_t oldsize, > +                      void *new, uint64_t newsize) > +{ > +    CPUPPCState *env = &cpu->env; > +    uint64_t n_ptegs = oldsize >> 7; > +    uint64_t pteg; > +    int slot; > +    int rc; > + > +    assert(env->external_htab == old); > + > +    for (pteg = 0; pteg < n_ptegs; pteg++) { > +        uint64_t token = ppc_hash64_start_access(cpu, pteg * > HPTES_PER_GROUP); > + > +        if (!token) { > +            return H_HARDWARE; > +        } > + > +        for (slot = 0; slot < HPTES_PER_GROUP; slot++) { > +            rc = rehash_hpte(cpu, token, old, oldsize, new, newsize, > +                             pteg, slot); > +            if (rc != H_SUCCESS) { > +                ppc_hash64_stop_access(cpu, token); > +                return rc; > +            } > +        } > +        ppc_hash64_stop_access(cpu, token); > +    } > + > +    return H_SUCCESS; > +} > + > +static void pivot_hpt(CPUState *cs, run_on_cpu_data arg) > +{ > +    sPAPRMachineState *spapr = SPAPR_MACHINE(arg.host_ptr); > +    PowerPCCPU *cpu = POWERPC_CPU(cs); > + > +    cpu_synchronize_state(cs); > +    ppc_hash64_set_external_hpt(cpu, spapr->htab, spapr->htab_shift, > +                                &error_fatal); >  } >   >  static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu, > @@ -378,13 +678,52 @@ static target_ulong > h_resize_hpt_commit(PowerPCCPU *cpu, >  { >      target_ulong flags = args[0]; >      target_ulong shift = args[1]; > +    sPAPRPendingHPT *pending = spapr->pending_hpt; > +    int rc; > +    size_t newsize; >   >      if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) { >          return H_AUTHORITY; >      } >   >      trace_spapr_h_resize_hpt_commit(flags, shift); > -    return H_HARDWARE; > + > +    if (flags != 0) { > +        return H_PARAMETER; > +    } > + > +    if (!pending || (pending->shift != shift)) { > +        /* no matching prepare */ > +        return H_CLOSED; > +    } > + > +    if (!pending->complete) { > +        /* prepare has not completed */ > +        return H_BUSY; > +    } > + > +    newsize = 1ULL << pending->shift; > +    rc = rehash_hpt(cpu, spapr->htab, HTAB_SIZE(spapr), > +                    pending->hpt, newsize); > +    if (rc == H_SUCCESS) { > +        CPUState *cs; > + > +        qemu_vfree(spapr->htab); > +        spapr->htab = pending->hpt; > +        spapr->htab_shift = pending->shift; > + > +        CPU_FOREACH(cs) { > +            run_on_cpu(cs, pivot_hpt, RUN_ON_CPU_HOST_PTR(spapr)); > +        } > + > +        pending->hpt = NULL; /* so it's not free()d */ > +    } > + > +    /* Clean up */ > +    spapr->pending_hpt = NULL; > +    free_pending_hpt(pending); > + > +    return rc; >  } >   >  static target_ulong h_set_sprg0(PowerPCCPU *cpu, sPAPRMachineState > *spapr, > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index d2c758b..954bada 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -14,6 +14,7 @@ struct sPAPRNVRAM; >  typedef struct sPAPRConfigureConnectorState > sPAPRConfigureConnectorState; >  typedef struct sPAPREventLogEntry sPAPREventLogEntry; >  typedef struct sPAPREventSource sPAPREventSource; > +typedef struct sPAPRPendingHPT sPAPRPendingHPT; >   >  #define HPTE64_V_HPTE_DIRTY     0x0000000000000040ULL >  #define SPAPR_ENTRY_POINT       0x100 > @@ -72,6 +73,8 @@ struct sPAPRMachineState { >      sPAPRResizeHPT resize_hpt; >      void *htab; >      uint32_t htab_shift; > +    sPAPRPendingHPT *pending_hpt; /* in-progress resize */ > + >      hwaddr rma_size; >      int vrma_adjust; >      ssize_t rtas_size; > @@ -627,6 +630,7 @@ void > spapr_hotplug_req_remove_by_count_indexed(sPAPRDRConnectorType > drc_type, >                                                 uint32_t count, > uint32_t index); >  void *spapr_populate_hotplug_cpu_dt(CPUState *cs, int *fdt_offset, >                                      sPAPRMachineState *spapr); > +int spapr_hpt_shift_for_ramsize(uint64_t ramsize); >   >  /* rtas-configure-connector state */ >  struct sPAPRConfigureConnectorState { > @@ -674,4 +678,6 @@ int spapr_rng_populate_dt(void *fdt); >   >  void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data > arg); >   > +#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift)) > + >  #endif /* HW_SPAPR_H */ > diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h > index ab5d347..4a1b781 100644 > --- a/target-ppc/mmu-hash64.h > +++ b/target-ppc/mmu-hash64.h > @@ -60,11 +60,15 @@ void ppc_hash64_update_rmls(CPUPPCState *env); >  #define HASH_PTE_SIZE_64        16 >  #define HASH_PTEG_SIZE_64       (HASH_PTE_SIZE_64 * HPTES_PER_GROUP) >   > +#define HPTE64_V_SSIZE          SLB_VSID_B > +#define HPTE64_V_SSIZE_256M     SLB_VSID_B_256M > +#define HPTE64_V_SSIZE_1T       SLB_VSID_B_1T >  #define HPTE64_V_SSIZE_SHIFT    62 >  #define HPTE64_V_AVPN_SHIFT     7 >  #define HPTE64_V_AVPN           0x3fffffffffffff80ULL >  #define HPTE64_V_AVPN_VAL(x)    (((x) & HPTE64_V_AVPN) >> > HPTE64_V_AVPN_SHIFT) >  #define HPTE64_V_COMPARE(x, y)  (!(((x) ^ (y)) & > 0xffffffffffffff83ULL)) > +#define HPTE64_V_BOLTED         0x0000000000000010ULL >  #define HPTE64_V_LARGE          0x0000000000000004ULL >  #define HPTE64_V_SECONDARY      0x0000000000000002ULL >  #define HPTE64_V_VALID          0x0000000000000001ULL