From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from [140.186.70.92] (port=36271 helo=eggs.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.43) id 1PzsGT-0006N3-TW
	for qemu-devel@nongnu.org; Wed, 16 Mar 2011 11:03:59 -0400
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <agraf@suse.de>) id 1PzsGS-0006Q5-6E
	for qemu-devel@nongnu.org; Wed, 16 Mar 2011 11:03:57 -0400
Received: from cantor.suse.de ([195.135.220.2]:38452 helo=mx1.suse.de)
	by eggs.gnu.org with esmtp (Exim 4.71)
	(envelope-from <agraf@suse.de>) id 1PzsGR-0006Ph-Lc
	for qemu-devel@nongnu.org; Wed, 16 Mar 2011 11:03:56 -0400
Message-ID: <4D80D153.7050606@suse.de>
Date: Wed, 16 Mar 2011 16:03:47 +0100
From: Alexander Graf <agraf@suse.de>
MIME-Version: 1.0
References: <1300251423-6715-1-git-send-email-david@gibson.dropbear.id.au>
	<1300251423-6715-16-git-send-email-david@gibson.dropbear.id.au>
In-Reply-To: <1300251423-6715-16-git-send-email-david@gibson.dropbear.id.au>
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit
Subject: [Qemu-devel] Re: [PATCH 15/26] Virtual hash page table handling on
	pSeries machine
List-Id: qemu-devel.nongnu.org
List-Unsubscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: David Gibson <david@gibson.dropbear.id.au>
Cc: paulus@samba.org, qemu-devel@nongnu.org, anton@samba.org

On 03/16/2011 05:56 AM, David Gibson wrote:
> On pSeries logical partitions, excepting the old POWER4-style full system
> partitions, the guest does not have direct access to the hardware page
> table.  Instead, the pagetable exists in hypervisor memory, and the guest
> must manipulate it with hypercalls.
>
> However, our current pSeries emulation more closely resembles the old
> style where the guest must set up and handle the pagetables itself.  This
> patch converts it to act like a modern partition.
>
> This involves two things: first, the hash translation path is modified to
> permit the hash table to be stored externally to the emulated machine's
> RAM.  The pSeries machine init code configures the CPUs to use this mode.
>
> Secondly, we emulate the PAPR hypercalls for manipulating the external
> hashed page table.
>
> Signed-off-by: David Gibson<dwg@au1.ibm.com>
> ---
>   hw/spapr.c          |   32 ++++++-
>   hw/spapr_hcall.c    |  247 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   target-ppc/cpu.h    |    2 +
>   target-ppc/helper.c |   36 ++++++--
>   4 files changed, 305 insertions(+), 12 deletions(-)
>
> diff --git a/hw/spapr.c b/hw/spapr.c
> index 25e4a9e..c3d9286 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -50,12 +50,15 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>                                 sPAPREnvironment *spapr,
>                                 target_phys_addr_t initrd_base,
>                                 target_phys_addr_t initrd_size,
> -                              const char *kernel_cmdline)
> +                              const char *kernel_cmdline,
> +                              long hash_shift)
>   {
>       void *fdt;
>       uint64_t mem_reg_property[] = { 0, cpu_to_be64(ramsize) };
>       uint32_t start_prop = cpu_to_be32(initrd_base);
>       uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
> +    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
> +    char hypertas_prop[] = "hcall-pft\0hcall-term";
>       int i;
>       char *modelname;
>       int ret;
> @@ -138,6 +141,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>            * full emu, for kvm we should copy it from the host */
>           _FDT((fdt_property_cell(fdt, "clock-frequency", 1000000000)));
>           _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
> +        _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop))));
>           _FDT((fdt_property_string(fdt, "status", "okay")));
>           _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
>
> @@ -153,6 +157,14 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>
>       _FDT((fdt_end_node(fdt)));
>
> +    /* RTAS */
> +    _FDT((fdt_begin_node(fdt, "rtas")));
> +
> +    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
> +                       sizeof(hypertas_prop))));
> +
> +    _FDT((fdt_end_node(fdt)));
> +
>       /* vdevice */
>       _FDT((fdt_begin_node(fdt, "vdevice")));
>
> @@ -203,12 +215,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>                              const char *cpu_model)
>   {
>       CPUState *envs[MAX_CPUS];
> -    void *fdt;
> +    void *fdt, *htab;
>       int i;
>       ram_addr_t ram_offset;
>       target_phys_addr_t fdt_addr;
>       uint32_t kernel_base, initrd_base;
> -    long kernel_size, initrd_size;
> +    long kernel_size, initrd_size, htab_size;
> +    long pteg_shift = 17;
>       int fdt_size;
>       sPAPREnvironment *spapr;
>
> @@ -248,6 +261,16 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>       ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
>       cpu_register_physical_memory(0, ram_size, ram_offset);
>
> +    /* allocate hash page table */
> +    htab_size = 1ULL<<  (pteg_shift + 7);

Linux makes the htab size depend on the amount of RAM provided.
Shouldn't we do the same?

> +    htab = qemu_mallocz(htab_size);
> +
> +    for (i = 0; i<  smp_cpus; i++) {
> +        envs[i]->external_htab = htab;
> +        envs[i]->htab_base = -1;
> +        envs[i]->htab_mask = htab_size - 1;
> +    }
> +
>       spapr->vio_bus = spapr_vio_bus_init();
>
>       for (i = 0; i<  MAX_SERIAL_PORTS; i++) {
> @@ -293,7 +316,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>
>       /* Prepare the device tree */
>       fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, envs, spapr,
> -                           initrd_base, initrd_size, kernel_cmdline);
> +                           initrd_base, initrd_size, kernel_cmdline,
> +                           pteg_shift + 7);
>       if (!fdt) {
>           hw_error("Couldn't create pSeries device tree\n");
>           exit(1);
> diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
> index 6ddac00..2b14000 100644
> --- a/hw/spapr_hcall.c
> +++ b/hw/spapr_hcall.c
> @@ -1,8 +1,246 @@
>   #include "sysemu.h"
>   #include "cpu.h"
>   #include "qemu-char.h"
> +#include "sysemu.h"
> +#include "qemu-char.h"
> +#include "exec-all.h"
>   #include "hw/spapr.h"
>
> +#define HPTES_PER_GROUP 8
> +
> +#define HPTE_V_SSIZE_SHIFT      62
> +#define HPTE_V_AVPN_SHIFT       7
> +#define HPTE_V_AVPN             0x3fffffffffffff80ULL
> +#define HPTE_V_AVPN_VAL(x)      (((x)&  HPTE_V_AVPN)>>  HPTE_V_AVPN_SHIFT)
> +#define HPTE_V_COMPARE(x,y)     (!(((x) ^ (y))&  0xffffffffffffff80UL))
> +#define HPTE_V_BOLTED           0x0000000000000010ULL
> +#define HPTE_V_LOCK             0x0000000000000008ULL
> +#define HPTE_V_LARGE            0x0000000000000004ULL
> +#define HPTE_V_SECONDARY        0x0000000000000002ULL
> +#define HPTE_V_VALID            0x0000000000000001ULL
> +
> +#define HPTE_R_PP0              0x8000000000000000ULL
> +#define HPTE_R_TS               0x4000000000000000ULL
> +#define HPTE_R_KEY_HI           0x3000000000000000ULL
> +#define HPTE_R_RPN_SHIFT        12
> +#define HPTE_R_RPN              0x3ffffffffffff000ULL
> +#define HPTE_R_FLAGS            0x00000000000003ffULL
> +#define HPTE_R_PP               0x0000000000000003ULL
> +#define HPTE_R_N                0x0000000000000004ULL
> +#define HPTE_R_G                0x0000000000000008ULL
> +#define HPTE_R_M                0x0000000000000010ULL
> +#define HPTE_R_I                0x0000000000000020ULL
> +#define HPTE_R_W                0x0000000000000040ULL
> +#define HPTE_R_WIMG             0x0000000000000078ULL
> +#define HPTE_R_C                0x0000000000000080ULL
> +#define HPTE_R_R                0x0000000000000100ULL
> +#define HPTE_R_KEY_LO           0x0000000000000e00ULL
> +
> +#define HPTE_V_1TB_SEG          0x4000000000000000ULL
> +#define HPTE_V_VRMA_MASK        0x4001ffffff000000ULL
> +
> +#define HPTE_V_HVLOCK           0x40ULL
> +
> +static inline int lock_hpte(void *hpte, target_ulong bits)
> +{
> +    uint64_t pteh;
> +
> +    pteh = ldq_p(hpte);
> +
> +    /* FIXME: probably need some sort of lockage for SMP */

Guest SMP doesn't get mapped to host SMP. So you're safe here.

> +    if (pteh&  bits) {
> +        return 0;
> +    }
> +    stq_p(hpte, pteh | HPTE_V_HVLOCK);
> +    return 1;
> +}
> +
> +static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r,
> +                                     target_ulong pte_index)
> +{
> +    target_ulong rb, va_low;
> +
> +    rb = (v&  ~0x7fULL)<<  16; /* AVA field */
> +    va_low = pte_index>>  3;
> +    if (v&  HPTE_V_SECONDARY)

Braces

> +        va_low = ~va_low;
> +    /* xor vsid from AVA */
> +    if (!(v&  HPTE_V_1TB_SEG))

Braces

> +        va_low ^= v>>  12;
> +    else
> +        va_low ^= v>>  24;
> +    va_low&= 0x7ff;
> +    if (v&  HPTE_V_LARGE) {
> +        rb |= 1;                         /* L field */
> +#if 0 /* Disable that P7 specific bit for now */
> +        if (r&  0xff000) {
> +            /* non-16MB large page, must be 64k */
> +            /* (masks depend on page size) */
> +            rb |= 0x1000;                /* page encoding in LP field */
> +            rb |= (va_low&  0x7f)<<  16; /* 7b of VA in AVA/LP field */
> +            rb |= (va_low&  0xfe);       /* AVAL field */
> +        }
> +#endif
> +    } else {
> +        /* 4kB page */
> +        rb |= (va_low&  0x7ff)<<  12;   /* remaining 11b of AVA */
> +    }
> +    rb |= (v>>  54)&  0x300;            /* B field */
> +    return rb;
> +}
> +
> +static target_ulong h_enter(CPUState *env, sPAPREnvironment *spapr,
> +                            target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong flags = args[0];
> +    target_ulong pte_index = args[1];
> +    target_ulong pteh = args[2];
> +    target_ulong ptel = args[3];
> +    target_ulong porder;
> +    target_ulong i, pa;
> +    uint8_t *hpte;
> +
> +    /* only handle 4k and 16M pages for now */
> +    porder = 12;
> +    if (pteh&  HPTE_V_LARGE) {
> +        if ((ptel&  0xf000) == 0x1000) {
> +            /* 64k page */

According to the comment above and the #if 0 in compute_tlbie_rb(), you
don't support 64k pages?

> +            porder = 16;
> +        } else if ((ptel&  0xff000) == 0) {
> +            /* 16M page */
> +            porder = 24;
> +            /* lowest AVA bit must be 0 for 16M pages */
> +            if (pteh&  0x80)

Braces

> +                return H_PARAMETER;
> +        } else {
> +            return H_PARAMETER;
> +        }
> +    }
> +
> +    pa = ptel&  HPTE_R_RPN;
> +    /* FIXME: bounds check the pa? */
> +
> +    /* Check WIMG */
> +    if ((ptel&  HPTE_R_WIMG) != HPTE_R_M)

Braces

> +        return H_PARAMETER;
> +    pteh&= ~0x60ULL;
> +
> +    if ((pte_index * HASH_PTE_SIZE_64)&  ~env->htab_mask)

Braces

> +        return H_PARAMETER;
> +    if (likely((flags&  H_EXACT) == 0)) {
> +        pte_index&= ~7ULL;
> +        hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +        for (i = 0; ; ++i) {
> +            if (i == 8)

Braces

> +                return H_PTEG_FULL;
> +            if (((ldq_p(hpte)&  HPTE_V_VALID) == 0)&&
> +                lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
> +                break;
> +            }
> +            hpte += HASH_PTE_SIZE_64;
> +        }
> +    } else {
> +        i = 0;
> +        hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +        if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
> +            return H_PTEG_FULL;
> +        }
> +    }
> +    stq_p(hpte + (HASH_PTE_SIZE_64/2), ptel);
> +    /* eieio();  FIXME: need some sort of barrier for smp? */

see above :)

> +    stq_p(hpte, pteh);
> +
> +    assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +    args[0] = pte_index + i;
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_remove(CPUState *env, sPAPREnvironment *spapr,
> +                             target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong flags = args[0];
> +    target_ulong pte_index = args[1];
> +    target_ulong avpn = args[2];
> +    uint8_t *hpte;
> +    target_ulong v, r, rb;
> +
> +    if ((pte_index * HASH_PTE_SIZE_64)&  ~env->htab_mask) {
> +        return H_PARAMETER;
> +    }
> +
> +    hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
> +        /* We have no real concurrency in qemu soft-emulation, so we
> +         * will never actually have a contested lock */
> +        assert(0);
> +    }
> +
> +    v = ldq_p(hpte);
> +    r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
> +
> +    if ((v&  HPTE_V_VALID) == 0 ||
> +        ((flags&  H_AVPN)&&  (v&  ~0x7fULL) != avpn) ||
> +        ((flags&  H_ANDCOND)&&  (v&  avpn) != 0)) {
> +        stq_p(hpte, v&  ~HPTE_V_HVLOCK);
> +        assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +        return H_NOT_FOUND;
> +    }
> +    args[0] = v&  ~HPTE_V_HVLOCK;
> +    args[1] = r;
> +    stq_p(hpte, 0);
> +    rb = compute_tlbie_rb(v, r, pte_index);
> +//    ppc_tlb_invalidate_one(env, rb);

Huh? Why is this call commented out?

> +    tlb_flush(env, 1);
> +    assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_protect(CPUState *env, sPAPREnvironment *spapr,
> +                              target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong flags = args[0];
> +    target_ulong pte_index = args[1];
> +    target_ulong avpn = args[2];
> +    uint8_t *hpte;
> +    target_ulong v, r, rb;
> +
> +    if ((pte_index * HASH_PTE_SIZE_64)&  ~env->htab_mask) {
> +        return H_PARAMETER;
> +    }
> +
> +    hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
> +        /* We have no real concurrency in qemu soft-emulation, so we
> +         * will never actually have a contested lock */
> +        assert(0);
> +    }
> +
> +    v = ldq_p(hpte);
> +    r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
> +
> +    if ((v&  HPTE_V_VALID) == 0 ||
> +        ((flags&  H_AVPN)&&  (v&  ~0x7fULL) != avpn)) {
> +        stq_p(hpte, v&  ~HPTE_V_HVLOCK);
> +        assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +        return H_NOT_FOUND;
> +    }
> +
> +    r&= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
> +           HPTE_R_KEY_HI | HPTE_R_KEY_LO);
> +    r |= (flags<<  55)&  HPTE_R_PP0;
> +    r |= (flags<<  48)&  HPTE_R_KEY_HI;
> +    r |= flags&  (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
> +    rb = compute_tlbie_rb(v, r, pte_index);
> +    stq_p(hpte, v&  ~HPTE_V_VALID);
> +    //ppc_tlb_invalidate_one(env, rb);

Huh? Why is this call commented out?

> +    tlb_flush(env, 1);

Wow, why do you need a full tlb flush here?


Alex