From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from [140.186.70.92] (port=36271 helo=eggs.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1PzsGT-0006N3-TW for qemu-devel@nongnu.org; Wed, 16 Mar 2011 11:03:59 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1PzsGS-0006Q5-6E for qemu-devel@nongnu.org; Wed, 16 Mar 2011 11:03:57 -0400 Received: from cantor.suse.de ([195.135.220.2]:38452 helo=mx1.suse.de) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1PzsGR-0006Ph-Lc for qemu-devel@nongnu.org; Wed, 16 Mar 2011 11:03:56 -0400 Message-ID: <4D80D153.7050606@suse.de> Date: Wed, 16 Mar 2011 16:03:47 +0100 From: Alexander Graf MIME-Version: 1.0 References: <1300251423-6715-1-git-send-email-david@gibson.dropbear.id.au> <1300251423-6715-16-git-send-email-david@gibson.dropbear.id.au> In-Reply-To: <1300251423-6715-16-git-send-email-david@gibson.dropbear.id.au> Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Subject: [Qemu-devel] Re: [PATCH 15/26] Virtual hash page table handling on pSeries machine List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: David Gibson Cc: paulus@samba.org, qemu-devel@nongnu.org, anton@samba.org On 03/16/2011 05:56 AM, David Gibson wrote: > On pSeries logical partitions, excepting the old POWER4-style full system > partitions, the guest does not have direct access to the hardware page > table. Instead, the pagetable exists in hypervisor memory, and the guest > must manipulate it with hypercalls. > > However, our current pSeries emulation more closely resembles the old > style where the guest must set up and handle the pagetables itself. This > patch converts it to act like a modern partition. > > This involves two things: first, the hash translation path is modified to > permit the hash table to be stored externally to the emulated machine's > RAM. 
The pSeries machine init code configures the CPUs to use this mode. > > Secondly, we emulate the PAPR hypercalls for manipulating the external > hashed page table. > > Signed-off-by: David Gibson > --- > hw/spapr.c | 32 ++++++- > hw/spapr_hcall.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++++ > target-ppc/cpu.h | 2 + > target-ppc/helper.c | 36 ++++++-- > 4 files changed, 305 insertions(+), 12 deletions(-) > > diff --git a/hw/spapr.c b/hw/spapr.c > index 25e4a9e..c3d9286 100644 > --- a/hw/spapr.c > +++ b/hw/spapr.c > @@ -50,12 +50,15 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize, > sPAPREnvironment *spapr, > target_phys_addr_t initrd_base, > target_phys_addr_t initrd_size, > - const char *kernel_cmdline) > + const char *kernel_cmdline, > + long hash_shift) > { > void *fdt; > uint64_t mem_reg_property[] = { 0, cpu_to_be64(ramsize) }; > uint32_t start_prop = cpu_to_be32(initrd_base); > uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size); > + uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)}; > + char hypertas_prop[] = "hcall-pft\0hcall-term"; > int i; > char *modelname; > int ret; > @@ -138,6 +141,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize, > * full emu, for kvm we should copy it from the host */ > _FDT((fdt_property_cell(fdt, "clock-frequency", 1000000000))); > _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr))); > + _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop)))); > _FDT((fdt_property_string(fdt, "status", "okay"))); > _FDT((fdt_property(fdt, "64-bit", NULL, 0))); > > @@ -153,6 +157,14 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize, > > _FDT((fdt_end_node(fdt))); > > + /* RTAS */ > + _FDT((fdt_begin_node(fdt, "rtas"))); > + > + _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop, > + sizeof(hypertas_prop)))); > + > + _FDT((fdt_end_node(fdt))); > + > /* vdevice */ > _FDT((fdt_begin_node(fdt, "vdevice"))); > > @@ -203,12 
+215,13 @@ static void ppc_spapr_init(ram_addr_t ram_size, > const char *cpu_model) > { > CPUState *envs[MAX_CPUS]; > - void *fdt; > + void *fdt, *htab; > int i; > ram_addr_t ram_offset; > target_phys_addr_t fdt_addr; > uint32_t kernel_base, initrd_base; > - long kernel_size, initrd_size; > + long kernel_size, initrd_size, htab_size; > + long pteg_shift = 17; > int fdt_size; > sPAPREnvironment *spapr; > > @@ -248,6 +261,16 @@ static void ppc_spapr_init(ram_addr_t ram_size, > ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size); > cpu_register_physical_memory(0, ram_size, ram_offset); > > + /* allocate hash page table */ > + htab_size = 1ULL<< (pteg_shift + 7); Linux makes the htab size depend on the provided amount of ram. Shouldn't we do the same? > + htab = qemu_mallocz(htab_size); > + > + for (i = 0; i< smp_cpus; i++) { > + envs[i]->external_htab = htab; > + envs[i]->htab_base = -1; > + envs[i]->htab_mask = htab_size - 1; > + } > + > spapr->vio_bus = spapr_vio_bus_init(); > > for (i = 0; i< MAX_SERIAL_PORTS; i++) { > @@ -293,7 +316,8 @@ static void ppc_spapr_init(ram_addr_t ram_size, > > /* Prepare the device tree */ > fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, envs, spapr, > - initrd_base, initrd_size, kernel_cmdline); > + initrd_base, initrd_size, kernel_cmdline, > + pteg_shift + 7); > if (!fdt) { > hw_error("Couldn't create pSeries device tree\n"); > exit(1); > diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c > index 6ddac00..2b14000 100644 > --- a/hw/spapr_hcall.c > +++ b/hw/spapr_hcall.c > @@ -1,8 +1,246 @@ > #include "sysemu.h" > #include "cpu.h" > #include "qemu-char.h" > +#include "sysemu.h" > +#include "qemu-char.h" > +#include "exec-all.h" > #include "hw/spapr.h" > > +#define HPTES_PER_GROUP 8 > + > +#define HPTE_V_SSIZE_SHIFT 62 > +#define HPTE_V_AVPN_SHIFT 7 > +#define HPTE_V_AVPN 0x3fffffffffffff80ULL > +#define HPTE_V_AVPN_VAL(x) (((x)& HPTE_V_AVPN)>> HPTE_V_AVPN_SHIFT) > +#define HPTE_V_COMPARE(x,y) (!(((x) ^ (y))& 
0xffffffffffffff80UL)) > +#define HPTE_V_BOLTED 0x0000000000000010ULL > +#define HPTE_V_LOCK 0x0000000000000008ULL > +#define HPTE_V_LARGE 0x0000000000000004ULL > +#define HPTE_V_SECONDARY 0x0000000000000002ULL > +#define HPTE_V_VALID 0x0000000000000001ULL > + > +#define HPTE_R_PP0 0x8000000000000000ULL > +#define HPTE_R_TS 0x4000000000000000ULL > +#define HPTE_R_KEY_HI 0x3000000000000000ULL > +#define HPTE_R_RPN_SHIFT 12 > +#define HPTE_R_RPN 0x3ffffffffffff000ULL > +#define HPTE_R_FLAGS 0x00000000000003ffULL > +#define HPTE_R_PP 0x0000000000000003ULL > +#define HPTE_R_N 0x0000000000000004ULL > +#define HPTE_R_G 0x0000000000000008ULL > +#define HPTE_R_M 0x0000000000000010ULL > +#define HPTE_R_I 0x0000000000000020ULL > +#define HPTE_R_W 0x0000000000000040ULL > +#define HPTE_R_WIMG 0x0000000000000078ULL > +#define HPTE_R_C 0x0000000000000080ULL > +#define HPTE_R_R 0x0000000000000100ULL > +#define HPTE_R_KEY_LO 0x0000000000000e00ULL > + > +#define HPTE_V_1TB_SEG 0x4000000000000000ULL > +#define HPTE_V_VRMA_MASK 0x4001ffffff000000ULL > + > +#define HPTE_V_HVLOCK 0x40ULL > + > +static inline int lock_hpte(void *hpte, target_ulong bits) > +{ > + uint64_t pteh; > + > + pteh = ldq_p(hpte); > + > + /* FIXME: probably need some sort of lockage for SMP */ Guest SMP doesn't get mapped to host SMP. So you're safe here. 
> + if (pteh& bits) { > + return 0; > + } > + stq_p(hpte, pteh | HPTE_V_HVLOCK); > + return 1; > +} > + > +static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r, > + target_ulong pte_index) > +{ > + target_ulong rb, va_low; > + > + rb = (v& ~0x7fULL)<< 16; /* AVA field */ > + va_low = pte_index>> 3; > + if (v& HPTE_V_SECONDARY) Braces > + va_low = ~va_low; > + /* xor vsid from AVA */ > + if (!(v& HPTE_V_1TB_SEG)) Braces > + va_low ^= v>> 12; > + else > + va_low ^= v>> 24; > + va_low&= 0x7ff; > + if (v& HPTE_V_LARGE) { > + rb |= 1; /* L field */ > +#if 0 /* Disable that P7 specific bit for now */ > + if (r& 0xff000) { > + /* non-16MB large page, must be 64k */ > + /* (masks depend on page size) */ > + rb |= 0x1000; /* page encoding in LP field */ > + rb |= (va_low& 0x7f)<< 16; /* 7b of VA in AVA/LP field */ > + rb |= (va_low& 0xfe); /* AVAL field */ > + } > +#endif > + } else { > + /* 4kB page */ > + rb |= (va_low& 0x7ff)<< 12; /* remaining 11b of AVA */ > + } > + rb |= (v>> 54)& 0x300; /* B field */ > + return rb; > +} > + > +static target_ulong h_enter(CPUState *env, sPAPREnvironment *spapr, > + target_ulong opcode, target_ulong *args) > +{ > + target_ulong flags = args[0]; > + target_ulong pte_index = args[1]; > + target_ulong pteh = args[2]; > + target_ulong ptel = args[3]; > + target_ulong porder; > + target_ulong i, pa; > + uint8_t *hpte; > + > + /* only handle 4k and 16M pages for now */ > + porder = 12; > + if (pteh& HPTE_V_LARGE) { > + if ((ptel& 0xf000) == 0x1000) { > + /* 64k page */ According to the comment above and the #if 0 in tlbie you don't support 64k pages? > + porder = 16; > + } else if ((ptel& 0xff000) == 0) { > + /* 16M page */ > + porder = 24; > + /* lowest AVA bit must be 0 for 16M pages */ > + if (pteh& 0x80) Braces > + return H_PARAMETER; > + } else { > + return H_PARAMETER; > + } > + } > + > + pa = ptel& HPTE_R_RPN; > + /* FIXME: bounds check the pa? 
*/ > + > + /* Check WIMG */ > + if ((ptel& HPTE_R_WIMG) != HPTE_R_M) Braces > + return H_PARAMETER; > + pteh&= ~0x60ULL; > + > + if ((pte_index * HASH_PTE_SIZE_64)& ~env->htab_mask) Braces > + return H_PARAMETER; > + if (likely((flags& H_EXACT) == 0)) { > + pte_index&= ~7ULL; > + hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64); > + for (i = 0; ; ++i) { > + if (i == 8) Braces > + return H_PTEG_FULL; > + if (((ldq_p(hpte)& HPTE_V_VALID) == 0)&& > + lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) { > + break; > + } > + hpte += HASH_PTE_SIZE_64; > + } > + } else { > + i = 0; > + hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64); > + if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) { > + return H_PTEG_FULL; > + } > + } > + stq_p(hpte + (HASH_PTE_SIZE_64/2), ptel); > + /* eieio(); FIXME: need some sort of barrier for smp? */ see above :) > + stq_p(hpte, pteh); > + > + assert (!(ldq_p(hpte)& HPTE_V_HVLOCK)); > + args[0] = pte_index + i; > + return H_SUCCESS; > +} > + > +static target_ulong h_remove(CPUState *env, sPAPREnvironment *spapr, > + target_ulong opcode, target_ulong *args) > +{ > + target_ulong flags = args[0]; > + target_ulong pte_index = args[1]; > + target_ulong avpn = args[2]; > + uint8_t *hpte; > + target_ulong v, r, rb; > + > + if ((pte_index * HASH_PTE_SIZE_64)& ~env->htab_mask) { > + return H_PARAMETER; > + } > + > + hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64); > + while (!lock_hpte(hpte, HPTE_V_HVLOCK)) { > + /* We have no real concurrency in qemu soft-emulation, so we > + * will never actually have a contested lock */ > + assert(0); > + } > + > + v = ldq_p(hpte); > + r = ldq_p(hpte + (HASH_PTE_SIZE_64/2)); > + > + if ((v& HPTE_V_VALID) == 0 || > + ((flags& H_AVPN)&& (v& ~0x7fULL) != avpn) || > + ((flags& H_ANDCOND)&& (v& avpn) != 0)) { > + stq_p(hpte, v& ~HPTE_V_HVLOCK); > + assert (!(ldq_p(hpte)& HPTE_V_HVLOCK)); > + return H_NOT_FOUND; > + } > + args[0] = v& ~HPTE_V_HVLOCK; > + args[1] = r; > + stq_p(hpte, 0); > + 
rb = compute_tlbie_rb(v, r, pte_index); > +// ppc_tlb_invalidate_one(env, rb); Huh? > + tlb_flush(env, 1); > + assert (!(ldq_p(hpte)& HPTE_V_HVLOCK)); > + return H_SUCCESS; > +} > + > +static target_ulong h_protect(CPUState *env, sPAPREnvironment *spapr, > + target_ulong opcode, target_ulong *args) > +{ > + target_ulong flags = args[0]; > + target_ulong pte_index = args[1]; > + target_ulong avpn = args[2]; > + uint8_t *hpte; > + target_ulong v, r, rb; > + > + if ((pte_index * HASH_PTE_SIZE_64)& ~env->htab_mask) { > + return H_PARAMETER; > + } > + > + hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64); > + while (!lock_hpte(hpte, HPTE_V_HVLOCK)) { > + /* We have no real concurrency in qemu soft-emulation, so we > + * will never actually have a contested lock */ > + assert(0); > + } > + > + v = ldq_p(hpte); > + r = ldq_p(hpte + (HASH_PTE_SIZE_64/2)); > + > + if ((v& HPTE_V_VALID) == 0 || > + ((flags& H_AVPN)&& (v& ~0x7fULL) != avpn)) { > + stq_p(hpte, v& ~HPTE_V_HVLOCK); > + assert (!(ldq_p(hpte)& HPTE_V_HVLOCK)); > + return H_NOT_FOUND; > + } > + > + r&= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | > + HPTE_R_KEY_HI | HPTE_R_KEY_LO); > + r |= (flags<< 55)& HPTE_R_PP0; > + r |= (flags<< 48)& HPTE_R_KEY_HI; > + r |= flags& (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); > + rb = compute_tlbie_rb(v, r, pte_index); > + stq_p(hpte, v& ~HPTE_V_VALID); > + //ppc_tlb_invalidate_one(env, rb); Huh? > + tlb_flush(env, 1); Wow, why do you need a full tlb flush here? Alex