[Qemu-devel] [PATCH 04/22] pseries: Allow KVM Book3S-HV on PPC970 CPUS

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Alexander Graf <agraf@suse.de>
To: qemu-devel@nongnu.org
Cc: Blue Swirl <blauwirbel@gmail.com>,
	Paul Mackerras <paulus@samba.org>,
	qemu-ppc@nongnu.org, David Gibson <david@gibson.dropbear.id.au>
Subject: [Qemu-devel] [PATCH 04/22] pseries: Allow KVM Book3S-HV on PPC970 CPUS
Date: Sun, 30 Oct 2011 21:22:55 +0100	[thread overview]
Message-ID: <1320006193-15219-5-git-send-email-agraf@suse.de> (raw)
In-Reply-To: <1320006193-15219-1-git-send-email-agraf@suse.de>

From: David Gibson <david@gibson.dropbear.id.au>

At present, using the hypervisor aware Book3S-HV KVM will only work
with qemu on POWER7 CPUs.  PPC970 CPUs also have hypervisor
capability, but they lack the VRMA feature which makes assigning guest
memory easier.

In order to allow KVM Book3S-HV on PPC970, we need to specially
allocate the first chunk of guest memory (the "Real Mode Area" or
RMA), so that it is physically contiguous.

Sufficiently recent host kernels allow such contiguous RMAs to be
allocated, with a kvm capability advertising whether the feature is
available and/or necessary on this hardware.  This patch enables qemu
to use this support, thus allowing kvm acceleration of pseries qemu
machines on PPC970 hardware.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>

---

agraf: fix to use memory api
---
 hw/spapr.c           |   56 +++++++++++++++++++++++++++++++++++++++----------
 target-ppc/kvm.c     |   44 +++++++++++++++++++++++++++++++++++++++
 target-ppc/kvm_ppc.h |    8 +++++++
 3 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/hw/spapr.c b/hw/spapr.c
index c2675e1..193398b 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -91,6 +91,7 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num)
 }
 
 static void *spapr_create_fdt_skel(const char *cpu_model,
+                                   target_phys_addr_t rma_size,
                                    target_phys_addr_t initrd_base,
                                    target_phys_addr_t initrd_size,
                                    const char *boot_device,
@@ -99,7 +100,9 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
 {
     void *fdt;
     CPUState *env;
-    uint64_t mem_reg_property[] = { 0, cpu_to_be64(ram_size) };
+    uint64_t mem_reg_property_rma[] = { 0, cpu_to_be64(rma_size) };
+    uint64_t mem_reg_property_nonrma[] = { cpu_to_be64(rma_size),
+                                           cpu_to_be64(ram_size - rma_size) };
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
     uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
@@ -145,15 +148,25 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
 
     _FDT((fdt_end_node(fdt)));
 
-    /* memory node */
+    /* memory node(s) */
     _FDT((fdt_begin_node(fdt, "memory@0")));
 
     _FDT((fdt_property_string(fdt, "device_type", "memory")));
-    _FDT((fdt_property(fdt, "reg",
-                       mem_reg_property, sizeof(mem_reg_property))));
-
+    _FDT((fdt_property(fdt, "reg", mem_reg_property_rma,
+                       sizeof(mem_reg_property_rma))));
     _FDT((fdt_end_node(fdt)));
 
+    if (ram_size > rma_size) {
+        char mem_name[32];
+
+        sprintf(mem_name, "memory@%" PRIx64, (uint64_t)rma_size);
+        _FDT((fdt_begin_node(fdt, mem_name)));
+        _FDT((fdt_property_string(fdt, "device_type", "memory")));
+        _FDT((fdt_property(fdt, "reg", mem_reg_property_nonrma,
+                           sizeof(mem_reg_property_nonrma))));
+        _FDT((fdt_end_node(fdt)));
+    }
+
     /* cpus */
     _FDT((fdt_begin_node(fdt, "cpus")));
 
@@ -346,6 +359,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     int i;
     MemoryRegion *sysmem = get_system_memory();
     MemoryRegion *ram = g_new(MemoryRegion, 1);
+    target_phys_addr_t rma_alloc_size, rma_size;
     uint32_t initrd_base;
     long kernel_size, initrd_size, fw_size;
     long pteg_shift = 17;
@@ -354,10 +368,23 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     spapr = g_malloc(sizeof(*spapr));
     cpu_ppc_hypercall = emulate_spapr_hypercall;
 
-    /* We place the device tree just below either the top of RAM, or
-     * 2GB, so that it can be processed with 32-bit code if
-     * necessary */
-    spapr->fdt_addr = MIN(ram_size, 0x80000000) - FDT_MAX_SIZE;
+    /* Allocate RMA if necessary */
+    rma_alloc_size = kvmppc_alloc_rma("ppc_spapr.rma", sysmem);
+
+    if (rma_alloc_size == -1) {
+        hw_error("qemu: Unable to create RMA\n");
+        exit(1);
+    }
+    if (rma_alloc_size && (rma_alloc_size < ram_size)) {
+        rma_size = rma_alloc_size;
+    } else {
+        rma_size = ram_size;
+    }
+
+    /* We place the device tree just below either the top of the RMA,
+     * or just below 2GB, whichever is lowere, so that it can be
+     * processed with 32-bit real mode code if necessary */
+    spapr->fdt_addr = MIN(rma_size, 0x80000000) - FDT_MAX_SIZE;
     spapr->rtas_addr = spapr->fdt_addr - RTAS_MAX_SIZE;
 
     /* init CPUs */
@@ -382,8 +409,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 
     /* allocate RAM */
     spapr->ram_limit = ram_size;
-    memory_region_init_ram(ram, NULL, "ppc_spapr.ram", spapr->ram_limit);
-    memory_region_add_subregion(sysmem, 0, ram);
+    if (spapr->ram_limit > rma_alloc_size) {
+        ram_addr_t nonrma_base = rma_alloc_size;
+        ram_addr_t nonrma_size = spapr->ram_limit - rma_alloc_size;
+
+        memory_region_init_ram(ram, NULL, "ppc_spapr.ram", nonrma_size);
+        memory_region_add_subregion(sysmem, nonrma_base, ram);
+    }
 
     /* allocate hash page table.  For now we always make this 16mb,
      * later we should probably make it scale to the size of guest
@@ -507,7 +539,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     }
 
     /* Prepare the device tree */
-    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
+    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
                                             initrd_base, initrd_size,
                                             boot_device, kernel_cmdline,
                                             pteg_shift + 7);
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 6c7ca6f..fd17d23 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -55,6 +55,7 @@ static int cap_interrupt_level = false;
 static int cap_segstate;
 static int cap_booke_sregs;
 static int cap_ppc_smt;
+static int cap_ppc_rma;
 
 /* XXX We have a race condition where we actually have a level triggered
  *     interrupt, but the infrastructure can't expose that yet, so the guest
@@ -79,6 +80,7 @@ int kvm_arch_init(KVMState *s)
     cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
     cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
     cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
+    cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
 
     if (!cap_interrupt_level) {
         fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
@@ -758,6 +760,48 @@ int kvmppc_smt_threads(void)
     return cap_ppc_smt ? cap_ppc_smt : 1;
 }
 
+off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
+{
+    void *rma;
+    off_t size;
+    int fd;
+    struct kvm_allocate_rma ret;
+    MemoryRegion *rma_region;
+
+    /* If cap_ppc_rma == 0, contiguous RMA allocation is not supported
+     * if cap_ppc_rma == 1, contiguous RMA allocation is supported, but
+     *                      not necessary on this hardware
+     * if cap_ppc_rma == 2, contiguous RMA allocation is needed on this hardware
+     *
+     * FIXME: We should allow the user to force contiguous RMA
+     * allocation in the cap_ppc_rma==1 case.
+     */
+    if (cap_ppc_rma < 2) {
+        return 0;
+    }
+
+    fd = kvm_vm_ioctl(kvm_state, KVM_ALLOCATE_RMA, &ret);
+    if (fd < 0) {
+        fprintf(stderr, "KVM: Error on KVM_ALLOCATE_RMA: %s\n",
+                strerror(errno));
+        return -1;
+    }
+
+    size = MIN(ret.rma_size, 256ul << 20);
+
+    rma = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+    if (rma == MAP_FAILED) {
+        fprintf(stderr, "KVM: Error mapping RMA: %s\n", strerror(errno));
+        return -1;
+    };
+
+    rma_region = g_new(MemoryRegion, 1);
+    memory_region_init_ram_ptr(rma_region, NULL, name, size, rma);
+    memory_region_add_subregion(sysmem, 0, rma_region);
+
+    return size;
+}
+
 bool kvm_arch_stop_on_emulation_error(CPUState *env)
 {
     return true;
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index c298411..82f32f4 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -9,6 +9,8 @@
 #ifndef __KVM_PPC_H__
 #define __KVM_PPC_H__
 
+#include "memory.h"
+
 void kvmppc_init(void);
 
 #ifdef CONFIG_KVM
@@ -19,6 +21,7 @@ int kvmppc_get_hypercall(CPUState *env, uint8_t *buf, int buf_len);
 int kvmppc_set_interrupt(CPUState *env, int irq, int level);
 void kvmppc_set_papr(CPUState *env);
 int kvmppc_smt_threads(void);
+off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
 
 #else
 
@@ -51,6 +54,11 @@ static inline int kvmppc_smt_threads(void)
     return 1;
 }
 
+static inline off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
+{
+    return 0;
+}
+
 #endif
 
 #ifndef CONFIG_KVM
-- 
1.6.0.2

next prev parent reply	other threads:[~2011-10-30 20:14 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-10-30 20:22 [Qemu-devel] [PULL 00/22] ppc patch queue 2011-10-30 Alexander Graf
2011-10-30 20:22 ` [Qemu-devel] [PATCH 01/22] ppc/e500_pci: Fix code style Alexander Graf
2011-10-30 20:22 ` [Qemu-devel] [PATCH 02/22] ppc/e500_pci: Fix an array overflow issue Alexander Graf
2011-10-30 20:22 ` [Qemu-devel] [PATCH 03/22] pseries: Support SMT systems for KVM Book3S-HV Alexander Graf
2011-10-30 20:22 ` Alexander Graf [this message]
2011-10-30 20:22 ` [Qemu-devel] [PATCH 05/22] pseries: Use Book3S-HV TCE acceleration capabilities Alexander Graf
2011-10-30 20:22 ` [Qemu-devel] [PATCH 06/22] pseries: Update SLOF firmware image Alexander Graf
2011-10-30 20:22 ` [Qemu-devel] [PATCH 07/22] Set an invalid-bits mask for each SPE instructions Alexander Graf
2011-10-30 20:22 ` [Qemu-devel] [PATCH 08/22] ppc: Generalize the kvmppc_get_clockfreq() function Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 09/22] pseries: Add device tree properties for VMX/VSX and DFP under kvm Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 10/22] pseries: Update SLOF firmware image Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 11/22] ppc: Remove broken partial PVR matching Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 12/22] ppc: First cut implementation of -cpu host Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 13/22] ppc: Add cpu defs for POWER7 revisions 2.1 and 2.3 Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 14/22] pseries: Under kvm use guest cpu = host cpu by default Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 15/22] PPC: Bump qemu-system-ppc to 64-bit physical address space Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 16/22] PPC: Disable non-440 CPUs for ppcemb target Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 17/22] ppc: Avoid decrementer related kvm exits Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 18/22] PPC: Fail configure when libfdt is not available Alexander Graf
2011-11-01 19:28   ` Blue Swirl
2011-11-01 20:42     ` Alexander Graf
2011-11-01 23:59     ` [Qemu-devel] [Qemu-ppc] " David Gibson
2011-10-30 20:23 ` [Qemu-devel] [PATCH 19/22] pseries: Correct vmx/dfp handling in both KVM and TCG cases Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 20/22] ppc: Fix up usermode only builds Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 21/22] KVM: PPC: Override host vmx/vsx/dfp only when information known Alexander Graf
2011-10-30 20:23 ` [Qemu-devel] [PATCH 22/22] pseries: Allow writes to KVM accelerated TCE table Alexander Graf
2011-10-31  4:03 ` [Qemu-devel] [PULL 00/22] ppc patch queue 2011-10-30 Alexander Graf
2011-10-31  4:12 ` [Qemu-devel] [PATCH 23/22] ppc: Alter CPU state to mask out TCG unimplemented instructions as appropriate Alexander Graf
2011-10-31  4:12 ` [Qemu-devel] [PATCH 24/22] pseries: Add partial support for PCI Alexander Graf
2011-11-01 21:05 ` [Qemu-devel] [PULL 00/22] ppc patch queue 2011-10-30 Blue Swirl
2011-11-01 21:41   ` Anthony Liguori
2011-11-01 22:14     ` Alexander Graf
2011-11-01 22:16       ` Anthony Liguori
2011-11-01 22:28         ` Alexander Graf
2011-11-01 22:32           ` Anthony Liguori
2011-11-02  0:12           ` [Qemu-devel] [Qemu-ppc] " David Gibson
2011-11-02 19:59     ` [Qemu-devel] " Blue Swirl
2011-11-02 20:11       ` Anthony Liguori
2011-11-02 20:38       ` Alexander Graf

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c2675e1 dfblob:193398b dfblob:6c7ca6f dfblob:fd17d23
dfblob:c298411 dfblob:82f32f4 )
 OR (
bs:"[Qemu-devel] [PATCH 04/22] pseries: Allow KVM Book3S-HV on PPC970 CPUS" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1320006193-15219-5-git-send-email-agraf@suse.de \
    --to=agraf@suse.de \
    --cc=blauwirbel@gmail.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=paulus@samba.org \
    --cc=qemu-devel@nongnu.org \
    --cc=qemu-ppc@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).