From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: qemu-devel@nongnu.org
Cc: Anthony Liguori <aliguori@us.ibm.com>,
aik@ozlabs.ru, Alexander Graf <agraf@suse.de>,
qemu-ppc@nongnu.org, Paul Mackerras <paulus@samba.org>,
David Gibson <david@gibson.dropbear.id.au>
Subject: [Qemu-devel] [PATCH 12/19] pseries: savevm support for pseries machine
Date: Sat, 6 Jul 2013 23:54:09 +1000 [thread overview]
Message-ID: <1373118856-30171-13-git-send-email-aik@ozlabs.ru> (raw)
In-Reply-To: <1373118856-30171-1-git-send-email-aik@ozlabs.ru>
From: David Gibson <david@gibson.dropbear.id.au>

This adds the necessary pieces to implement savevm / migration for the
pseries machine.  The most complex part here is migrating the hash
table: for the paravirtualized pseries machine the guest's hash page
table is not stored within guest memory, but is kept outside the guest
and accessed by the guest via hypercalls.
This patch uses a hypervisor-reserved bit of the HPTE as a dirty bit
(tracking changes to the HPTE itself, not the page it references).
This is used to implement a live-migration-style incremental save and
restore of the hash table contents.
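
To illustrate the idea outside of qemu (a minimal standalone sketch;
the names below, e.g. HPTE_V_DIRTY and hpte_store, are made up for
this example and are not the patch's identifiers):

    #include <stdbool.h>
    #include <stdint.h>

    #define HPTE_V_DIRTY 0x0000000000000040ULL  /* software-only, hypervisor-reserved bit */

    struct hpte {
        uint64_t v;   /* first doubleword: valid bit, AVPN, software bits */
        uint64_t r;   /* second doubleword: real page number, protection */
    };

    /* Any store done on the guest's behalf marks the entry dirty. */
    void hpte_store(struct hpte *hpte, uint64_t v, uint64_t r)
    {
        hpte->r = r;
        hpte->v = v | HPTE_V_DIRTY;
    }

    /* The save loop transmits only entries whose dirty bit is set,
     * clearing it as it goes so later passes resend only new changes. */
    bool hpte_test_and_clear_dirty(struct hpte *hpte)
    {
        if (!(hpte->v & HPTE_V_DIRTY)) {
            return false;
        }
        hpte->v &= ~HPTE_V_DIRTY;
        return true;
    }
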
In addition, it adds a VMStateDescription to save and restore the (few)
remaining pieces of state information needed by the pseries machine.
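
For reference, the "spapr/htab" section that the new handlers emit is
laid out roughly as follows (an informal sketch reconstructed from the
save/load code in the diff below, not a normative format description):

    /*
     * setup section:   be32  htab_shift              (non-zero)
     *
     * each iteration:  be32  0                       (iteration header)
     *                  zero or more chunk records:
     *                      be32  index               (first HPTE slot in chunk)
     *                      be16  n_valid             (entries whose contents follow)
     *                      be16  n_invalid           (entries zeroed on the destination)
     *                      n_valid * HASH_PTE_SIZE_64 bytes of HPTE data
     *                  be32 0, be16 0, be16 0        (end-of-iteration marker)
     */
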
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
hw/ppc/spapr.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++-
hw/ppc/spapr_hcall.c | 8 +-
include/hw/ppc/spapr.h | 12 ++-
3 files changed, 281 insertions(+), 8 deletions(-)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d8f1614..bf348c7 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -32,6 +32,7 @@
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
#include "kvm_ppc.h"
+#include "mmu-hash64.h"
#include "hw/boards.h"
#include "hw/ppc/ppc.h"
@@ -667,7 +668,7 @@ static void spapr_cpu_reset(void *opaque)
     env->spr[SPR_HIOR] = 0;
-    env->external_htab = spapr->htab;
+    env->external_htab = (uint8_t *)spapr->htab;
     env->htab_base = -1;
     env->htab_mask = HTAB_SIZE(spapr) - 1;
     env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
@@ -719,6 +720,268 @@ static int spapr_vga_init(PCIBus *pci_bus)
     }
 }
+static const VMStateDescription vmstate_spapr = {
+    .name = "spapr",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields = (VMStateField []) {
+        VMSTATE_UINT32(next_irq, sPAPREnvironment),
+
+        /* RTC offset */
+        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
+
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+#define HPTE(_table, _i) (void *)(((uint64_t *)(_table)) + ((_i) * 2))
+#define HPTE_VALID(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
+#define HPTE_DIRTY(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
+#define CLEAN_HPTE(_hpte) ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
+
+static int htab_save_setup(QEMUFile *f, void *opaque)
+{
+    sPAPREnvironment *spapr = opaque;
+
+    spapr->htab_save_index = 0;
+    spapr->htab_first_pass = true;
+
+    /* "Iteration" header */
+    qemu_put_be32(f, spapr->htab_shift);
+
+    return 0;
+}
+
+#define MAX_ITERATION_NS 5000000 /* 5 ms */
+
+static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
+                                 int64_t max_ns)
+{
+    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
+    int index = spapr->htab_save_index;
+    int64_t starttime = qemu_get_clock_ns(rt_clock);
+
+    assert(spapr->htab_first_pass);
+
+    do {
+        int chunkstart;
+
+        /* Consume invalid HPTEs */
+        while ((index < htabslots)
+               && !HPTE_VALID(HPTE(spapr->htab, index))) {
+            CLEAN_HPTE(HPTE(spapr->htab, index));
+            index++;
+        }
+
+        /* Consume valid HPTEs */
+        chunkstart = index;
+        while ((index < htabslots)
+               && HPTE_VALID(HPTE(spapr->htab, index))) {
+            CLEAN_HPTE(HPTE(spapr->htab, index));
+            index++;
+        }
+
+        if (index > chunkstart) {
+            int n_valid = index - chunkstart;
+
+            qemu_put_be32(f, chunkstart);
+            qemu_put_be16(f, n_valid);
+            qemu_put_be16(f, 0);
+            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
+                            HASH_PTE_SIZE_64 * n_valid);
+
+            if ((qemu_get_clock_ns(rt_clock) - starttime) > max_ns) {
+                break;
+            }
+        }
+    } while ((index < htabslots) && !qemu_file_rate_limit(f));
+
+    if (index >= htabslots) {
+        assert(index == htabslots);
+        index = 0;
+        spapr->htab_first_pass = false;
+    }
+    spapr->htab_save_index = index;
+}
+
+static bool htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
+                                 int64_t max_ns)
+{
+    bool final = max_ns < 0;
+    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
+    int examined = 0, sent = 0;
+    int index = spapr->htab_save_index;
+    int64_t starttime = qemu_get_clock_ns(rt_clock);
+
+    assert(!spapr->htab_first_pass);
+
+    do {
+        int chunkstart, invalidstart;
+
+        /* Consume non-dirty HPTEs */
+        while ((index < htabslots)
+               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
+            index++;
+            examined++;
+        }
+
+        chunkstart = index;
+        /* Consume valid dirty HPTEs */
+        while ((index < htabslots)
+               && HPTE_DIRTY(HPTE(spapr->htab, index))
+               && HPTE_VALID(HPTE(spapr->htab, index))) {
+            CLEAN_HPTE(HPTE(spapr->htab, index));
+            index++;
+            examined++;
+        }
+
+        invalidstart = index;
+        /* Consume invalid dirty HPTEs */
+        while ((index < htabslots)
+               && HPTE_DIRTY(HPTE(spapr->htab, index))
+               && !HPTE_VALID(HPTE(spapr->htab, index))) {
+            CLEAN_HPTE(HPTE(spapr->htab, index));
+            index++;
+            examined++;
+        }
+
+        if (index > chunkstart) {
+            int n_valid = invalidstart - chunkstart;
+            int n_invalid = index - invalidstart;
+
+            qemu_put_be32(f, chunkstart);
+            qemu_put_be16(f, n_valid);
+            qemu_put_be16(f, n_invalid);
+            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
+                            HASH_PTE_SIZE_64 * n_valid);
+            sent += index - chunkstart;
+
+            if (!final && (qemu_get_clock_ns(rt_clock) - starttime) > max_ns) {
+                break;
+            }
+        }
+
+        if (examined >= htabslots) {
+            break;
+        }
+
+        if (index >= htabslots) {
+            assert(index == htabslots);
+            index = 0;
+        }
+    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
+
+    if (index >= htabslots) {
+        assert(index == htabslots);
+        index = 0;
+    }
+
+    spapr->htab_save_index = index;
+
+    return (examined >= htabslots) && (sent == 0);
+}
+
+static int htab_save_iterate(QEMUFile *f, void *opaque)
+{
+    sPAPREnvironment *spapr = opaque;
+    bool nothingleft = false;
+
+    /* Iteration header */
+    qemu_put_be32(f, 0);
+
+    if (spapr->htab_first_pass) {
+        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
+    } else {
+        nothingleft = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
+    }
+
+    /* End marker */
+    qemu_put_be32(f, 0);
+    qemu_put_be16(f, 0);
+    qemu_put_be16(f, 0);
+
+    return nothingleft ? 1 : 0;
+}
+
+static int htab_save_complete(QEMUFile *f, void *opaque)
+{
+    sPAPREnvironment *spapr = opaque;
+
+    /* Iteration header */
+    qemu_put_be32(f, 0);
+
+    htab_save_later_pass(f, spapr, -1);
+
+    /* End marker */
+    qemu_put_be32(f, 0);
+    qemu_put_be16(f, 0);
+    qemu_put_be16(f, 0);
+
+    return 0;
+}
+
+static int htab_load(QEMUFile *f, void *opaque, int version_id)
+{
+    sPAPREnvironment *spapr = opaque;
+    uint32_t section_hdr;
+
+    if (version_id < 1 || version_id > 1) {
+        fprintf(stderr, "htab_load() bad version\n");
+        return -EINVAL;
+    }
+
+    section_hdr = qemu_get_be32(f);
+
+    if (section_hdr) {
+        /* First section, just the hash shift */
+        if (spapr->htab_shift != section_hdr) {
+            return -EINVAL;
+        }
+        return 0;
+    }
+
+    while (true) {
+        uint32_t index;
+        uint16_t n_valid, n_invalid;
+
+        index = qemu_get_be32(f);
+        n_valid = qemu_get_be16(f);
+        n_invalid = qemu_get_be16(f);
+
+        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
+            /* End of Stream */
+            break;
+        }
+
+        if ((index + n_valid + n_invalid) >=
+            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
+            /* Bad index in stream */
+            fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) "
+                    "in htab stream\n", index, n_valid, n_invalid);
+            return -EINVAL;
+        }
+
+        if (n_valid) {
+            qemu_get_buffer(f, HPTE(spapr->htab, index),
+                            HASH_PTE_SIZE_64 * n_valid);
+        }
+        if (n_invalid) {
+            memset(HPTE(spapr->htab, index + n_valid), 0,
+                   HASH_PTE_SIZE_64 * n_invalid);
+        }
+    }
+
+    return 0;
+}
+
+static SaveVMHandlers savevm_htab_handlers = {
+    .save_live_setup = htab_save_setup,
+    .save_live_iterate = htab_save_iterate,
+    .save_live_complete = htab_save_complete,
+    .load_state = htab_load,
+};
+
 static struct icp_state *try_create_xics(const char *type, int nr_servers,
                                          int nr_irqs)
 {
@@ -987,6 +1250,10 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
     spapr->entry_point = 0x100;
+    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
+    register_savevm_live(NULL, "spapr/htab", -1, 1,
+                         &savevm_htab_handlers, spapr);
+
     /* Prepare the device tree */
     spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
                                             initrd_base, initrd_size,
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index e6f321d..7ca984e 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -115,7 +115,7 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     }
     ppc_hash64_store_hpte1(env, hpte, ptel);
     /* eieio(); FIXME: need some sort of barrier for smp? */
-    ppc_hash64_store_hpte0(env, hpte, pteh);
+    ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
     args[0] = pte_index + i;
     return H_SUCCESS;
@@ -152,7 +152,7 @@ static target_ulong remove_hpte(CPUPPCState *env, target_ulong ptex,
     }
     *vp = v;
     *rp = r;
-    ppc_hash64_store_hpte0(env, hpte, 0);
+    ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY);
     rb = compute_tlbie_rb(v, r, ptex);
     ppc_tlb_invalidate_one(env, rb);
     return REMOVE_SUCCESS;
@@ -282,11 +282,11 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     r |= (flags << 48) & HPTE64_R_KEY_HI;
     r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
     rb = compute_tlbie_rb(v, r, pte_index);
-    ppc_hash64_store_hpte0(env, hpte, v & ~HPTE64_V_VALID);
+    ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY);
     ppc_tlb_invalidate_one(env, rb);
     ppc_hash64_store_hpte1(env, hpte, r);
     /* Don't need a memory barrier, due to qemu's global lock */
-    ppc_hash64_store_hpte0(env, hpte, v);
+    ppc_hash64_store_hpte0(env, hpte, v | HPTE64_V_HPTE_DIRTY);
     return H_SUCCESS;
 }
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 09c4570..4cfe449 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -9,6 +9,8 @@ struct sPAPRPHBState;
 struct sPAPRNVRAM;
 struct icp_state;
+#define HPTE64_V_HPTE_DIRTY 0x0000000000000040ULL
+
 typedef struct sPAPREnvironment {
     struct VIOsPAPRBus *vio_bus;
     QLIST_HEAD(, sPAPRPHBState) phbs;
@@ -17,20 +19,24 @@ typedef struct sPAPREnvironment {
     hwaddr ram_limit;
     void *htab;
-    long htab_shift;
+    uint32_t htab_shift;
     hwaddr rma_size;
     int vrma_adjust;
     hwaddr fdt_addr, rtas_addr;
     long rtas_size;
     void *fdt_skel;
     target_ulong entry_point;
-    int next_irq;
-    int rtc_offset;
+    uint32_t next_irq;
+    uint64_t rtc_offset;
     char *cpu_model;
     bool has_graphics;
     uint32_t epow_irq;
     Notifier epow_notifier;
+
+    /* Migration state */
+    int htab_save_index;
+    bool htab_first_pass;
 } sPAPREnvironment;
#define H_SUCCESS 0
--
1.8.3.2
Thread overview: 30+ messages
2013-07-06 13:53 [Qemu-devel] [PATCH 00/19 v4] spapr: migration, pci, msi, power8 Alexey Kardashevskiy
2013-07-06 13:53 ` [Qemu-devel] [PATCH 01/19] pseries: move interrupt controllers to hw/intc/ Alexey Kardashevskiy
2013-07-06 13:53 ` [Qemu-devel] [PATCH 02/19] pseries: rework XICS Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 03/19] savevm: Implement VMS_DIVIDE flag Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 04/19] target-ppc: Convert ppc cpu savevm to VMStateDescription Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 05/19] pseries: savevm support for XICS interrupt controller Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 06/19] pseries: savevm support for VIO devices Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 07/19] pseries: savevm support for PAPR VIO logical lan Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 08/19] pseries: savevm support for PAPR VIO logical tty Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 09/19] pseries: savevm support for PAPR TCE tables Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 10/19] pseries: rework PAPR virtual SCSI Alexey Kardashevskiy
2013-07-08 11:57 ` [Qemu-devel] [PATCH v2] " Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 11/19] pseries: savevm support for " Alexey Kardashevskiy
2013-07-06 13:54 ` Alexey Kardashevskiy [this message]
2013-07-06 13:54 ` [Qemu-devel] [PATCH 13/19] pseries: savevm support for PCI host bridge Alexey Kardashevskiy
2013-07-07 23:33 ` David Gibson
2013-07-06 13:54 ` [Qemu-devel] [PATCH 14/19] target-ppc: Add helper for KVM_PPC_RTAS_DEFINE_TOKEN Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 15/19] pseries: Support for in-kernel XICS interrupt controller Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 16/19] pseries: savevm support with KVM Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 17/19] target-ppc: Add POWER8 v1.0 CPU model Alexey Kardashevskiy
2013-07-06 13:54 ` [Qemu-devel] [PATCH 18/19] target-ppc: Enhance the CPU node labels for the guest device tree for pseries Alexey Kardashevskiy
2013-07-08 1:09 ` David Gibson
2013-07-08 9:02 ` Andreas Färber
2013-07-08 15:49 ` Prerna Saxena
2013-07-08 16:45 ` Andreas Färber
2013-07-10 6:38 ` Prerna Saxena
2013-07-10 9:11 ` Andreas Färber
2013-07-08 15:45 ` [Qemu-devel] [PATCH v2 " Prerna Saxena
2013-07-06 13:54 ` [Qemu-devel] [PATCH 19/19] spapr-pci: rework MSI/MSIX Alexey Kardashevskiy
2013-07-29 20:24 ` [Qemu-devel] [PATCH 00/19 v4] spapr: migration, pci, msi, power8 Anthony Liguori