* [Qemu-devel] [PATCH RFC 1/2] migration: implement checkpoint loading
2015-11-24 16:42 [Qemu-devel] [RFC V2, PATCH 0/2] Checkpoint-assisted migration proposal Bohdan Trach
@ 2015-11-24 16:42 ` Bohdan Trach
2015-11-24 16:42 ` [Qemu-devel] [PATCH RFC 2/2] migration: use checkpoint during migration Bohdan Trach
2015-11-24 17:51 ` [Qemu-devel] [RFC V2, PATCH 0/2] Checkpoint-assisted migration proposal Eric Blake
2 siblings, 0 replies; 5+ messages in thread
From: Bohdan Trach @ 2015-11-24 16:42 UTC (permalink / raw)
To: qemu-devel; +Cc: Bohdan Trach, amit.shah, thomas.knauth, quintela
This commit adds functions used to open the checkpoint saved by the
dump-guest-memory command and populate the hash table used by the
checkpoint-assisted migration mechanism. SHA256 is used to checksum
the pages. Only the ELF memory dump format is supported at the moment.
Signed-off-by: Bohdan Trach <bohdan.trach@mailbox.tu-dresden.de>
---
include/migration/migration.h | 4 ++
migration/ram.c | 157 ++++++++++++++++++++++++++++++++++++++++++
qemu-options.hx | 9 +++
trace-events | 3 +
vl.c | 9 +++
5 files changed, 182 insertions(+)
diff --git a/include/migration/migration.h b/include/migration/migration.h
index fd018b7..4904c85 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -321,4 +321,8 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
PostcopyState postcopy_state_get(void);
/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state);
+
+void allocate_checksum_table(void);
+void init_checksum_lookup_table(const char *checkpoint_path);
+extern const char *checkpoint_path;
#endif
diff --git a/migration/ram.c b/migration/ram.c
index 1eb155a..379a381 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -27,6 +27,7 @@
*/
#include <stdint.h>
#include <zlib.h>
+#include <elf.h>
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
@@ -39,6 +40,7 @@
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
+#include "crypto/hash.h"
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
@@ -48,6 +50,159 @@
do { } while (0)
#endif
+/* Size in bytes of one SHA-256 digest. */
+#define SHA256_DIGEST_LENGTH 32
+/* Descriptor of the checkpoint file; remains -1 until
+ * init_checksum_lookup_table() opens one. */
+static int fd_checkpoint = -1;
+/*
+ * 'hashes' is a flat array of SHA256_DIGEST_LENGTH-byte digests sized
+ * for one digest per guest page: allocate_checksum_table() sets
+ * hashes_entries to the page count and hashes_size to the array size in
+ * bytes.  ram_save_setup() sorts the array by digest value.
+ */
+/* indexed by page number */
+static uint64_t hashes_size = 0;
+static uint64_t hashes_entries = 0;
+static uint8_t *hashes = 0;
+
+/* One page of the checkpoint file: its digest and where it lives. */
+typedef struct {
+ /* SHA-256 of the page content read from the checkpoint */
+ uint8_t hash[SHA256_DIGEST_LENGTH];
+ /* byte offset of that page inside the checkpoint file */
+ uint64_t offset;
+} hash_offset_entry;
+
+/*
+ * hash_offset_array is filled by init_checksum_lookup_table() and then
+ * sorted by digest; hash_offset_entries counts the filled slots and
+ * max_hash_offset_entries is the allocated capacity.
+ */
+static uint64_t hash_offset_entries = 0;
+static uint64_t max_hash_offset_entries;
+static hash_offset_entry* hash_offset_array = 0;
+/* Digest of a page of zero bytes, computed in allocate_checksum_table(). */
+static uint8_t all_zeroes_hash[SHA256_DIGEST_LENGTH];
+
+/*
+ * Compute the SHA-256 digest of the @data_len bytes at @data and copy
+ * the SHA256_DIGEST_LENGTH-byte result into @digest.
+ *
+ * The input is only read, hence the const qualifier.  A failing hash
+ * backend is treated as fatal: the status of qcrypto_hash_bytes() is
+ * checked explicitly instead of relying on the result length alone.
+ */
+static inline void SHA256(const void *data, size_t data_len, void *digest)
+{
+    uint8_t *out = NULL;
+    size_t rlen = 0;
+    int rc;
+
+    rc = qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA256, data, data_len,
+                            &out, &rlen, NULL);
+    assert(rc == 0);
+    assert(rlen == SHA256_DIGEST_LENGTH);
+    memcpy(digest, out, rlen);
+    /* qcrypto_hash_bytes() hands back a heap buffer owned by the caller */
+    g_free(out);
+}
+
+/*
+ * Render a 32-byte SHA-256 digest as 64 lowercase hexadecimal
+ * characters.  The text lives in a static buffer that is overwritten
+ * by the next call, so the result must be consumed before calling
+ * again.
+ */
+static char* sha256s(const uint8_t *digest) {
+    /* two hex characters per digest byte, plus the trailing \0 */
+    static char text[2 * 32 + 1];
+    size_t byte;
+
+    for (byte = 0; byte < 32; byte++) {
+        snprintf(&text[2 * byte], 3, "%02x", digest[byte]);
+    }
+
+    text[sizeof(text) - 1] = '\0';
+    return text;
+}
+
+/*
+ * qsort()/bsearch() comparator: orders two raw SHA-256 digests by a
+ * bytewise comparison of their SHA256_DIGEST_LENGTH bytes.
+ */
+static int uint256_compare(const void* x, const void* y)
+{
+    const uint8_t *lhs = x;
+    const uint8_t *rhs = y;
+    int order = memcmp(lhs, rhs, SHA256_DIGEST_LENGTH);
+
+    return order;
+}
+
+/*
+ * qsort()/bsearch() comparator for hash_offset_entry: entries are
+ * ordered by their digest only, the file offset is not considered.
+ *
+ * Because 'hash' is the first member of hash_offset_entry, a bare
+ * digest may also be passed as the bsearch() key.
+ */
+static int cmp_hash_offset_entry(const void* a, const void* b) {
+    /* keep the const qualifier instead of casting it away */
+    const hash_offset_entry *e = a;
+    const hash_offset_entry *f = b;
+
+    return memcmp(e->hash, f->hash, SHA256_DIGEST_LENGTH);
+}
+
+/*
+ * Size and allocate the two tables used by checkpoint-assisted
+ * migration, based on the total used length of all RAM blocks:
+ *   - hashes:            one digest slot per guest page
+ *   - hash_offset_array: one (digest, file offset) slot per guest page
+ * and precompute all_zeroes_hash, the digest of an all-zero page.
+ *
+ * On allocation failure an error is reported and the corresponding
+ * entry counts are left at zero, so that later users never index into
+ * a NULL table.
+ */
+void allocate_checksum_table(void) {
+    RAMBlock *block;
+    size_t sz = 0;
+    uint64_t npages;
+    uint8_t all_zeroes[TARGET_PAGE_SIZE];
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        sz += block->used_length;
+    }
+
+    npages = sz / TARGET_PAGE_SIZE;
+    trace_allocate_checksum_table(npages);
+
+    memset(all_zeroes, 0, sizeof(all_zeroes));
+    SHA256(all_zeroes, TARGET_PAGE_SIZE, all_zeroes_hash);
+
+    hashes_entries = 0;
+    hashes_size = 0;
+    max_hash_offset_entries = 0;
+
+    if (npages == 0) {
+        /* A zero-byte g_try_malloc0() yields NULL; that is not an error. */
+        return;
+    }
+
+    hashes = g_try_malloc0(npages * SHA256_DIGEST_LENGTH);
+    if (!hashes) {
+        error_report("Error allocating hashes");
+        return;
+    }
+    hashes_entries = npages;
+    hashes_size = npages * SHA256_DIGEST_LENGTH;
+
+    hash_offset_array = g_try_malloc0(npages * sizeof(hash_offset_entry));
+    if (!hash_offset_array) {
+        error_report("Error allocating hash_offset_array");
+        return;
+    }
+    max_hash_offset_entries = npages;
+}
+
+/*
+ * Position @f at the start of the dumped guest memory of a 64-bit ELF
+ * dump and return the resulting file offset, i.e. the value returned
+ * by lseek() ((off_t)-1 if the seek fails).
+ *
+ * Expects @f to be positioned at offset zero.  Only the program header
+ * read directly after the ELF header is consulted:
+ * phdr.p_offset + phdr.p_memsz is the beginning of the dumped memory.
+ */
+static off_t seek_elf64(int f)
+{
+    Elf64_Ehdr elf;
+    Elf64_Phdr phdr;
+    ssize_t got;
+
+    /* Never call read() inside assert(): NDEBUG would drop the I/O. */
+    got = read(f, &elf, sizeof(elf));
+    assert(got == (ssize_t)sizeof(elf));
+    got = read(f, &phdr, sizeof(phdr));
+    assert(got == (ssize_t)sizeof(phdr));
+
+    return lseek(f, phdr.p_offset + phdr.p_memsz, SEEK_SET);
+}
+
+/*
+ * 32-bit counterpart of the ELF64 helper: position @f at the start of
+ * the dumped guest memory of a 32-bit ELF dump and return the resulting
+ * file offset, i.e. the value returned by lseek() ((off_t)-1 if the
+ * seek fails).
+ *
+ * Expects @f to be positioned at offset zero.  Only the program header
+ * read directly after the ELF header is consulted; the memory starts at
+ * phdr.p_offset + phdr.p_memsz.
+ */
+static off_t seek_elf32(int f)
+{
+    Elf32_Ehdr elf;
+    Elf32_Phdr phdr;
+    ssize_t got;
+
+    /* Never call read() inside assert(): NDEBUG would drop the I/O. */
+    got = read(f, &elf, sizeof(elf));
+    assert(got == (ssize_t)sizeof(elf));
+    got = read(f, &phdr, sizeof(phdr));
+    assert(got == (ssize_t)sizeof(phdr));
+
+    return lseek(f, phdr.p_offset + phdr.p_memsz, SEEK_SET);
+}
+
+/*
+ * Position @checkpoint_fd at the first byte of dumped guest memory and
+ * return that file offset.
+ *
+ * The ELF identification bytes are read to choose between the 64-bit
+ * and the 32-bit header layout; any class other than ELFCLASS64 is
+ * handled as 32-bit.  The descriptor is rewound before delegating
+ * because the seek_elf* helpers expect to start at offset zero.
+ */
+static off_t seek_to_memory(int checkpoint_fd)
+{
+    char ident[16];
+    ssize_t got;
+    off_t rewound;
+
+    /* Never call read() inside assert(): NDEBUG would drop the I/O. */
+    got = read(checkpoint_fd, ident, sizeof(ident));
+    assert(got == (ssize_t)sizeof(ident));
+
+    /* seek_elf* expect zero offset */
+    rewound = lseek(checkpoint_fd, 0, SEEK_SET);
+    assert(rewound == 0);
+
+    if (ident[EI_CLASS] == ELFCLASS64) {
+        return seek_elf64(checkpoint_fd);
+    }
+    return seek_elf32(checkpoint_fd);
+}
+
+/*
+ * Build the digest -> file offset lookup table from the checkpoint at
+ * @checkpoint_path.
+ *
+ * Every TARGET_PAGE_SIZE chunk between the start of the dumped memory
+ * and the end of the file is hashed and recorded in hash_offset_array,
+ * which is finally sorted by digest so it can be searched with
+ * bsearch().  The file stays open in fd_checkpoint for later page
+ * reads.
+ *
+ * A missing file (ENOENT) is not an error: the function returns
+ * without opening anything and fd_checkpoint keeps its previous value.
+ * Any other failure trips an assertion.
+ */
+void init_checksum_lookup_table(const char *checkpoint_path)
+{
+    ssize_t rc;
+    uint8_t *pg;
+    struct stat sb;
+    off_t start;
+    uint64_t idx;
+
+    trace_init_checksum_lookup_table_start(ram_size);
+
+    rc = stat(checkpoint_path, &sb);
+    if (rc == -1 && errno == ENOENT) {
+        return;
+    }
+    assert(rc == 0);
+
+    fd_checkpoint = qemu_open(checkpoint_path, O_RDONLY);
+    assert(fd_checkpoint != -1);
+
+    /*
+     * A failed seek returns -1; stored in the unsigned loop index it
+     * would compare above the file size and silently skip the load.
+     */
+    start = seek_to_memory(fd_checkpoint);
+    assert(start != (off_t)-1);
+
+    /* Catch a table that was never allocated before writing into it. */
+    assert(hash_offset_array != NULL);
+
+    pg = g_malloc0(TARGET_PAGE_SIZE);
+
+    for (idx = (uint64_t)start; idx < (uint64_t)sb.st_size;
+         idx += TARGET_PAGE_SIZE) {
+        hash_offset_entry *slot;
+
+        rc = read(fd_checkpoint, pg, TARGET_PAGE_SIZE);
+        assert(rc == TARGET_PAGE_SIZE);
+        assert(hash_offset_entries < max_hash_offset_entries);
+
+        slot = &hash_offset_array[hash_offset_entries];
+        SHA256(pg, TARGET_PAGE_SIZE, slot->hash);
+        slot->offset = idx;
+        trace_init_checksum_lookup_table_hash(sha256s(slot->hash),
+                                              slot->offset);
+        hash_offset_entries++;
+    }
+
+    qsort(hash_offset_array, hash_offset_entries, sizeof(hash_offset_entry),
+          cmp_hash_offset_entry);
+    g_free(pg);
+}
static int dirty_rate_high_cnt;
static uint64_t bitmap_sync_count;
@@ -1874,6 +2029,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
migration_bitmap_sync_init();
qemu_mutex_init(&migration_bitmap_mutex);
+ qsort(hashes, hashes_entries, SHA256_DIGEST_LENGTH, uint256_compare);
+
if (migrate_use_xbzrle()) {
XBZRLE_cache_lock();
XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
diff --git a/qemu-options.hx b/qemu-options.hx
index 0eea4ee..1913375 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3557,6 +3557,15 @@ Dump json-encoded vmstate information for current machine type to file
in @var{file}
ETEXI
+DEF("checkpoint", HAS_ARG, QEMU_OPTION_checkpoint,
+ "-checkpoint file path to checkpoint file\n", QEMU_ARCH_ALL)
+STEXI
+@item -checkpoint @var{path}
+@findex -checkpoint
+Checkpoint file to use during incoming migrations. Reduces network
+traffic and total migration time.
+ETEXI
+
DEFHEADING(Generic object creation)
DEF("object", HAS_ARG, QEMU_OPTION_object,
diff --git a/trace-events b/trace-events
index 0b0ff02..eee060b 100644
--- a/trace-events
+++ b/trace-events
@@ -1264,6 +1264,9 @@ migration_throttle(void) ""
ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
ram_postcopy_send_discard_bitmap(void) ""
ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx"
+allocate_checksum_table(uint64_t npages) "pages=%" PRIu64
+init_checksum_lookup_table_start(uint64_t ram_size) "ram_size=%" PRIu64
+init_checksum_lookup_table_hash(const char* hash, uint64_t offset) "hash=%s offset=%" PRIu64
# hw/display/qxl.c
disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
diff --git a/vl.c b/vl.c
index 525929b..2dfac86 100644
--- a/vl.c
+++ b/vl.c
@@ -138,6 +138,7 @@ int display_opengl;
static int display_remote;
const char* keyboard_layout = NULL;
ram_addr_t ram_size;
+const char *checkpoint_path = NULL;
const char *mem_path = NULL;
int mem_prealloc = 0; /* force preallocation of physical target memory */
bool enable_mlock = false;
@@ -3355,6 +3356,9 @@ int main(int argc, char **argv, char **envp)
}
break;
#endif
+ case QEMU_OPTION_checkpoint:
+ checkpoint_path = optarg;
+ break;
case QEMU_OPTION_mempath:
mem_path = optarg;
break;
@@ -4653,6 +4657,7 @@ int main(int argc, char **argv, char **envp)
}
}
+ allocate_checksum_table();
qdev_prop_check_globals();
if (vmstate_dump_file) {
/* dump and exit */
@@ -4662,6 +4667,10 @@ int main(int argc, char **argv, char **envp)
if (incoming) {
Error *local_err = NULL;
+ if (checkpoint_path) {
+ init_checksum_lookup_table(checkpoint_path);
+ }
+
qemu_start_incoming_migration(incoming, &local_err);
if (local_err) {
error_report("-incoming %s: %s", incoming,
--
2.4.10
^ permalink raw reply related [flat|nested] 5+ messages in thread* [Qemu-devel] [PATCH RFC 2/2] migration: use checkpoint during migration
2015-11-24 16:42 [Qemu-devel] [RFC V2, PATCH 0/2] Checkpoint-assisted migration proposal Bohdan Trach
2015-11-24 16:42 ` [Qemu-devel] [PATCH RFC 1/2] migration: implement checkpoint loading Bohdan Trach
@ 2015-11-24 16:42 ` Bohdan Trach
2015-11-24 17:51 ` [Qemu-devel] [RFC V2, PATCH 0/2] Checkpoint-assisted migration proposal Eric Blake
2 siblings, 0 replies; 5+ messages in thread
From: Bohdan Trach @ 2015-11-24 16:42 UTC (permalink / raw)
To: qemu-devel; +Cc: Bohdan Trach, amit.shah, thomas.knauth, quintela
Extend memory page saving and loading functions to utilize information
available in checkpoints to avoid sending full pages over the network.
Signed-off-by: Bohdan Trach <bohdan.trach@mailbox.tu-dresden.de>
---
migration/ram.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
trace-events | 5 +++
2 files changed, 127 insertions(+), 9 deletions(-)
diff --git a/migration/ram.c b/migration/ram.c
index 379a381..79cb143 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -203,6 +203,16 @@ void init_checksum_lookup_table(const char *checkpoint_path)
cmp_hash_offset_entry);
g_free(pg);
}
+
+/*
+ * Report whether a checkpoint file is open: returns 1 when
+ * fd_checkpoint holds a descriptor and 0 while it is still -1.
+ */
+static int is_outgoing_with_checkpoint(void) {
+    if (fd_checkpoint == -1) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Convert a page-aligned address into its page number.
+ *
+ * @addr must be a multiple of TARGET_PAGE_SIZE (asserted).  The result
+ * is 64 bits wide so that no bits of a large address are lost; the
+ * caller already stores it in a uint64_t.
+ */
+static uint64_t get_page_nr(uint64_t addr) {
+    assert((addr % TARGET_PAGE_SIZE) == 0);
+    return addr / TARGET_PAGE_SIZE;
+}
+
static int dirty_rate_high_cnt;
static uint64_t bitmap_sync_count;
@@ -219,6 +229,7 @@ static uint64_t bitmap_sync_count;
#define RAM_SAVE_FLAG_XBZRLE 0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
+#define RAM_SAVE_FLAG_HASH 0x200
static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
@@ -887,6 +898,7 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
uint8_t *p;
int ret;
bool send_async = true;
+ uint8_t hash[SHA256_DIGEST_LENGTH];
p = block->host + offset;
@@ -935,16 +947,32 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
/* XBZRLE overflow or normal page */
if (pages == -1) {
- *bytes_transferred += save_page_header(f, block,
- offset | RAM_SAVE_FLAG_PAGE);
- if (send_async) {
- qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
- } else {
- qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+ if (is_outgoing_with_checkpoint()) {
+ SHA256(p, TARGET_PAGE_SIZE, hash);
+
+ if (bsearch(hash, hashes, hashes_entries,
+ SHA256_DIGEST_LENGTH, uint256_compare) != NULL) {
+
+ *bytes_transferred += save_page_header(f, block, offset | RAM_SAVE_FLAG_HASH);
+ qemu_put_buffer(f, hash, SHA256_DIGEST_LENGTH);
+ *bytes_transferred += SHA256_DIGEST_LENGTH;
+ pages = 1;
+ trace_ram_load_send_hash(offset&TARGET_PAGE_MASK, (offset | RAM_SAVE_FLAG_HASH)& ~TARGET_PAGE_MASK, sha256s(hash));
+ }
+ }
+ if (pages == -1) {
+ *bytes_transferred += save_page_header(f, block,
+ offset | RAM_SAVE_FLAG_PAGE);
+ if (send_async) {
+ qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+ } else {
+ qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+ }
+ *bytes_transferred += TARGET_PAGE_SIZE;
+ pages = 1;
+ acct_info.norm_pages++;
+ trace_ram_load_send_page(offset&TARGET_PAGE_MASK, (offset | RAM_SAVE_FLAG_PAGE)& ~TARGET_PAGE_MASK);
}
- *bytes_transferred += TARGET_PAGE_SIZE;
- pages = 1;
- acct_info.norm_pages++;
}
XBZRLE_cache_unlock();
@@ -2540,6 +2568,58 @@ static int ram_load_postcopy(QEMUFile *f)
return ret;
}
+/**
+ * If the migration source determined we already have the page, it only
+ * sends a hash of the page's content.  Make sure @host holds that
+ * content: when the bytes currently at @host do not hash to @hash, the
+ * page is looked up by digest in hash_offset_array and read from the
+ * checkpoint file (fd_checkpoint), then verified again.
+ *
+ * @param host Destination buffer of @size bytes; on return its SHA-256 digest equals @hash.
+ * @param guest_phy_addr Address taken from the migration stream; used for tracing only.
+ * @param hash The SHA256_DIGEST_LENGTH-byte hash value sent by the source.
+ * @param size Size of the memory region in bytes. Typically, size is a single page, e.g., 4 KiB.
+ *
+ * Aborts if no checkpoint is open, if the digest is unknown, or if the
+ * data read back does not match the digest.
+ */
+static void ram_handle_hash(void *host, uint64_t guest_phy_addr, uint8_t *hash, uint64_t size)
+{
+    uint8_t local_page_hash[SHA256_DIGEST_LENGTH];
+    hash_offset_entry *v;
+    off_t offset_actual;
+    ssize_t read_actual;
+
+    assert(fd_checkpoint != -1);
+
+    SHA256(host, size, local_page_hash);
+    if (0 == memcmp(local_page_hash, hash, SHA256_DIGEST_LENGTH)) {
+        /* The destination page already has the expected content. */
+        return;
+    }
+
+    /* Computed hash does not match the hash the migration source
+       sent us for this page. */
+    v = bsearch(hash, hash_offset_array, hash_offset_entries,
+                sizeof(hash_offset_entry), cmp_hash_offset_entry);
+    if (v == NULL) {
+        /* For some reason the source thought the destination
+           already has this block. But it doesn't. */
+        trace_ram_handle_hash_unknown(sha256s(hash), guest_phy_addr);
+        /* abort() rather than assert(0): must not fall through to the
+           v->offset dereference below when NDEBUG is defined. */
+        abort();
+    }
+
+    trace_ram_handle_hash(guest_phy_addr, sha256s(hash), v->offset);
+
+    offset_actual = lseek(fd_checkpoint, v->offset, SEEK_SET);
+    assert(offset_actual == (off_t)v->offset);
+
+    read_actual = read(fd_checkpoint, host, size);
+    assert(read_actual == (ssize_t)size);
+
+    /* Re-hash what was read so a stale or damaged checkpoint is caught. */
+    SHA256(host, size, local_page_hash);
+    if (0 != memcmp(local_page_hash, hash, SHA256_DIGEST_LENGTH)) {
+        trace_ram_handle_hash_mismatch(sha256s(local_page_hash));
+        abort();
+    }
+}
+
+/*
+ * Record @hash (SHA256_DIGEST_LENGTH bytes) as the digest of the page
+ * at @addr, in the slot of the 'hashes' table selected by the page
+ * number of @addr.
+ *
+ * @addr originates from the migration stream, so the slot index is
+ * validated first: nothing is stored when the table was never
+ * allocated or the page number lies outside of it.
+ */
+static void add_remote_hash(ram_addr_t addr, uint8_t *hash) {
+    uint64_t page_nr = get_page_nr(addr);
+
+    if (hashes == NULL || page_nr >= hashes_entries) {
+        return;
+    }
+
+    memcpy(&hashes[page_nr * SHA256_DIGEST_LENGTH],
+           hash,
+           SHA256_DIGEST_LENGTH);
+}
+
+
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
int flags = 0, ret = 0;
@@ -2572,6 +2652,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
ram_addr_t addr, total_ram_bytes;
void *host = NULL;
uint8_t ch;
+ uint8_t hash[SHA256_DIGEST_LENGTH];
addr = qemu_get_be64(f);
flags = addr & ~TARGET_PAGE_MASK;
@@ -2627,10 +2708,34 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
case RAM_SAVE_FLAG_COMPRESS:
ch = qemu_get_byte(f);
ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
+ if (fd_checkpoint != -1) {
+ if (ch != 0) {
+ SHA256(host, TARGET_PAGE_SIZE, hash);
+ add_remote_hash(addr, hash);
+ } else {
+ add_remote_hash(addr, all_zeroes_hash);
+ }
+ }
break;
+ case RAM_SAVE_FLAG_HASH:
+ host = host_from_stream_offset(f, addr, flags);
+ if (!host) {
+ error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+ ret = -EINVAL;
+ break;
+ }
+ qemu_get_buffer(f, hash, SHA256_DIGEST_LENGTH);
+
+ ram_handle_hash(host, addr, hash, TARGET_PAGE_SIZE);
+ add_remote_hash(addr, hash);
+ break;
case RAM_SAVE_FLAG_PAGE:
qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
+ if (fd_checkpoint != -1) {
+ SHA256(host, TARGET_PAGE_SIZE, hash);
+ add_remote_hash(addr, hash);
+ }
break;
case RAM_SAVE_FLAG_COMPRESS_PAGE:
@@ -2642,6 +2747,10 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
}
qemu_get_buffer(f, compressed_data_buf, len);
decompress_data_with_multi_threads(compressed_data_buf, host, len);
+ if (fd_checkpoint != -1) {
+ SHA256(host, TARGET_PAGE_SIZE, hash);
+ add_remote_hash(addr, hash);
+ }
break;
case RAM_SAVE_FLAG_XBZRLE:
@@ -2651,6 +2760,10 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
ret = -EINVAL;
break;
}
+ if (fd_checkpoint != -1) {
+ SHA256(host, TARGET_PAGE_SIZE, hash);
+ add_remote_hash(addr, hash);
+ }
break;
case RAM_SAVE_FLAG_EOS:
/* normal exit */
diff --git a/trace-events b/trace-events
index eee060b..e543821 100644
--- a/trace-events
+++ b/trace-events
@@ -1267,6 +1267,11 @@ ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %
allocate_checksum_table(uint64_t npages) "pages=%" PRIu64
init_checksum_lookup_table_start(uint64_t ram_size) "ram_size=%" PRIu64
init_checksum_lookup_table_hash(const char* hash, uint64_t offset) "hash=%s offset=%" PRIu64
+ram_load_send_hash(uint64_t addr, uint64_t flags, const char *hash) "addr=%" PRIx64 " flags=%" PRIx64 " hash=%s"
+ram_load_send_page(uint64_t addr, uint64_t flags) "addr=%" PRIx64 " flags=%" PRIx64
+ram_handle_hash(uint64_t addr, const char *hash, uint64_t offset) "addr=%" PRIx64 " hash=%s offset=%" PRIx64
+ram_handle_hash_mismatch(const char *hash) "mismatch: hash=%s"
+ram_handle_hash_unknown(const char *hash, uint64_t offset) "unknown hash %s at guest addr %08" PRIx64
# hw/display/qxl.c
disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
--
2.4.10
^ permalink raw reply related [flat|nested] 5+ messages in thread