[Qemu-devel] [PATCH 0/10] Live migration for QEMU

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH 0/10] Live migration for QEMU
@ 2008-09-09 19:49 Anthony Liguori
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
                   ` (10 more replies)
  0 siblings, 11 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This series adds live migration support to QEMU.  It's inspired by the
implementation of live migration in KVM, but at this point, is almost
a full rewrite.  Uri Lublin did a large amount of the work on the live
migration implementation in KVM.

This patch series is not yet ready to apply.  There are a few FIXMEs
and I have to add back support for restoring v2 saved images.  I wanted
to get these patches out on the list though for review since it's a rather
large series.

Live migration will work with any target that supports save/restore.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-10 13:25   ` Chris Lalancette
                     ` (2 more replies)
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op Anthony Liguori
                   ` (9 subsequent siblings)
  10 siblings, 3 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

To support live migration, we override QEMUFile so that instead of writing to
disk, the save/restore state happens over a network connection.

This patch makes QEMUFile read/write operations function pointers so that we
can override them for live migration.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/hw/hw.h b/hw/hw.h
index b84ace0..771dbd4 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -7,9 +7,36 @@
 
 /* VM Load/Save */
 
+/* This function writes a chunk of data to a file at the given position.
+ * The pos argument can be ignored if the file is only being used for
+ * streaming.  The handler should try to write all of the data it can.
+ */
+typedef void (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
+                                     int64_t pos, int size);
+
+/* Read a chunk of data from a file at the given position.  The pos argument
+ * can be ignored if the file is only be used for streaming.  The number of
+ * bytes actually read should be returned.
+ */
+typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
+                                    int64_t pos, int size);
+
+/* Close a file and return an error code */
+typedef int (QEMUFileCloseFunc)(void *opaque);
+
+/* Called to determine if the file has exceeded it's bandwidth allocation.  The
+ * bandwidth capping is a soft limit, not a hard limit.
+ */
+typedef int (QEMUFileRateLimit)(void *opaque);
+
+QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
+			 QEMUFileGetBufferFunc *get_buffer,
+			 QEMUFileCloseFunc *close,
+                         QEMUFileRateLimit *rate_limit);
 QEMUFile *qemu_fopen(const char *filename, const char *mode);
+QEMUFile *qemu_fopen_fd(int fd);
 void qemu_fflush(QEMUFile *f);
-void qemu_fclose(QEMUFile *f);
+int qemu_fclose(QEMUFile *f);
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
 void qemu_put_byte(QEMUFile *f, int v);
 void qemu_put_be16(QEMUFile *f, unsigned int v);
@@ -20,6 +47,12 @@ int qemu_get_byte(QEMUFile *f);
 unsigned int qemu_get_be16(QEMUFile *f);
 unsigned int qemu_get_be32(QEMUFile *f);
 uint64_t qemu_get_be64(QEMUFile *f);
+int qemu_file_rate_limit(QEMUFile *f);
+
+/* Try to send any outstanding data.  This function is useful when output is
+ * halted due to rate limiting or EAGAIN errors occur as it can be used to
+ * resume output. */
+void qemu_file_put_notify(QEMUFile *f);
 
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
diff --git a/vl.c b/vl.c
index 0a457a9..659fd95 100644
--- a/vl.c
+++ b/vl.c
@@ -6152,11 +6152,12 @@ void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
 #define IO_BUF_SIZE 32768
 
 struct QEMUFile {
-    FILE *outfile;
-    BlockDriverState *bs;
-    int is_file;
-    int is_writable;
-    int64_t base_offset;
+    QEMUFilePutBufferFunc *put_buffer;
+    QEMUFileGetBufferFunc *get_buffer;
+    QEMUFileCloseFunc *close;
+    QEMUFileRateLimit *rate_limit;
+    void *opaque;
+
     int64_t buf_offset; /* start of buffer when writing, end of buffer
                            when reading */
     int buf_index;
@@ -6164,58 +6165,194 @@ struct QEMUFile {
     uint8_t buf[IO_BUF_SIZE];
 };
 
+typedef struct QEMUFileFD
+{
+    int fd;
+    QEMUFile *file;
+} QEMUFileFD;
+
+static void fd_put_notify(void *opaque)
+{
+    QEMUFileFD *s = opaque;
+
+    /* Remove writable callback and do a put notify */
+    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
+    qemu_file_put_notify(s->file);
+}
+
+static int fd_put_buffer(void *opaque, const uint8_t *buf,
+                         int64_t pos, int size)
+{
+    QEMUFileFD *s = opaque;
+    ssize_t len;
+
+    do {
+        len = write(s->fd, buf, size);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1)
+        len = -errno;
+
+    /* When the fd becomes writable again, register a callback to do
+     * a put notify */
+    if (len == -EAGAIN)
+        qemu_set_fd_handler2(s->fd, NULL, NULL, fd_put_notify, s);
+
+    return len;
+}
+
+static int fd_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileFD *s = opaque;
+    ssize_t len;
+
+    do {
+        len = read(s->fd, buf, size);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1)
+        len = -errno;
+
+    return len;
+}
+
+static int fd_close(void *opaque)
+{
+    QEMUFileFD *s = opaque;
+    qemu_free(s);
+    return 0;
+}
+
+QEMUFile *qemu_fopen_fd(int fd)
+{
+    QEMUFileFD *s = qemu_mallocz(sizeof(QEMUFileFD));
+    s->fd = fd;
+    s->file = qemu_fopen_ops(s, fd_put_buffer, fd_get_buffer, fd_close, NULL);
+    return s->file;
+}
+
+typedef struct QEMUFileUnix
+{
+    FILE *outfile;
+} QEMUFileUnix;
+
+static void file_put_buffer(void *opaque, const uint8_t *buf,
+                            int64_t pos, int size)
+{
+    QEMUFileUnix *s = opaque;
+    fseek(s->outfile, pos, SEEK_SET);
+    fwrite(buf, 1, size, s->outfile);
+}
+
+static int file_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileUnix *s = opaque;
+    fseek(s->outfile, pos, SEEK_SET);
+    return fread(buf, 1, size, s->outfile);
+}
+
+static int file_close(void *opaque)
+{
+    QEMUFileUnix *s = opaque;
+    fclose(s->outfile);
+    qemu_free(s);
+    return 0;
+}
+
 QEMUFile *qemu_fopen(const char *filename, const char *mode)
 {
-    QEMUFile *f;
+    QEMUFileUnix *s;
 
-    f = qemu_mallocz(sizeof(QEMUFile));
-    if (!f)
+    s = qemu_mallocz(sizeof(QEMUFileUnix));
+    if (!s)
         return NULL;
-    if (!strcmp(mode, "wb")) {
-        f->is_writable = 1;
-    } else if (!strcmp(mode, "rb")) {
-        f->is_writable = 0;
-    } else {
-        goto fail;
-    }
-    f->outfile = fopen(filename, mode);
-    if (!f->outfile)
+
+    s->outfile = fopen(filename, mode);
+    if (!s->outfile)
         goto fail;
-    f->is_file = 1;
-    return f;
- fail:
-    if (f->outfile)
-        fclose(f->outfile);
-    qemu_free(f);
+
+    if (!strcmp(mode, "wb"))
+	return qemu_fopen_ops(s, file_put_buffer, NULL, file_close, NULL);
+    else if (!strcmp(mode, "rb"))
+	return qemu_fopen_ops(s, NULL, file_get_buffer, file_close, NULL);
+
+fail:
+    if (s->outfile)
+        fclose(s->outfile);
+    qemu_free(s);
     return NULL;
 }
 
-static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int64_t offset, int is_writable)
+typedef struct QEMUFileBdrv
+{
+    BlockDriverState *bs;
+    int64_t base_offset;
+} QEMUFileBdrv;
+
+static void bdrv_put_buffer(void *opaque, const uint8_t *buf,
+                            int64_t pos, int size)
+{
+    QEMUFileBdrv *s = opaque;
+    bdrv_pwrite(s->bs, s->base_offset + pos, buf, size);
+}
+
+static int bdrv_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileBdrv *s = opaque;
+    return bdrv_pread(s->bs, s->base_offset + pos, buf, size);
+}
+
+static int bdrv_fclose(void *opaque)
+{
+    QEMUFileBdrv *s = opaque;
+    qemu_free(s);
+    return 0;
+}
+
+QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int64_t offset, int is_writable)
+{
+    QEMUFileBdrv *s;
+
+    s = qemu_mallocz(sizeof(QEMUFileBdrv));
+    if (!s)
+        return NULL;
+
+    s->bs = bs;
+    s->base_offset = offset;
+
+    if (is_writable)
+	return qemu_fopen_ops(s, bdrv_put_buffer, NULL, bdrv_fclose, NULL);
+
+    return qemu_fopen_ops(s, NULL, bdrv_get_buffer, bdrv_fclose, NULL);
+}
+
+QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
+			 QEMUFileGetBufferFunc *get_buffer,
+			 QEMUFileCloseFunc *close,
+                         QEMUFileRateLimit *rate_limit)
 {
     QEMUFile *f;
 
     f = qemu_mallocz(sizeof(QEMUFile));
     if (!f)
-        return NULL;
-    f->is_file = 0;
-    f->bs = bs;
-    f->is_writable = is_writable;
-    f->base_offset = offset;
+	return NULL;
+
+    f->opaque = opaque;
+    f->put_buffer = put_buffer;
+    f->get_buffer = get_buffer;
+    f->close = close;
+    f->rate_limit = rate_limit;
+
     return f;
 }
 
 void qemu_fflush(QEMUFile *f)
 {
-    if (!f->is_writable)
+    if (!f->put_buffer)
         return;
+
     if (f->buf_index > 0) {
-        if (f->is_file) {
-            fseek(f->outfile, f->buf_offset, SEEK_SET);
-            fwrite(f->buf, 1, f->buf_index, f->outfile);
-        } else {
-            bdrv_pwrite(f->bs, f->base_offset + f->buf_offset,
-                        f->buf, f->buf_index);
-        }
+	f->put_buffer(f->opaque, f->buf, f->buf_offset, f->buf_index);
         f->buf_offset += f->buf_index;
         f->buf_index = 0;
     }
@@ -6225,32 +6362,31 @@ static void qemu_fill_buffer(QEMUFile *f)
 {
     int len;
 
-    if (f->is_writable)
+    if (!f->get_buffer)
         return;
-    if (f->is_file) {
-        fseek(f->outfile, f->buf_offset, SEEK_SET);
-        len = fread(f->buf, 1, IO_BUF_SIZE, f->outfile);
-        if (len < 0)
-            len = 0;
-    } else {
-        len = bdrv_pread(f->bs, f->base_offset + f->buf_offset,
-                         f->buf, IO_BUF_SIZE);
-        if (len < 0)
-            len = 0;
-    }
+
+    len = f->get_buffer(f->opaque, f->buf, f->buf_offset, IO_BUF_SIZE);
+    if (len < 0)
+	len = 0;
+
     f->buf_index = 0;
     f->buf_size = len;
     f->buf_offset += len;
 }
 
-void qemu_fclose(QEMUFile *f)
+int qemu_fclose(QEMUFile *f)
 {
-    if (f->is_writable)
-        qemu_fflush(f);
-    if (f->is_file) {
-        fclose(f->outfile);
-    }
+    int ret = 0;
+    qemu_fflush(f);
+    if (f->close)
+	ret = f->close(f->opaque);
     qemu_free(f);
+    return ret;
+}
+
+void qemu_file_put_notify(QEMUFile *f)
+{
+    f->put_buffer(f->opaque, NULL, 0, 0);
 }
 
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size)
@@ -6324,7 +6460,7 @@ int64_t qemu_fseek(QEMUFile *f, int64_t pos, int whence)
         /* SEEK_END not supported */
         return -1;
     }
-    if (f->is_writable) {
+    if (f->put_buffer) {
         qemu_fflush(f);
         f->buf_offset = pos;
     } else {
@@ -6335,6 +6471,14 @@ int64_t qemu_fseek(QEMUFile *f, int64_t pos, int whence)
     return pos;
 }
 
+int qemu_file_rate_limit(QEMUFile *f)
+{
+    if (f->rate_limit)
+        return f->rate_limit(f->opaque);
+
+    return 0;
+}
+
 void qemu_put_be16(QEMUFile *f, unsigned int v)
 {
     qemu_put_byte(f, v >> 8);

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-10  6:52   ` Avi Kivity
  2008-09-10 10:01   ` Daniel P. Berrange
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all() Anthony Liguori
                   ` (8 subsequent siblings)
  10 siblings, 2 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Live migration happens in the background, but it is useful to make the monitor
command appear as if it's blocking.  This allows a management tool to
immediately know when the live migration has completed without having to poll
the migration status.

This patch allows the monitor to be suspended from a monitor callback which
will prevent new monitor commands from being executed.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/console.h b/console.h
index 561ef51..c94386c 100644
--- a/console.h
+++ b/console.h
@@ -168,6 +168,8 @@ void term_flush(void);
 void term_print_help(void);
 void monitor_readline(const char *prompt, int is_password,
                       char *buf, int buf_size);
+void monitor_suspend(void);
+void monitor_resume(void);
 
 /* readline.c */
 typedef void ReadLineFunc(void *opaque, const char *str);
diff --git a/monitor.c b/monitor.c
index 47c5514..14bdbeb 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2687,10 +2687,27 @@ static void term_read(void *opaque, const uint8_t *buf, int size)
 
 static void monitor_start_input(void);
 
+static int monitor_suspended;
+
 static void monitor_handle_command1(void *opaque, const char *cmdline)
 {
     monitor_handle_command(cmdline);
-    monitor_start_input();
+    if (!monitor_suspended)
+        monitor_start_input();
+    else
+        monitor_suspended = 2;
+}
+
+void monitor_suspend(void)
+{
+    monitor_suspended = 1;
+}
+
+void monitor_resume(void)
+{
+    if (monitor_suspended == 2)
+        monitor_start_input();
+    monitor_suspended = 0;
 }
 
 static void monitor_start_input(void)

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-10 13:26   ` Chris Lalancette
  2008-09-12 15:43   ` Blue Swirl
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration Anthony Liguori
                   ` (7 subsequent siblings)
  10 siblings, 2 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch adds a bdrv_flush_all() function.  It's necessary to ensure that all
IO operations have been flushed to disk before completely a live migration.

N.B. we don't actually use this now.  We really should flush the block drivers
using an live savevm callback to avoid unnecessary guest down time.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/block.c b/block.c
index 544176f..921d382 100644
--- a/block.c
+++ b/block.c
@@ -884,6 +884,21 @@ void bdrv_flush(BlockDriverState *bs)
         bdrv_flush(bs->backing_hd);
 }
 
+void bdrv_iterate_writeable(void (*it)(BlockDriverState *bs))
+{
+    BlockDriverState *bs;
+
+    for (bs = bdrv_first; bs != NULL; bs = bs->next)
+        if (bs->drv && !bdrv_is_read_only(bs) && 
+            (!bdrv_is_removable(bs) || bdrv_is_inserted(bs)))
+	    it(bs);
+}
+
+void bdrv_flush_all(void)
+{
+    bdrv_iterate_writeable(bdrv_flush);
+}
+
 /*
  * Returns true iff the specified sector is present in the disk image. Drivers
  * not implementing the functionality are assumed to not support backing files,
diff --git a/block.h b/block.h
index fa741b5..8586dc1 100644
--- a/block.h
+++ b/block.h
@@ -112,6 +112,8 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 #define BIOS_ATA_TRANSLATION_LARGE  3
 #define BIOS_ATA_TRANSLATION_RECHS  4
 
+void bdrv_flush_all(void);
+
 void bdrv_set_geometry_hint(BlockDriverState *bs,
                             int cyls, int heads, int secs);
 void bdrv_set_type_hint(BlockDriverState *bs, int type);

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (2 preceding siblings ...)
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all() Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-10 14:52   ` Glauber Costa
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 5/10] Add network announce function Anthony Liguori
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch adds a dirty tracking bit for live migration.  We use 0x08 because
kqemu uses 0x04.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/cpu-all.h b/cpu-all.h
index d350b30..fdac353 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -944,6 +944,7 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
 
 #define VGA_DIRTY_FLAG  0x01
 #define CODE_DIRTY_FLAG 0x02
+#define MIGRATION_DIRTY_FLAG 0x08
 
 /* read dirty bit (return 0 or 1) */
 static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
@@ -966,6 +967,10 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
                                      int dirty_flags);
 void cpu_tlb_update_dirty(CPUState *env);
 
+int cpu_physical_memory_set_dirty_tracking(int enable);
+
+int cpu_physical_memory_get_dirty_tracking(void);
+
 void dump_exec_info(FILE *f,
                     int (*cpu_fprintf)(FILE *f, const char *fmt, ...));
 
diff --git a/exec.c b/exec.c
index 3ab4ad0..9dba5c8 100644
--- a/exec.c
+++ b/exec.c
@@ -38,6 +38,7 @@
 #include "qemu-common.h"
 #include "tcg.h"
 #include "hw/hw.h"
+#include "osdep.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
 #endif
@@ -113,6 +114,7 @@ ram_addr_t phys_ram_size;
 int phys_ram_fd;
 uint8_t *phys_ram_base;
 uint8_t *phys_ram_dirty;
+static int in_migration;
 static ram_addr_t phys_ram_alloc_offset = 0;
 #endif
 
@@ -1777,6 +1779,17 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
     }
 }
 
+int cpu_physical_memory_set_dirty_tracking(int enable)
+{
+    in_migration = enable;
+    return 0;
+}
+
+int cpu_physical_memory_get_dirty_tracking(void)
+{
+    return in_migration;
+}
+
 static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry)
 {
     ram_addr_t ram_addr;
@@ -2932,9 +2945,19 @@ void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val)
         io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
         io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
     } else {
-        ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
-            (addr & ~TARGET_PAGE_MASK);
+        unsigned long addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+        ptr = phys_ram_base + addr1;
         stl_p(ptr, val);
+
+        if (unlikely(in_migration)) {
+            if (!cpu_physical_memory_is_dirty(addr1)) {
+                /* invalidate code */
+                tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
+                /* set dirty bit */
+                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
+                    (0xff & ~CODE_DIRTY_FLAG);
+            }
+        }
     }
 }
 

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 5/10] Add network announce function
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (3 preceding siblings ...)
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-10 13:27   ` Chris Lalancette
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 6/10] Introduce v3 of savevm protocol Anthony Liguori
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch adds an ethernet announce function that will minimize downtime
when doing a live migration.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/sysemu.h b/sysemu.h
index b12fae0..cebcc60 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -46,6 +46,8 @@ void do_loadvm(const char *name);
 void do_delvm(const char *name);
 void do_info_snapshots(void);
 
+void qemu_announce_self(void);
+
 void main_loop_wait(int timeout);
 
 /* Polling handling */
diff --git a/vl.c b/vl.c
index ac9f8b0..7093c9c 100644
--- a/vl.c
+++ b/vl.c
@@ -6115,6 +6115,45 @@ void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
 }
 #endif
 
+#define SELF_ANNOUNCE_ROUNDS 5
+#define ETH_P_EXPERIMENTAL 0x01F1 /* just a number */
+//#define ETH_P_EXPERIMENTAL 0x0012 /* make it the size of the packet */
+#define EXPERIMENTAL_MAGIC 0xf1f23f4f
+
+static int announce_self_create(uint8_t *buf, 
+				uint8_t *mac_addr)
+{
+    uint32_t magic = EXPERIMENTAL_MAGIC;
+    uint16_t proto = htons(ETH_P_EXPERIMENTAL);
+
+    /* FIXME: should we send a different packet (arp/rarp/ping)? */
+
+    memset(buf, 0xff, 6);         /* h_dst */
+    memcpy(buf + 6, mac_addr, 6); /* h_src */
+    memcpy(buf + 12, &proto, 2);  /* h_proto */
+    memcpy(buf + 14, &magic, 4);  /* magic */
+
+    return 18; /* len */
+}
+
+void qemu_announce_self(void)
+{
+    int i, j, len;
+    VLANState *vlan;
+    VLANClientState *vc;
+    uint8_t buf[256];
+
+    for (i = 0; i < nb_nics; i++) {
+        len = announce_self_create(buf, nd_table[i].macaddr);
+        vlan = nd_table[i].vlan;
+        for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
+            if (vc->fd_read == tap_receive)  /* send only if tap */
+                for (j=0; j < SELF_ANNOUNCE_ROUNDS; j++)
+                    vc->fd_read(vc->opaque, buf, len);
+        }
+    }
+}
+
 /***********************************************************/
 /* savevm/loadvm support */
 

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 6/10] Introduce v3 of savevm protocol
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (4 preceding siblings ...)
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 5/10] Add network announce function Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-10  7:09   ` Avi Kivity
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live" Anthony Liguori
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

The current savevm/loadvm protocol has some draw backs.  It does not support
the ability to do progressive saving which means it cannot be used for live
checkpointing or migration.  The sections sizes are 32-bit integers which
means that it will not function when using more than 4GB of memory for a guest.
It attempts to seek within the output file which means it cannot be streamed.
The current protocol also is pretty lax about how it supports forward
compatibility.  If a saved section version is greater than what the restore
code support, the restore code generally treats the saved data as being in
whatever version it supports.  This means that restoring a saved VM on an older
version of QEMU will likely result in silent guest failure.

This patch introduces a new version of the savevm protocol.  It has the
following features:

 * Support for progressive save of sections (for live checkpoint/migration)
 * An asynchronous API for doing save
 * Support for interleaving multiple progressive save sections
   (for future support of memory hot-add/storage migration)
 * Fully streaming format
 * Strong section version checking

Right now, the code is missing to support restoring v2 images.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/hw/hw.h b/hw/hw.h
index 242f315..12ab161 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -92,6 +92,7 @@ int64_t qemu_ftell(QEMUFile *f);
 int64_t qemu_fseek(QEMUFile *f, int64_t pos, int whence);
 
 typedef void SaveStateHandler(QEMUFile *f, void *opaque);
+typedef int SaveLiveStateHandler(QEMUFile *f, int stage, void *opaque);
 typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
 
 int register_savevm(const char *idstr,
@@ -101,6 +102,14 @@ int register_savevm(const char *idstr,
                     LoadStateHandler *load_state,
                     void *opaque);
 
+int register_savevm_live(const char *idstr,
+                         int instance_id,
+                         int version_id,
+                         SaveLiveStateHandler *save_live_state,
+                         SaveStateHandler *save_state,
+                         LoadStateHandler *load_state,
+                         void *opaque);
+
 typedef void QEMUResetHandler(void *opaque);
 
 void qemu_register_reset(QEMUResetHandler *func, void *opaque);
diff --git a/sysemu.h b/sysemu.h
index cebcc60..0c5d62b 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -50,6 +50,12 @@ void qemu_announce_self(void);
 
 void main_loop_wait(int timeout);
 
+int qemu_savevm_state_begin(QEMUFile *f);
+int qemu_savevm_state_iterate(QEMUFile *f);
+int qemu_savevm_state_complete(QEMUFile *f);
+int qemu_savevm_state(QEMUFile *f);
+int qemu_loadvm_state(QEMUFile *f);
+
 /* Polling handling */
 
 /* return TRUE if no sleep should be done afterwards */
diff --git a/vl.c b/vl.c
index 3d11024..8044ef5 100644
--- a/vl.c
+++ b/vl.c
@@ -6499,6 +6499,8 @@ typedef struct SaveStateEntry {
     char idstr[256];
     int instance_id;
     int version_id;
+    int section_id;
+    SaveLiveStateHandler *save_live_state;
     SaveStateHandler *save_state;
     LoadStateHandler *load_state;
     void *opaque;
@@ -6511,14 +6513,16 @@ static SaveStateEntry *first_se;
    of the system, so instance_id should be removed/replaced.
    Meanwhile pass -1 as instance_id if you do not already have a clearly
    distinguishing id for all instances of your device class. */
-int register_savevm(const char *idstr,
-                    int instance_id,
-                    int version_id,
-                    SaveStateHandler *save_state,
-                    LoadStateHandler *load_state,
-                    void *opaque)
+int register_savevm_live(const char *idstr,
+                         int instance_id,
+                         int version_id,
+                         SaveLiveStateHandler *save_live_state,
+                         SaveStateHandler *save_state,
+                         LoadStateHandler *load_state,
+                         void *opaque)
 {
     SaveStateEntry *se, **pse;
+    static int global_section_id;
 
     se = qemu_malloc(sizeof(SaveStateEntry));
     if (!se)
@@ -6526,6 +6530,8 @@ int register_savevm(const char *idstr,
     pstrcpy(se->idstr, sizeof(se->idstr), idstr);
     se->instance_id = (instance_id == -1) ? 0 : instance_id;
     se->version_id = version_id;
+    se->section_id = global_section_id++;
+    se->save_live_state = save_live_state;
     se->save_state = save_state;
     se->load_state = load_state;
     se->opaque = opaque;
@@ -6544,25 +6550,105 @@ int register_savevm(const char *idstr,
     return 0;
 }
 
-#define QEMU_VM_FILE_MAGIC   0x5145564d
-#define QEMU_VM_FILE_VERSION 0x00000002
+int register_savevm(const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque)
+{
+    return register_savevm_live(idstr, instance_id, version_id,
+                                NULL, save_state, load_state, opaque);
+}
+
+#define QEMU_VM_FILE_MAGIC   		0x5145564d
+#define QEMU_VM_FILE_VERSION_COMPAT	0x00000002
+#define QEMU_VM_FILE_VERSION 		0x00000003
 
-static int qemu_savevm_state(QEMUFile *f)
+#define QEMU_VM_EOF			0x00
+#define QEMU_VM_SECTION_START		0x01
+#define QEMU_VM_SECTION_PART	        0x02
+#define QEMU_VM_SECTION_END		0x03
+#define QEMU_VM_SECTION_FULL		0x04
+
+int qemu_savevm_state_begin(QEMUFile *f)
 {
     SaveStateEntry *se;
-    int len, ret;
-    int64_t cur_pos, len_pos, total_len_pos;
 
     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
-    total_len_pos = qemu_ftell(f);
-    qemu_put_be64(f, 0); /* total size */
+
+    for (se = first_se; se != NULL; se = se->next) {
+        int len;
+
+        if (se->save_live_state == NULL)
+            continue;
+
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_START);
+        qemu_put_be32(f, se->section_id);
+
+        /* ID string */
+        len = strlen(se->idstr);
+        qemu_put_byte(f, len);
+        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
+
+        qemu_put_be32(f, se->instance_id);
+        qemu_put_be32(f, se->version_id);
+
+        se->save_live_state(f, QEMU_VM_SECTION_START, se->opaque);
+    }
+
+    return 0;
+}
+
+int qemu_savevm_state_iterate(QEMUFile *f)
+{
+    SaveStateEntry *se;
+    int ret = 0;
+
+    for (se = first_se; se != NULL; se = se->next) {
+        if (se->save_live_state == NULL)
+            continue;
+
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_PART);
+        qemu_put_be32(f, se->section_id);
+
+        ret |= se->save_live_state(f, QEMU_VM_SECTION_PART, se->opaque);
+    }
+
+    if (ret)
+        return 1;
+
+    return 0;
+}
+
+int qemu_savevm_state_complete(QEMUFile *f)
+{
+    SaveStateEntry *se;
+
+    for (se = first_se; se != NULL; se = se->next) {
+        if (se->save_live_state == NULL)
+            continue;
+
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_END);
+        qemu_put_be32(f, se->section_id);
+
+        se->save_live_state(f, QEMU_VM_SECTION_END, se->opaque);
+    }
 
     for(se = first_se; se != NULL; se = se->next) {
+        int len;
+
 	if (se->save_state == NULL)
-	    /* this one has a loader only, for backwards compatibility */
 	    continue;
 
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_FULL);
+        qemu_put_be32(f, se->section_id);
+
         /* ID string */
         len = strlen(se->idstr);
         qemu_put_byte(f, len);
@@ -6571,24 +6657,37 @@ static int qemu_savevm_state(QEMUFile *f)
         qemu_put_be32(f, se->instance_id);
         qemu_put_be32(f, se->version_id);
 
-        /* record size: filled later */
-        len_pos = qemu_ftell(f);
-        qemu_put_be32(f, 0);
         se->save_state(f, se->opaque);
-
-        /* fill record size */
-        cur_pos = qemu_ftell(f);
-        len = cur_pos - len_pos - 4;
-        qemu_fseek(f, len_pos, SEEK_SET);
-        qemu_put_be32(f, len);
-        qemu_fseek(f, cur_pos, SEEK_SET);
     }
-    cur_pos = qemu_ftell(f);
-    qemu_fseek(f, total_len_pos, SEEK_SET);
-    qemu_put_be64(f, cur_pos - total_len_pos - 8);
-    qemu_fseek(f, cur_pos, SEEK_SET);
 
-    ret = 0;
+    qemu_put_byte(f, QEMU_VM_EOF);
+
+    return 0;
+}
+
+int qemu_savevm_state(QEMUFile *f)
+{
+    int saved_vm_running;
+    int ret;
+
+    saved_vm_running = vm_running;
+    vm_stop(0);
+
+    ret = qemu_savevm_state_begin(f);
+    if (ret < 0)
+        goto out;
+
+    do {
+        ret = qemu_savevm_state_iterate(f);
+        if (ret < 0)
+            goto out;
+    } while (ret == 1);
+
+    ret = qemu_savevm_state_complete(f);
+
+out:
+    if (saved_vm_running)
+        vm_start();
     return ret;
 }
 
@@ -6604,23 +6703,20 @@ static SaveStateEntry *find_se(const char *idstr, int instance_id)
     return NULL;
 }
 
-static int qemu_loadvm_state(QEMUFile *f)
+typedef struct LoadStateEntry {
+    SaveStateEntry *se;
+    int section_id;
+    int version_id;
+    struct LoadStateEntry *next;
+} LoadStateEntry;
+
+static int qemu_loadvm_state_v2(QEMUFile *f)
 {
     SaveStateEntry *se;
     int len, ret, instance_id, record_len, version_id;
     int64_t total_len, end_pos, cur_pos;
-    unsigned int v;
     char idstr[256];
 
-    v = qemu_get_be32(f);
-    if (v != QEMU_VM_FILE_MAGIC)
-        goto fail;
-    v = qemu_get_be32(f);
-    if (v != QEMU_VM_FILE_VERSION) {
-    fail:
-        ret = -1;
-        goto the_end;
-    }
     total_len = qemu_get_be64(f);
     end_pos = total_len + qemu_ftell(f);
     for(;;) {
@@ -6632,10 +6728,6 @@ static int qemu_loadvm_state(QEMUFile *f)
         instance_id = qemu_get_be32(f);
         version_id = qemu_get_be32(f);
         record_len = qemu_get_be32(f);
-#if 0
-        printf("idstr=%s instance=0x%x version=%d len=%d\n",
-               idstr, instance_id, version_id, record_len);
-#endif
         cur_pos = qemu_ftell(f);
         se = find_se(idstr, instance_id);
         if (!se) {
@@ -6651,8 +6743,104 @@ static int qemu_loadvm_state(QEMUFile *f)
         /* always seek to exact end of record */
         qemu_fseek(f, cur_pos + record_len, SEEK_SET);
     }
+    return 0;
+}
+
+int qemu_loadvm_state(QEMUFile *f)
+{
+    LoadStateEntry *first_le = NULL;
+    uint8_t section_type;
+    unsigned int v;
+    int ret;
+
+    v = qemu_get_be32(f);
+    if (v != QEMU_VM_FILE_MAGIC)
+        return -EINVAL;
+
+    v = qemu_get_be32(f);
+    if (v == QEMU_VM_FILE_VERSION_COMPAT)
+        return qemu_loadvm_state_v2(f);
+    if (v != QEMU_VM_FILE_VERSION)
+        return -ENOTSUP;
+
+    while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
+        uint32_t instance_id, version_id, section_id;
+        LoadStateEntry *le;
+        SaveStateEntry *se;
+        char idstr[257];
+        int len;
+
+        switch (section_type) {
+        case QEMU_VM_SECTION_START:
+        case QEMU_VM_SECTION_FULL:
+            /* Read section start */
+            section_id = qemu_get_be32(f);
+            len = qemu_get_byte(f);
+            qemu_get_buffer(f, (uint8_t *)idstr, len);
+            idstr[len] = 0;
+            instance_id = qemu_get_be32(f);
+            version_id = qemu_get_be32(f);
+
+            /* Find savevm section */
+            se = find_se(idstr, instance_id);
+            if (se == NULL) {
+                fprintf(stderr, "Unknown savevm section or instance '%s' %d\n", idstr, instance_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            /* Validate version */
+            if (version_id > se->version_id) {
+                fprintf(stderr, "savevm: unsupported version %d for '%s' v%d\n",
+                        version_id, idstr, se->version_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            /* Add entry */
+            le = qemu_mallocz(sizeof(*le));
+            if (le == NULL) {
+                ret = -ENOMEM;
+                goto out;
+            }
+
+            le->se = se;
+            le->section_id = section_id;
+            le->version_id = version_id;
+            le->next = first_le;
+            first_le = le;
+
+            le->se->load_state(f, le->se->opaque, le->version_id);
+            break;
+        case QEMU_VM_SECTION_PART:
+        case QEMU_VM_SECTION_END:
+            section_id = qemu_get_be32(f);
+
+            for (le = first_le; le && le->section_id != section_id; le = le->next);
+            if (le == NULL) {
+                fprintf(stderr, "Unknown savevm section %d\n", section_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            le->se->load_state(f, le->se->opaque, le->version_id);
+            break;
+        default:
+            fprintf(stderr, "Unknown savevm section type %d\n", section_type);
+            ret = -EINVAL;
+            goto out;
+        }
+    }
+
     ret = 0;
- the_end:
+
+out:
+    while (first_le) {
+        LoadStateEntry *le = first_le;
+        first_le = first_le->next;
+        qemu_free(le);
+    }
+
     return ret;
 }
 

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live"
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (5 preceding siblings ...)
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 6/10] Introduce v3 of savevm protocol Anthony Liguori
@ 2008-09-09 19:49 ` Anthony Liguori
  2008-09-09 22:25   ` Jamie Lokier
  2008-09-10  7:17   ` Avi Kivity
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 8/10] Introduce a buffered QEMUFile wrapper Anthony Liguori
                   ` (3 subsequent siblings)
  10 siblings, 2 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch replaces the static memory savevm/loadvm handler with a "live" one.
This handler is used even if performing a non-live migration.

The key difference between this handler and the previous is that each page is
prefixed with the address of the page.  The QEMUFile rate limiting code, in
combination with the live migration dirty tracking bits, is used to determine
which pages should be sent and how many should be sent.

The live save code "converges" when the number of dirty pages reaches a fixed
amount.  Currently, this is 10 pages.  This is something that should eventually
be derived from whatever the bandwidth limitation is.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/vl.c b/vl.c
index db87e6f..a55ccb4 100644
--- a/vl.c
+++ b/vl.c
@@ -7293,61 +7293,143 @@ static void ram_decompress_close(RamDecompressState *s)
     inflateEnd(&s->zstream);
 }
 
-static void ram_save(QEMUFile *f, void *opaque)
+#define RAM_SAVE_FLAG_FULL	0x01
+#define RAM_SAVE_FLAG_COMPRESS	0x02
+#define RAM_SAVE_FLAG_MEM_SIZE	0x04
+#define RAM_SAVE_FLAG_PAGE	0x08
+#define RAM_SAVE_FLAG_EOS	0x10
+
+static void ram_save_dead(QEMUFile *f, void *opaque)
 {
-    ram_addr_t i;
     RamCompressState s1, *s = &s1;
     uint8_t buf[10];
+    ram_addr_t i;
+
+    qemu_put_be64(f, phys_ram_size | RAM_SAVE_FLAG_FULL | RAM_SAVE_FLAG_MEM_SIZE);
 
-    qemu_put_be32(f, phys_ram_size);
     if (ram_compress_open(s, f) < 0)
         return;
     for(i = 0; i < phys_ram_size; i+= BDRV_HASH_BLOCK_SIZE) {
-#if 0
-        if (tight_savevm_enabled) {
-            int64_t sector_num;
-            int j;
-
-            /* find if the memory block is available on a virtual
-               block device */
-            sector_num = -1;
-            for(j = 0; j < nb_drives; j++) {
-                sector_num = bdrv_hash_find(drives_table[j].bdrv,
-                                            phys_ram_base + i,
-					    BDRV_HASH_BLOCK_SIZE);
-                if (sector_num >= 0)
-                    break;
+        //        normal_compress:
+        buf[0] = 0;
+        ram_compress_buf(s, buf, 1);
+        ram_compress_buf(s, phys_ram_base + i, BDRV_HASH_BLOCK_SIZE);
+    }
+    ram_compress_close(s);
+}
+
+static int is_dup_page(uint8_t *page, uint8_t ch)
+{
+    uint32_t val = ch << 24 | ch << 16 | ch << 8 | ch;
+    uint32_t *array = (uint32_t *)page;
+    int i;
+
+    for (i = 0; i < (TARGET_PAGE_SIZE / 4); i++) {
+        if (array[i] != val)
+            return 0;
+    }
+
+    return 1;
+}
+
+static int ram_save_block(QEMUFile *f)
+{
+    static ram_addr_t current_addr = 0;
+    ram_addr_t saved_addr = current_addr;
+    ram_addr_t addr = 0;
+    int found = 0;
+
+    while (addr < phys_ram_size) {
+        if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) {
+            uint8_t ch;
+
+            cpu_physical_memory_reset_dirty(current_addr,
+                                            current_addr + TARGET_PAGE_SIZE,
+                                            MIGRATION_DIRTY_FLAG);
+
+            ch = *(phys_ram_base + current_addr);
+
+            if (is_dup_page(phys_ram_base + current_addr, ch)) {
+                qemu_put_be64(f, current_addr | RAM_SAVE_FLAG_COMPRESS);
+                qemu_put_byte(f, ch);
+            } else {
+                qemu_put_be64(f, current_addr | RAM_SAVE_FLAG_PAGE);
+                qemu_put_buffer(f, phys_ram_base + current_addr, TARGET_PAGE_SIZE);
             }
-            if (j == nb_drives)
-                goto normal_compress;
-            buf[0] = 1;
-            buf[1] = j;
-            cpu_to_be64wu((uint64_t *)(buf + 2), sector_num);
-            ram_compress_buf(s, buf, 10);
-        } else
-#endif
-        {
-            //        normal_compress:
-            buf[0] = 0;
-            ram_compress_buf(s, buf, 1);
-            ram_compress_buf(s, phys_ram_base + i, BDRV_HASH_BLOCK_SIZE);
+
+            found = 1;
+            break;
         }
+        addr += TARGET_PAGE_SIZE;
+        current_addr = (saved_addr + addr) % phys_ram_size;
     }
-    ram_compress_close(s);
+
+    return found;
 }
 
-static int ram_load(QEMUFile *f, void *opaque, int version_id)
+static ram_addr_t ram_save_threshold = 10;
+
+static ram_addr_t ram_save_remaining(void)
+{
+    ram_addr_t addr;
+    ram_addr_t count = 0;
+
+    for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) {
+        if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
+            count++;
+    }
+
+    return count;
+}
+
+static int ram_save_live(QEMUFile *f, int stage, void *opaque)
+{
+    ram_addr_t addr;
+
+    /* FIXME handling !vm_running && stage == 3 specially */
+
+    if (stage == 1) {
+        /* Make sure all dirty bits are set */
+        for (addr = 0; addr < phys_ram_size; addr += TARGET_PAGE_SIZE) {
+            if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
+                cpu_physical_memory_set_dirty(addr);
+        }
+        
+        /* Enable dirty memory tracking */
+        cpu_physical_memory_set_dirty_tracking(1);
+
+        qemu_put_be64(f, phys_ram_size | RAM_SAVE_FLAG_MEM_SIZE);
+    }
+
+    while (!qemu_file_rate_limit(f)) {
+        int ret;
+
+        ret = ram_save_block(f);
+        if (ret == 0) /* no more blocks */
+            break;
+    }
+
+    /* try transferring iterative blocks of memory */
+
+    if (stage == 3) {
+        cpu_physical_memory_set_dirty_tracking(0);
+
+        /* flush all remaining blocks regardless of rate limiting */
+        while (ram_save_block(f) != 0);
+    }
+
+out:
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
+    return (stage == 2) && (ram_save_remaining() < ram_save_threshold);
+}
+
+static int ram_load_dead(QEMUFile *f, void *opaque)
 {
     RamDecompressState s1, *s = &s1;
     uint8_t buf[10];
     ram_addr_t i;
 
-    if (version_id == 1)
-        return ram_load_v1(f, opaque);
-    if (version_id != 2)
-        return -EINVAL;
-    if (qemu_get_be32(f) != phys_ram_size)
-        return -EINVAL;
     if (ram_decompress_open(s, f) < 0)
         return -EINVAL;
     for(i = 0; i < phys_ram_size; i+= BDRV_HASH_BLOCK_SIZE) {
@@ -7360,35 +7442,57 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                 fprintf(stderr, "Error while reading ram block address=0x%08" PRIx64, (uint64_t)i);
                 goto error;
             }
-        } else
-#if 0
-        if (buf[0] == 1) {
-            int bs_index;
-            int64_t sector_num;
-
-            ram_decompress_buf(s, buf + 1, 9);
-            bs_index = buf[1];
-            sector_num = be64_to_cpupu((const uint64_t *)(buf + 2));
-            if (bs_index >= nb_drives) {
-                fprintf(stderr, "Invalid block device index %d\n", bs_index);
-                goto error;
-            }
-            if (bdrv_read(drives_table[bs_index].bdrv, sector_num,
-	                  phys_ram_base + i,
-                          BDRV_HASH_BLOCK_SIZE / 512) < 0) {
-                fprintf(stderr, "Error while reading sector %d:%" PRId64 "\n",
-                        bs_index, sector_num);
-                goto error;
-            }
-        } else
-#endif
-        {
+        } else {
         error:
             printf("Error block header\n");
             return -EINVAL;
         }
     }
     ram_decompress_close(s);
+
+    return 0;
+}
+
+static int ram_load(QEMUFile *f, void *opaque, int version_id)
+{
+    ram_addr_t addr;
+    int flags;
+
+    if (version_id == 1)
+        return ram_load_v1(f, opaque);
+
+    if (version_id == 2) {
+        if (qemu_get_be32(f) != phys_ram_size)
+            return -EINVAL;
+        return ram_load_dead(f, opaque);
+    }
+
+    if (version_id != 3)
+        return -EINVAL;
+
+    do {
+        addr = qemu_get_be64(f);
+
+        flags = addr & ~TARGET_PAGE_MASK;
+        addr &= TARGET_PAGE_MASK;
+
+        if (flags & RAM_SAVE_FLAG_MEM_SIZE) {
+            if (addr != phys_ram_size)
+                return -EINVAL;
+        }
+
+        if (flags & RAM_SAVE_FLAG_FULL) {
+            if (ram_load_dead(f, opaque) < 0)
+                return -EINVAL;
+        }
+        
+        if (flags & RAM_SAVE_FLAG_COMPRESS) {
+            uint8_t ch = qemu_get_byte(f);
+            memset(phys_ram_base + addr, ch, TARGET_PAGE_SIZE);
+        } else if (flags & RAM_SAVE_FLAG_PAGE)
+            qemu_get_buffer(f, phys_ram_base + addr, TARGET_PAGE_SIZE);
+    } while (!(flags & RAM_SAVE_FLAG_EOS));
+
     return 0;
 }
 
@@ -9354,7 +9458,7 @@ int main(int argc, char **argv)
 	    exit(1);
 
     register_savevm("timer", 0, 2, timer_save, timer_load, NULL);
-    register_savevm("ram", 0, 2, ram_save, ram_load, NULL);
+    register_savevm_live("ram", 0, 3, ram_save_live, NULL, ram_load, NULL);
 
     /* terminal init */
     memset(&display_state, 0, sizeof(display_state));

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 8/10] Introduce a buffered QEMUFile wrapper
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (6 preceding siblings ...)
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live" Anthony Liguori
@ 2008-09-09 19:50 ` Anthony Liguori
  2008-09-12 15:16   ` Blue Swirl
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 9/10] Introduce the UI components for live migration Anthony Liguori
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch introduces a buffered QEMUFile wrapper.  This allows QEMUFile's to be
rate limited.  It also allows makes it easier to implement a QEMUFile that is
asynchronous.

The only real non-obvious part of the API is the "frozen" concept.  If the backend
returns EAGAIN, the QEMUFile is said to be "frozen".  This means no additional
output will be sent to the backend until the file is unfrozen.  qemu_file_put_notify
can be used to unfreeze a frozen file.

A synchronous interface is also provided to wait for an unfreeze event.  This is
used during the final part of live migration when the VM is no longer running.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 2e8e0a0..4b4cdd3 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -472,7 +472,7 @@ endif #CONFIG_DARWIN_USER
 # System emulator target
 ifndef CONFIG_USER_ONLY
 
-OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o
+OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o migration.o
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
diff --git a/hw/hw.h b/hw/hw.h
index b7958e4..10fc70a 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -11,8 +11,8 @@
  * The pos argument can be ignored if the file is only being used for
  * streaming.  The handler should try to write all of the data it can.
  */
-typedef void (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
-                                     int64_t pos, int size);
+typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
+                                    int64_t pos, int size);
 
 /* Read a chunk of data from a file at the given position.  The pos argument
  * can be ignored if the file is only be used for streaming.  The number of
diff --git a/migration.c b/migration.c
new file mode 100644
index 0000000..507c9d9
--- /dev/null
+++ b/migration.c
@@ -0,0 +1,200 @@
+#include "qemu-common.h"
+#include "hw/hw.h"
+#include "qemu-timer.h"
+#include "sysemu.h"
+#include "qemu-char.h"
+#include "migration.h"
+
+typedef struct QEMUFileBuffered
+{
+    BufferedPutFunc *put_buffer;
+    BufferedPutReadyFunc *put_ready;
+    BufferedWaitForUnfreezeFunc *wait_for_unfreeze;
+    BufferedCloseFunc *close;
+    void *opaque;
+    QEMUFile *file;
+    int has_error;
+    int freeze_output;
+    size_t bytes_xfer;
+    size_t xfer_limit;
+    uint8_t *buffer;
+    size_t buffer_size;
+    size_t buffer_capacity;
+    QEMUTimer *timer;
+} QEMUFileBuffered;
+
+static void buffered_append(QEMUFileBuffered *s,
+                            const uint8_t *buf, size_t size)
+{
+    if (size > (s->buffer_capacity - s->buffer_size)) {
+        void *tmp;
+
+        s->buffer_capacity += size + 1024;
+
+        tmp = qemu_realloc(s->buffer, s->buffer_capacity);
+        if (tmp == NULL) {
+            fprintf(stderr, "qemu file buffer expansion failed\n");
+            exit(1);
+        }
+
+        s->buffer = tmp;
+    }
+
+    memcpy(s->buffer + s->buffer_size, buf, size);
+    s->buffer_size += size;
+}
+
+static void buffered_flush(QEMUFileBuffered *s)
+{
+    size_t offset = 0;
+
+    if (s->has_error)
+        return;
+
+    while (offset < s->buffer_size) {
+        ssize_t ret;
+
+        ret = s->put_buffer(s->opaque, s->buffer + offset,
+                            s->buffer_size - offset);
+        if (ret == -EAGAIN) {
+            s->freeze_output = 1;
+            break;
+        }
+
+        if (ret <= 0)
+            break;
+
+        offset += ret;
+    }
+
+    memmove(s->buffer, s->buffer + offset, s->buffer_size - offset);
+    s->buffer_size -= offset;
+}
+
+static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileBuffered *s = opaque;
+    size_t offset = 0;
+    ssize_t ret;
+
+    if (s->has_error)
+        return -EINVAL;
+
+    s->freeze_output = 0;
+
+    buffered_flush(s);
+
+    while (offset < size) {
+        if (s->bytes_xfer > s->xfer_limit)
+            break;
+
+        ret = s->put_buffer(s->opaque, buf + offset, size - offset);
+        if (ret == -EAGAIN) {
+            s->freeze_output = 1;
+            break;
+        }
+
+        if (ret <= 0) {
+            s->has_error = 1;
+            break;
+        }
+
+        offset += ret;
+        s->bytes_xfer += ret;
+    }
+
+    buffered_append(s, buf + offset, size - offset);
+
+    return offset;
+}
+
+static int buffered_close(void *opaque)
+{
+    QEMUFileBuffered *s = opaque;
+    int ret;
+
+    if (s->has_error)
+        return -1;
+
+    while (s->buffer_size) {
+        buffered_flush(s);
+        if (s->freeze_output)
+            s->wait_for_unfreeze(s);
+    }
+
+    ret = s->close(s->opaque);
+
+    qemu_del_timer(s->timer);
+    qemu_free_timer(s->timer);
+    qemu_free(s->buffer);
+    qemu_free(s);
+
+    return ret;
+}
+
+static int buffered_rate_limit(void *opaque)
+{
+    QEMUFileBuffered *s = opaque;
+
+    if (s->has_error)
+        return 0;
+
+    if (s->freeze_output)
+        return 1;
+
+    if (s->bytes_xfer > s->xfer_limit)
+        return 1;
+
+    return 0;
+}
+
+static void buffered_rate_tick(void *opaque)
+{
+    QEMUFileBuffered *s = opaque;
+
+    if (s->has_error)
+        return;
+
+    qemu_mod_timer(s->timer, qemu_get_clock(rt_clock) + 100);
+
+    if (s->freeze_output)
+        return;
+
+    s->bytes_xfer = 0;
+
+    buffered_flush(s);
+
+    /* Add some checks around this */
+    s->put_ready(s->opaque);
+}
+
+QEMUFile *qemu_fopen_ops_buffered(void *opaque,
+                                  size_t bytes_per_sec,
+                                  BufferedPutFunc *put_buffer,
+                                  BufferedPutReadyFunc *put_ready,
+                                  BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
+                                  BufferedCloseFunc *close)
+{
+    QEMUFileBuffered *s;
+
+    s = qemu_mallocz(sizeof(*s));
+    if (s == NULL)
+        return NULL;
+
+    s->opaque = opaque;
+    s->xfer_limit = bytes_per_sec / 10;
+    s->put_buffer = put_buffer;
+    s->put_ready = put_ready;
+    s->wait_for_unfreeze = wait_for_unfreeze;
+    s->close = close;
+
+    s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
+                             buffered_close, buffered_rate_limit);
+
+    s->timer = qemu_new_timer(rt_clock, buffered_rate_tick, s);
+
+    qemu_mod_timer(s->timer, qemu_get_clock(rt_clock) + 100);
+
+    return s->file;
+}
+
diff --git a/migration.h b/migration.h
new file mode 100644
index 0000000..3994fbb
--- /dev/null
+++ b/migration.h
@@ -0,0 +1,17 @@
+#ifndef QEMU_MIGRATION_H
+#define QEMU_MIGRATION_H
+
+#include "hw/hw.h"
+
+typedef ssize_t (BufferedPutFunc)(void *opaque, const void *data, size_t size);
+typedef void (BufferedPutReadyFunc)(void *opaque);
+typedef void (BufferedWaitForUnfreezeFunc)(void *opaque);
+typedef int (BufferedCloseFunc)(void *opaque);
+
+QEMUFile *qemu_fopen_ops_buffered(void *opaque, size_t xfer_limit,
+                                  BufferedPutFunc *put_buffer,
+                                  BufferedPutReadyFunc *put_ready,
+                                  BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
+                                  BufferedCloseFunc *close);
+
+#endif
diff --git a/vl.c b/vl.c
index d02194e..d89435a 100644
--- a/vl.c
+++ b/vl.c
@@ -6275,12 +6275,13 @@ typedef struct QEMUFileUnix
     FILE *outfile;
 } QEMUFileUnix;
 
-static void file_put_buffer(void *opaque, const uint8_t *buf,
-                            int64_t pos, int size)
+static int file_put_buffer(void *opaque, const uint8_t *buf,
+                           int64_t pos, int size)
 {
     QEMUFileUnix *s = opaque;
     fseek(s->outfile, pos, SEEK_SET);
     fwrite(buf, 1, size, s->outfile);
+    return size;
 }
 
 static int file_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
@@ -6328,11 +6329,12 @@ typedef struct QEMUFileBdrv
     int64_t base_offset;
 } QEMUFileBdrv;
 
-static void bdrv_put_buffer(void *opaque, const uint8_t *buf,
-                            int64_t pos, int size)
+static int bdrv_put_buffer(void *opaque, const uint8_t *buf,
+                           int64_t pos, int size)
 {
     QEMUFileBdrv *s = opaque;
     bdrv_pwrite(s->bs, s->base_offset + pos, buf, size);
+    return size;
 }
 
 static int bdrv_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 9/10] Introduce the UI components for live migration
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (7 preceding siblings ...)
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 8/10] Introduce a buffered QEMUFile wrapper Anthony Liguori
@ 2008-09-09 19:50 ` Anthony Liguori
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 10/10] TCP based " Anthony Liguori
  2008-09-11 12:13 ` [Qemu-devel] [PATCH 0/10] Live migration for QEMU Atsushi SAKAI
  10 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch introduces a command line parameter and monitor command for starting
a live migration.  The next patch will provide an example of how to use these
parameters.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 4b4cdd3..6bf5229 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -472,7 +472,8 @@ endif #CONFIG_DARWIN_USER
 # System emulator target
 ifndef CONFIG_USER_ONLY
 
-OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o migration.o
+OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o
+OBJS+=migration.o
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
diff --git a/migration.c b/migration.c
index 507c9d9..8a8b4a5 100644
--- a/migration.c
+++ b/migration.c
@@ -198,3 +198,12 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
     return s->file;
 }
 
+void qemu_start_incoming_migration(const char *uri)
+{
+    fprintf(stderr, "unknown migration protocol: %s\n", uri);
+}
+
+void do_migrate(const char *uri)
+{
+    fprintf(stderr, "unknown migration protocol: %s\n", uri);
+}
diff --git a/migration.h b/migration.h
index 3994fbb..2119a59 100644
--- a/migration.h
+++ b/migration.h
@@ -14,4 +14,8 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque, size_t xfer_limit,
                                   BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
                                   BufferedCloseFunc *close);
 
+void qemu_start_incoming_migration(const char *uri);
+
+void do_migrate(const char *uri);
+
 #endif
diff --git a/monitor.c b/monitor.c
index 76a2ddb..5b7a1c5 100644
--- a/monitor.c
+++ b/monitor.c
@@ -36,6 +36,7 @@
 #include "disas.h"
 #include <dirent.h>
 #include "qemu-timer.h"
+#include "migration.h"
 
 //#define DEBUG
 //#define DEBUG_COMPLETION
@@ -1449,6 +1450,7 @@ static term_cmd_t term_cmds[] = {
     { "nmi", "i", do_inject_nmi,
       "cpu", "inject an NMI on the given CPU", },
 #endif
+    { "migrate", "s", do_migrate, "uri", "migrate to URI" },
     { NULL, NULL, },
 };
 
diff --git a/qemu_socket.h b/qemu_socket.h
index 5229c24..a9009d5 100644
--- a/qemu_socket.h
+++ b/qemu_socket.h
@@ -27,5 +27,6 @@
 #endif /* !_WIN32 */
 
 void socket_set_nonblock(int fd);
+int parse_host_port(struct sockaddr_in *saddr, const char *str);
 
 #endif /* QEMU_SOCKET_H */
diff --git a/vl.c b/vl.c
index d89435a..5a7d0ed 100644
--- a/vl.c
+++ b/vl.c
@@ -37,6 +37,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "migration.h"
 
 #include <unistd.h>
 #include <fcntl.h>
@@ -3381,7 +3382,6 @@ static void udp_chr_update_read_handler(CharDriverState *chr)
     }
 }
 
-int parse_host_port(struct sockaddr_in *saddr, const char *str);
 #ifndef _WIN32
 static int parse_unix_path(struct sockaddr_un *uaddr, const char *str);
 #endif
@@ -8380,6 +8380,7 @@ enum {
     QEMU_OPTION_startdate,
     QEMU_OPTION_tb_size,
     QEMU_OPTION_icount,
+    QEMU_OPTION_incoming,
 };
 
 typedef struct QEMUOption {
@@ -8468,6 +8469,7 @@ const QEMUOption qemu_options[] = {
 #ifdef CONFIG_CURSES
     { "curses", 0, QEMU_OPTION_curses },
 #endif
+    { "incoming", HAS_ARG, QEMU_OPTION_incoming },
 
     /* temporary options */
     { "usb", 0, QEMU_OPTION_usb },
@@ -8734,6 +8736,7 @@ int main(int argc, char **argv)
     int tb_size;
     const char *pid_file = NULL;
     VLANState *vlan;
+    const char *incoming = NULL;
 
     LIST_INIT (&vm_change_state_head);
 #ifndef _WIN32
@@ -9342,6 +9345,9 @@ int main(int argc, char **argv)
                     icount_time_shift = strtol(optarg, NULL, 0);
                 }
                 break;
+            case QEMU_OPTION_incoming:
+                incoming = optarg;
+                break;
             }
         }
     }
@@ -9678,6 +9684,11 @@ int main(int argc, char **argv)
     if (loadvm)
         do_loadvm(loadvm);
 
+    if (incoming) {
+        autostart = 0; /* fixme how to deal with -daemonize */
+        qemu_start_incoming_migration(incoming);
+    }
+
     {
         /* XXX: simplify init */
         read_passwords();

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Qemu-devel] [PATCH 10/10] TCP based live migration
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (8 preceding siblings ...)
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 9/10] Introduce the UI components for live migration Anthony Liguori
@ 2008-09-09 19:50 ` Anthony Liguori
  2008-09-10 16:46   ` Blue Swirl
  2008-09-11 12:13 ` [Qemu-devel] [PATCH 0/10] Live migration for QEMU Atsushi SAKAI
  10 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 19:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

This patch introduces a tcp protocol for live migration.  It can be used as
follows:

qemu-system-x86_64 -hda ~/images/linux-test.img -monitor stdio
 <vm runs for a while>
(qemu) migrate tcp:localhost:1025

On the same system:

qemu-system-x86_64 -hda ~/images/linux-test.img -incoming tcp:localhost:1025

The monitor can be interacted with while waiting for an incoming live migration.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 6bf5229..0fa585c 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -473,7 +473,7 @@ endif #CONFIG_DARWIN_USER
 ifndef CONFIG_USER_ONLY
 
 OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o
-OBJS+=migration.o
+OBJS+=migration.o migration-tcp.o
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
diff --git a/migration-tcp.c b/migration-tcp.c
new file mode 100644
index 0000000..6b46bd4
--- /dev/null
+++ b/migration-tcp.c
@@ -0,0 +1,243 @@
+#include "qemu-common.h"
+#include "qemu_socket.h"
+#include "migration.h"
+#include "qemu-char.h"
+#include "sysemu.h"
+#include "console.h"
+
+/* FIXME resume monitor on error */
+
+
+typedef struct FdMigrationState
+{
+    QEMUFile *file;
+    int64_t bandwidth_limit;
+    int fd;
+} FdMigrationState;
+
+static void fd_put_notify(void *opaque)
+{
+    FdMigrationState *s = opaque;
+
+    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
+    qemu_file_put_notify(s->file);
+}
+
+static ssize_t fd_put_buffer(void *opaque, const void *data, size_t size)
+{
+    FdMigrationState *s = opaque;
+    ssize_t ret;
+
+    do {
+        ret = write(s->fd, data, size);
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret == -1)
+        ret = -errno;
+
+    if (ret == -EAGAIN)
+        qemu_set_fd_handler2(s->fd, NULL, NULL, fd_put_notify, s);
+
+    return ret;
+}
+
+static int fd_close(void *opaque)
+{
+    FdMigrationState *s = opaque;
+    return close(s->fd);
+}
+
+static void fd_wait_for_unfreeze(void *opaque)
+{
+    FdMigrationState *s = opaque;
+    int ret;
+
+    do {
+        fd_set wfds;
+
+        FD_ZERO(&wfds);
+        FD_SET(s->fd, &wfds);
+
+        ret = select(s->fd + 1, NULL, &wfds, NULL, NULL);
+    } while (ret == -1 && errno == EINTR);
+}
+
+static void fd_put_ready(void *opaque)
+{
+    FdMigrationState *s = opaque;
+
+    if (qemu_savevm_state_iterate(s->file) == 1) {
+        vm_stop(0);
+        qemu_savevm_state_complete(s->file);
+        qemu_fclose(s->file);
+        qemu_free(s);
+        monitor_resume();
+    }
+}
+
+static void tcp_connect_migrate(FdMigrationState *s)
+{
+    int ret;
+
+    s->file = qemu_fopen_ops_buffered(s,
+                                      s->bandwidth_limit,
+                                      fd_put_buffer,
+                                      fd_put_ready,
+                                      fd_wait_for_unfreeze,
+                                      fd_close);
+
+    ret = qemu_savevm_state_begin(s->file);
+    if (ret < 0) {
+        fprintf(stderr, "savevm failed %d\n", ret);
+        return;
+    }
+
+    monitor_suspend();
+
+    fd_put_ready(s);
+}
+
+static void tcp_wait_for_connect(void *opaque)
+{
+    FdMigrationState *s = opaque;
+    int val, ret;
+    int valsize = sizeof(val);
+
+    do {
+        ret = getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &val, &valsize);
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret < 0) {
+        fprintf(stderr, "Could not query connect success\n");
+        return;
+    }
+
+    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
+
+    if (val == 0)
+        tcp_connect_migrate(s);
+    else {
+        fprintf(stderr, "failed to connect to host\n");
+        close(s->fd);
+        qemu_free(s);
+    }
+}
+
+int tcp_start_outgoing_migration(const char *host_port, int64_t bandwidth_limit)
+{
+    struct sockaddr_in addr;
+    FdMigrationState *s;
+    int ret;
+
+    if (parse_host_port(&addr, host_port) < 0) {
+        fprintf(stderr, "invalid host/port combination: %s\n", host_port);
+        return -EINVAL;
+    }
+
+    s = qemu_mallocz(sizeof(*s));
+    if (s == NULL)
+        return -ENOMEM;
+
+    s->bandwidth_limit = bandwidth_limit;
+    s->fd = socket(PF_INET, SOCK_STREAM, 0);
+    if (s->fd == -1) {
+        qemu_free(s);
+        return -errno;
+    }
+
+    fcntl(s->fd, F_SETFL, O_NONBLOCK);
+
+    do {
+        ret = connect(s->fd, (struct sockaddr *)&addr, sizeof(addr));
+        if (ret == -1)
+            ret = -errno;
+
+        if (ret == -EINPROGRESS)
+            qemu_set_fd_handler2(s->fd, NULL, NULL, tcp_wait_for_connect, s);
+    } while (ret == -EINTR);
+
+    if (ret < 0 && ret != -EINPROGRESS) {
+        fprintf(stderr, "failed to connect to host\n");
+        close(s->fd);
+        qemu_free(s);
+    } else if (ret >= 0)
+        tcp_connect_migrate(s);
+
+    return 0;
+}
+
+static void tcp_accept_incoming_migration(void *opaque)
+{
+    struct sockaddr_in addr;
+    socklen_t addrlen = sizeof(addr);
+    int s = (unsigned long)opaque;
+    QEMUFile *f;
+    int c, ret;
+
+    do {
+        c = accept(s, (struct sockaddr *)&addr, &addrlen);
+    } while (c == -1 && errno == EINTR);
+
+    if (c == -1) {
+        fprintf(stderr, "could not accept migration connection\n");
+        return;
+    }
+
+    f = qemu_fopen_fd(c);
+    if (f == NULL) {
+        fprintf(stderr, "could not qemu_fopen socket\n");
+        goto out;
+    }
+
+    vm_stop(0); /* just in case */
+    ret = qemu_loadvm_state(f);
+    if (ret < 0) {
+        fprintf(stderr, "load of migration failed\n");
+        goto out_fopen;
+    }
+
+    /* we've successfully migrated, close the server socket */
+    qemu_set_fd_handler2(s, NULL, NULL, NULL, NULL);
+    close(s);
+
+    vm_start();
+
+out_fopen:
+    qemu_fclose(f);
+out:
+    close(c);
+}
+
+int tcp_start_incoming_migration(const char *host_port)
+{
+    struct sockaddr_in addr;
+    int val;
+    int s;
+
+    if (parse_host_port(&addr, host_port) < 0) {
+        fprintf(stderr, "invalid host/port combination: %s\n", host_port);
+        return -EINVAL;
+    }
+
+    s = socket(PF_INET, SOCK_STREAM, 0);
+    if (s == -1)
+        return -errno;
+
+    val = 1;
+    setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (const char *)&val, sizeof(val));
+
+    if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) == -1)
+        goto err;
+
+    if (listen(s, 1) == -1)
+        goto err;
+
+    qemu_set_fd_handler2(s, NULL, tcp_accept_incoming_migration, NULL,
+                         (void *)(unsigned long)s);
+
+    return 0;
+
+err:
+    close(s);
+    return -errno;
+}
diff --git a/migration.c b/migration.c
index 8a8b4a5..6023e02 100644
--- a/migration.c
+++ b/migration.c
@@ -200,10 +200,20 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
 
 void qemu_start_incoming_migration(const char *uri)
 {
-    fprintf(stderr, "unknown migration protocol: %s\n", uri);
+    const char *p;
+
+    if (strstart(uri, "tcp:", &p))
+        tcp_start_incoming_migration(p);
+    else
+        fprintf(stderr, "unknown migration protocol: %s\n", uri);
 }
 
 void do_migrate(const char *uri)
 {
-    fprintf(stderr, "unknown migration protocol: %s\n", uri);
+    const char *p;
+
+    if (strstart(uri, "tcp:", &p))
+        tcp_start_outgoing_migration(p, 20 << 20);
+    else
+        fprintf(stderr, "unknown migration protocol: %s\n", uri);
 }
diff --git a/migration.h b/migration.h
index 2119a59..314cc87 100644
--- a/migration.h
+++ b/migration.h
@@ -18,4 +18,8 @@ void qemu_start_incoming_migration(const char *uri);
 
 void do_migrate(const char *uri);
 
+int tcp_start_incoming_migration(const char *host_port);
+
+int tcp_start_outgoing_migration(const char *host_port, int64_t bandwidth_limit);
+
 #endif

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live"
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live" Anthony Liguori
@ 2008-09-09 22:25   ` Jamie Lokier
  2008-09-09 22:49     ` Anthony Liguori
  2008-09-10  7:17   ` Avi Kivity
  1 sibling, 1 reply; 57+ messages in thread
From: Jamie Lokier @ 2008-09-09 22:25 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
> This patch replaces the static memory savevm/loadvm handler with a
> "live" one.  This handler is used even if performing a non-live
> migration.

Excellent.  One of the annoyances of savevm currently is it pauses the
VM for a significant time, so you can't use it to snapshot production
systems being used.

> The key difference between this handler and the previous is that each page is
> prefixed with the address of the page.  The QEMUFile rate limiting code, in
> combination with the live migration dirty tracking bits, is used to determine
> which pages should be sent and how many should be sent.
> 
> The live save code "converges" when the number of dirty pages
> reaches a fixed amount.  Currently, this is 10 pages.  This is
> something that should eventually be derived from whatever the
> bandwidth limitation is.

Does this mean that a snapshot could record the same page many times,
perhaps even unbounded, while the guest is dirtying pages at a high
rate?  Or is the guest dirtying rate limited too to ensure the file
writer will converge in bounded time?

Thanks,
-- Jamie

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live"
  2008-09-09 22:25   ` Jamie Lokier
@ 2008-09-09 22:49     ` Anthony Liguori
  0 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-09 22:49 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>> This patch replaces the static memory savevm/loadvm handler with a
>> "live" one.  This handler is used even if performing a non-live
>> migration.
>>     
>
> Excellent.  One of the annoyances of savevm currently is it pauses the
> VM for a significant time, so you can't use it to snapshot production
> systems being used.
>   

qcow2 needs some modification to allow this, but yeah, that's on my 
todo.  When you do a savevm today, you write everything to a chunk of 
qcow2 file (presumably at the end).  The only thing keeping others from 
allocating over you is that you're essentially holding the big qemu lock 
(because we're single threaded).  With an asynchronous savevm, this no 
longer holds.  So what we really need to do, is let snapshots chain 
within a qcow2 file.  We can then write chunks of savevm data at a time 
and chain the chunks together.

Shouldn't be that hard and should be reasonable to do in a backwards 
compatible way.

>> The key difference between this handler and the previous is that each page is
>> prefixed with the address of the page.  The QEMUFile rate limiting code, in
>> combination with the live migration dirty tracking bits, is used to determine
>> which pages should be sent and how many should be sent.
>>
>> The live save code "converges" when the number of dirty pages
>> reaches a fixed amount.  Currently, this is 10 pages.  This is
>> something that should eventually be derived from whatever the
>> bandwidth limitation is.
>>     
>
> Does this mean that a snapshot could record the same page many times,
> perhaps even unbounded, while the guest is dirtying pages at a high
> rate?  Or is the guest dirtying rate limited too to ensure the file
> writer will converge in bounded time?
>   

With synchronous savevm (non-live), it's all deterministic.  Everything 
starts out dirty and nothing will get dirtied again because the guest 
isn't running.  With asynchronous savevm, it's indeterministic.

In general, you can't avoid the indeterminism.  In practice, you usually 
converge quickly so simply having a max iterations where if you exceed, 
you stop the guest and revert to a synchronous savevm is completely 
reasonable.

The other options would be to fail after a certain number of iterations 
or just completely punt to the management tools and provide a mechanism 
to cancel an existing live migration if it takes too long.  This 
functionality exists in KVM, I simple need to add it to this patch 
series.  It's quite simple really.

Regards,

Anthony Liguori

> Thanks,
> -- Jamie
>   

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op Anthony Liguori
@ 2008-09-10  6:52   ` Avi Kivity
  2008-09-10 10:05     ` Daniel P. Berrange
                       ` (2 more replies)
  2008-09-10 10:01   ` Daniel P. Berrange
  1 sibling, 3 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-10  6:52 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
> Live migration happens in the background, but it is useful to make the monitor
> command appear as if it's blocking.  This allows a management tool to
> immediately know when the live migration has completed without having to poll
> the migration status.
>
> This patch allows the monitor to be suspended from a monitor callback which
> will prevent new monitor commands from being executed.
>
>   

This means that migration is no longer transparent.  While migration is 
going on, you can't change the cdrom media, look at cpu registers, or do 
anything that requires the monitor.

This both reduces the functionality and complicates management 
applications.  IMO migration should have asynchronous notification (and 
no, I don't think multiple monitors is the correct solution).

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 6/10] Introduce v3 of savevm protocol
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 6/10] Introduce v3 of savevm protocol Anthony Liguori
@ 2008-09-10  7:09   ` Avi Kivity
  0 siblings, 0 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-10  7:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

iAnthony Liguori wrote:
> The current savevm/loadvm protocol has some draw backs.  It does not support
> the ability to do progressive saving which means it cannot be used for live
> checkpointing or migration.  The sections sizes are 32-bit integers which
> means that it will not function when using more than 4GB of memory for a guest.
> It attempts to seek within the output file which means it cannot be streamed.
> The current protocol also is pretty lax about how it supports forward
> compatibility.  If a saved section version is greater than what the restore
> code support, the restore code generally treats the saved data as being in
> whatever version it supports.  This means that restoring a saved VM on an older
> version of QEMU will likely result in silent guest failure.
>
> This patch introduces a new version of the savevm protocol.  It has the
> following features:
>
>  * Support for progressive save of sections (for live checkpoint/migration)
>  * An asynchronous API for doing save
>  * Support for interleaving multiple progressive save sections
>    (for future support of memory hot-add/storage migration)
>  * Fully streaming format
>  * Strong section version checking
>
> Right now, the code is missing to support restoring v2 images.
>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>
>   

> +int qemu_savevm_state_iterate(QEMUFile *f)
> +{
> +    SaveStateEntry *se;
> +    int ret = 0;
> +
> +    for (se = first_se; se != NULL; se = se->next) {
> +        if (se->save_live_state == NULL)
> +            continue;
> +
> +        /* Section type */
> +        qemu_put_byte(f, QEMU_VM_SECTION_PART);
> +        qemu_put_be32(f, se->section_id);
> +
> +        ret |= se->save_live_state(f, QEMU_VM_SECTION_PART, se->opaque);
>   

What if the callback returns an error?

> +    }
> +
> +    if (ret)
> +        return 1;
> +
> +    return 0;
> +}
> +
>   

An alternative solution that doesn't involve iterating over the saveset 
over and over again involves providing a queue of uncompleted state 
saves.  If a save handler has more work to be done, it queues a 
continuation and returns.  The queue is primed by _begin().  
qemu_savevm_state_iterate() would simply attempt to drain the queue.  I 
don't think it's a significant improvement though.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live"
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live" Anthony Liguori
  2008-09-09 22:25   ` Jamie Lokier
@ 2008-09-10  7:17   ` Avi Kivity
  2008-09-10 13:10     ` Anthony Liguori
  1 sibling, 1 reply; 57+ messages in thread
From: Avi Kivity @ 2008-09-10  7:17 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
> This patch replaces the static memory savevm/loadvm handler with a "live" one.
> This handler is used even if performing a non-live migration.
>
> The key difference between this handler and the previous is that each page is
> prefixed with the address of the page.  The QEMUFile rate limiting code, in
> combination with the live migration dirty tracking bits, is used to determine
> which pages should be sent and how many should be sent.
>
> The live save code "converges" when the number of dirty pages reaches a fixed
> amount.  Currently, this is 10 pages.  This is something that should eventually
> be derived from whatever the bandwidth limitation is.
>
> +
> +static int ram_save_block(QEMUFile *f)
> +{
> +    static ram_addr_t current_addr = 0;
> +    ram_addr_t saved_addr = current_addr;
> +    ram_addr_t addr = 0;
> +    int found = 0;
> +
> +    while (addr < phys_ram_size) {
> +        if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) {
> +            uint8_t ch;
> +
> +            cpu_physical_memory_reset_dirty(current_addr,
> +                                            current_addr + TARGET_PAGE_SIZE,
> +                                            MIGRATION_DIRTY_FLAG);
> +
> +            ch = *(phys_ram_base + current_addr);
>   

Looks like you're using qemu ram addresses.  The problem with these is 
that they have no stable meaning.  Switching the initialization order of 
vga and memory would break compatibility.

We should separate RAM saving according to the owners of the RAM 
blocks.  For example vga would be responsible for moving the framebuffer 
(which has no stable hardware address, either), and something else would 
be responsible for migrating RAM.  Of course both would call into common 
code.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op Anthony Liguori
  2008-09-10  6:52   ` Avi Kivity
@ 2008-09-10 10:01   ` Daniel P. Berrange
  2008-09-10 13:11     ` Anthony Liguori
  1 sibling, 1 reply; 57+ messages in thread
From: Daniel P. Berrange @ 2008-09-10 10:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On Tue, Sep 09, 2008 at 02:49:54PM -0500, Anthony Liguori wrote:
> Live migration happens in the background, but it is useful to make the monitor
> command appear as if it's blocking.  This allows a management tool to
> immediately know when the live migration has completed without having to poll
> the migration status.
> 
> This patch allows the monitor to be suspended from a monitor callback which
> will prevent new monitor commands from being executed.

If I'm understanding this correctly, this will cause the monitor to silently
drop & ignore any commands issued ? Might it be better to have it print a
reply on the monitor along the lines of 'command not allowed while migration
is in progress' so people/apps interacting with the monitor understand why 
it is not doing what they ask it to.

Daniel
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10  6:52   ` Avi Kivity
@ 2008-09-10 10:05     ` Daniel P. Berrange
  2008-09-10 11:11       ` Avi Kivity
  2008-09-10 13:07     ` Anthony Liguori
  2008-09-10 13:26     ` Chris Lalancette
  2 siblings, 1 reply; 57+ messages in thread
From: Daniel P. Berrange @ 2008-09-10 10:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

On Wed, Sep 10, 2008 at 09:52:33AM +0300, Avi Kivity wrote:
> Anthony Liguori wrote:
> >Live migration happens in the background, but it is useful to make the 
> >monitor
> >command appear as if it's blocking.  This allows a management tool to
> >immediately know when the live migration has completed without having to 
> >poll
> >the migration status.
> >
> >This patch allows the monitor to be suspended from a monitor callback which
> >will prevent new monitor commands from being executed.
> >
> >  
> 
> This means that migration is no longer transparent.  While migration is 
> going on, you can't change the cdrom media, look at cpu registers, or do 
> anything that requires the monitor.

Changing cdrom media while in the middle of migration sounds like a rather
troubleprone thing todo - you'd need to change the media in both active
QEMU instances at the same time to be safe. Keeping access to the readonly
"info" family of commands though would be nice, and perhaps one or two
other commands known to be 'safe' - cont/stop would be two obvious ones

> This both reduces the functionality and complicates management 
> applications.  IMO migration should have asynchronous notification (and 
> no, I don't think multiple monitors is the correct solution).

Even better than async notification would be something like 'info migration'
to get stats on progress, so they could give user some indication of the
progress made migrating - particularly for VMs with large amounts of RAM
which may take non-negligable time to transfer. Once you're polling to get
migration progress info, there's no immediate need for async notification
since you just wait till it gets to '100%' or some other completion state

Daniel
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 10:05     ` Daniel P. Berrange
@ 2008-09-10 11:11       ` Avi Kivity
  2008-09-10 11:14         ` Daniel P. Berrange
  2008-09-10 15:58         ` Jamie Lokier
  0 siblings, 2 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-10 11:11 UTC (permalink / raw)
  To: Daniel P. Berrange
  Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

Daniel P. Berrange wrote:

>> This means that migration is no longer transparent.  While migration is 
>> going on, you can't change the cdrom media, look at cpu registers, or do 
>> anything that requires the monitor.
>>     
>
> Changing cdrom media while in the middle of migration sounds like a rather
> troubleprone thing todo - you'd need to change the media in both active
> QEMU instances at the same time to be safe.

Or rather, such state should be part of the migration.  There's the 
question of whether to transform the path on the target, but "which 
media is in the drive" is part of the hardware state.

(logically we would copy all of the data of all block devices, but 
that's not very practical, so we assume shared storage).

What other commands are unsafe during migration? I exclude host device 
assignment which is obviously migration unfriendly.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 11:11       ` Avi Kivity
@ 2008-09-10 11:14         ` Daniel P. Berrange
  2008-09-10 15:36           ` Avi Kivity
  2008-09-10 15:58         ` Jamie Lokier
  1 sibling, 1 reply; 57+ messages in thread
From: Daniel P. Berrange @ 2008-09-10 11:14 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

On Wed, Sep 10, 2008 at 02:11:56PM +0300, Avi Kivity wrote:
> Daniel P. Berrange wrote:
> 
> >>This means that migration is no longer transparent.  While migration is 
> >>going on, you can't change the cdrom media, look at cpu registers, or do 
> >>anything that requires the monitor.
> >>    
> >
> >Changing cdrom media while in the middle of migration sounds like a rather
> >troubleprone thing todo - you'd need to change the media in both active
> >QEMU instances at the same time to be safe.
> 
> Or rather, such state should be part of the migration.  There's the 
> question of whether to transform the path on the target, but "which 
> media is in the drive" is part of the hardware state.
> 
> (logically we would copy all of the data of all block devices, but 
> that's not very practical, so we assume shared storage).
> 
> What other commands are unsafe during migration? I exclude host device 
> assignment which is obviously migration unfriendly.

USB + virtio device attach/detach - well at least have the same need as
media change - you'd need to propagate the change to the other side in
some way. Currently migrate just assumes the management tool/admin has
started QEMU on the destinations with the matching hardware config, and
for libvirt to use the monitor to add USB /virtio devices at both ends
has the race condition/synchronization problem .

Daniel
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10  6:52   ` Avi Kivity
  2008-09-10 10:05     ` Daniel P. Berrange
@ 2008-09-10 13:07     ` Anthony Liguori
  2008-09-10 13:26     ` Chris Lalancette
  2 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 13:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Live migration happens in the background, but it is useful to make 
>> the monitor
>> command appear as if it's blocking.  This allows a management tool to
>> immediately know when the live migration has completed without having 
>> to poll
>> the migration status.
>>
>> This patch allows the monitor to be suspended from a monitor callback 
>> which
>> will prevent new monitor commands from being executed.
>>
>>   
>
> This means that migration is no longer transparent.  While migration 
> is going on, you can't change the cdrom media, look at cpu registers, 
> or do anything that requires the monitor.

On my TODO list is adding a '-d' option like we have in the KVM tree.  
It'll be there before it gets committed.

Regards,

Anthony Liguori

> This both reduces the functionality and complicates management 
> applications.  IMO migration should have asynchronous notification 
> (and no, I don't think multiple monitors is the correct solution).
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live"
  2008-09-10  7:17   ` Avi Kivity
@ 2008-09-10 13:10     ` Anthony Liguori
  0 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 13:10 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>> This patch replaces the static memory savevm/loadvm handler with a 
>> "live" one.
>> This handler is used even if performing a non-live migration.
>>
>> The key difference between this handler and the previous is that each 
>> page is
>> prefixed with the address of the page.  The QEMUFile rate limiting 
>> code, in
>> combination with the live migration dirty tracking bits, is used to 
>> determine
>> which pages should be sent and how many should be sent.
>>
>> The live save code "converges" when the number of dirty pages reaches 
>> a fixed
>> amount.  Currently, this is 10 pages.  This is something that should 
>> eventually
>> be derived from whatever the bandwidth limitation is.
>>
>> +
>> +static int ram_save_block(QEMUFile *f)
>> +{
>> +    static ram_addr_t current_addr = 0;
>> +    ram_addr_t saved_addr = current_addr;
>> +    ram_addr_t addr = 0;
>> +    int found = 0;
>> +
>> +    while (addr < phys_ram_size) {
>> +        if (cpu_physical_memory_get_dirty(current_addr, 
>> MIGRATION_DIRTY_FLAG)) {
>> +            uint8_t ch;
>> +
>> +            cpu_physical_memory_reset_dirty(current_addr,
>> +                                            current_addr + 
>> TARGET_PAGE_SIZE,
>> +                                            MIGRATION_DIRTY_FLAG);
>> +
>> +            ch = *(phys_ram_base + current_addr);
>>   
>
> Looks like you're using qemu ram addresses.  The problem with these is 
> that they have no stable meaning.  Switching the initialization order 
> of vga and memory would break compatibility.

This is true, but keep in mind, the current save/restore code saves the 
whole chunk of qemu ram.  So switching the initialization order of vga 
and memory would currently break save/restore compatibility.

I agree we should try to solve this.

Regards,

Anthony Liguori

> We should separate RAM saving according to the owners of the RAM 
> blocks.  For example vga would be responsible for moving the 
> framebuffer (which has no stable hardware address, either), and 
> something else would be responsible for migrating RAM.  Of course both 
> would call into common code.
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 10:01   ` Daniel P. Berrange
@ 2008-09-10 13:11     ` Anthony Liguori
  0 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 13:11 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Daniel P. Berrange wrote:
> On Tue, Sep 09, 2008 at 02:49:54PM -0500, Anthony Liguori wrote:
>   
>> Live migration happens in the background, but it is useful to make the monitor
>> command appear as if it's blocking.  This allows a management tool to
>> immediately know when the live migration has completed without having to poll
>> the migration status.
>>
>> This patch allows the monitor to be suspended from a monitor callback which
>> will prevent new monitor commands from being executed.
>>     
>
> If I'm understanding this correctly, this will cause the monitor to silently
> drop & ignore any commands issued ?

No.  It's just like if a monitor command took a really long time.

Regards,

Anthony Liguori

>  Might it be better to have it print a
> reply on the monitor along the lines of 'command not allowed while migration
> is in progress' so people/apps interacting with the monitor understand why 
> it is not doing what they ask it to.
>
> Daniel
>   

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
@ 2008-09-10 13:25   ` Chris Lalancette
  2008-09-10 14:38   ` [Qemu-devel] " Glauber Costa
  2008-09-12 15:40   ` [Qemu-devel] " Blue Swirl
  2 siblings, 0 replies; 57+ messages in thread
From: Chris Lalancette @ 2008-09-10 13:25 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
<snip>
>  void qemu_fflush(QEMUFile *f)
>  {
> -    if (!f->is_writable)
> +    if (!f->put_buffer)
>          return;
> +
>      if (f->buf_index > 0) {
> -        if (f->is_file) {
> -            fseek(f->outfile, f->buf_offset, SEEK_SET);
> -            fwrite(f->buf, 1, f->buf_index, f->outfile);
> -        } else {
> -            bdrv_pwrite(f->bs, f->base_offset + f->buf_offset,
> -                        f->buf, f->buf_index);
> -        }
> +	f->put_buffer(f->opaque, f->buf, f->buf_offset, f->buf_index);

Nit...whitespace damage.

...

Overall, seems to be a good refactoring, and seems to keep the original
semantics of qemu_fopen_bdrv() and qemu_fopen().

Chris Lalancette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10  6:52   ` Avi Kivity
  2008-09-10 10:05     ` Daniel P. Berrange
  2008-09-10 13:07     ` Anthony Liguori
@ 2008-09-10 13:26     ` Chris Lalancette
  2 siblings, 0 replies; 57+ messages in thread
From: Chris Lalancette @ 2008-09-10 13:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Live migration happens in the background, but it is useful to make the monitor
>> command appear as if it's blocking.  This allows a management tool to
>> immediately know when the live migration has completed without having to poll
>> the migration status.
>>
>> This patch allows the monitor to be suspended from a monitor callback which
>> will prevent new monitor commands from being executed.
>>
>>   
> 
> This means that migration is no longer transparent.  While migration is 
> going on, you can't change the cdrom media, look at cpu registers, or do 
> anything that requires the monitor.
> 
> This both reduces the functionality and complicates management 
> applications.  IMO migration should have asynchronous notification (and 
> no, I don't think multiple monitors is the correct solution).
> 

Despite Avi's objections, I'm OK with this, as long as the user of the monitor
has an option.  That is, if the user can pass some switch to say "don't suspend
the monitor while I'm migrating", then that's fine.  Actual people at the
console will pass that switch, and the management tools won't (or vice-versa; I
don't really care what the default is).  This is really no different from the
situation we currently have in KVM, where you pass -d to do it in the
background, and pass no options to do it in the foreground.

(looking ahead into patch 10/10, it doesn't look like there is currently an
option, but we should add one).

Chris Lalancette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all() Anthony Liguori
@ 2008-09-10 13:26   ` Chris Lalancette
  2008-09-10 14:46     ` Glauber Costa
  2008-09-12 15:43   ` Blue Swirl
  1 sibling, 1 reply; 57+ messages in thread
From: Chris Lalancette @ 2008-09-10 13:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
> This patch adds a bdrv_flush_all() function.  It's necessary to ensure that all
> IO operations have been flushed to disk before completely a live migration.
> 
> N.B. we don't actually use this now.  We really should flush the block drivers
> using an live savevm callback to avoid unnecessary guest down time.

Simple enough, and follows the pattern in the KVM migration.

Chris Lalancette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 5/10] Add network announce function
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 5/10] Add network announce function Anthony Liguori
@ 2008-09-10 13:27   ` Chris Lalancette
  2008-09-10 13:54     ` Anthony Liguori
  2008-09-10 14:00     ` Avi Kivity
  0 siblings, 2 replies; 57+ messages in thread
From: Chris Lalancette @ 2008-09-10 13:27 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
> +static int announce_self_create(uint8_t *buf, 
> +				uint8_t *mac_addr)
> +{
> +    uint32_t magic = EXPERIMENTAL_MAGIC;
> +    uint16_t proto = htons(ETH_P_EXPERIMENTAL);
> +
> +    /* FIXME: should we send a different packet (arp/rarp/ping)? */
> +
> +    memset(buf, 0xff, 6);         /* h_dst */
> +    memcpy(buf + 6, mac_addr, 6); /* h_src */
> +    memcpy(buf + 12, &proto, 2);  /* h_proto */
> +    memcpy(buf + 14, &magic, 4);  /* magic */
> +
> +    return 18; /* len */
> +}
> +
> +void qemu_announce_self(void)
> +{
> +    int i, j, len;
> +    VLANState *vlan;
> +    VLANClientState *vc;
> +    uint8_t buf[256];
> +
> +    for (i = 0; i < nb_nics; i++) {
> +        len = announce_self_create(buf, nd_table[i].macaddr);
> +        vlan = nd_table[i].vlan;
> +        for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
> +            if (vc->fd_read == tap_receive)  /* send only if tap */
> +                for (j=0; j < SELF_ANNOUNCE_ROUNDS; j++)
> +                    vc->fd_read(vc->opaque, buf, len);
> +        }
> +    }
> +}
> +

This one is yucky, as the FIXME points out.  First, I'm guessing the point of
this is to do an ARP poison on the switch?  If so, we probably want to do some
kind of ARP packet, although I'm not an expert on this.  Also, why are we trying
 SELF_ANNOUNCE_ROUNDS times?  Is there some issue where some of these get
dropped, or is it just being safe about it?

Chris Lalancette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 5/10] Add network announce function
  2008-09-10 13:27   ` Chris Lalancette
@ 2008-09-10 13:54     ` Anthony Liguori
  2008-09-10 14:00     ` Avi Kivity
  1 sibling, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 13:54 UTC (permalink / raw)
  To: Chris Lalancette; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Chris Lalancette wrote:
> Anthony Liguori wrote:
>   
>> +static int announce_self_create(uint8_t *buf, 
>> +				uint8_t *mac_addr)
>> +{
>> +    uint32_t magic = EXPERIMENTAL_MAGIC;
>> +    uint16_t proto = htons(ETH_P_EXPERIMENTAL);
>> +
>> +    /* FIXME: should we send a different packet (arp/rarp/ping)? */
>> +
>> +    memset(buf, 0xff, 6);         /* h_dst */
>> +    memcpy(buf + 6, mac_addr, 6); /* h_src */
>> +    memcpy(buf + 12, &proto, 2);  /* h_proto */
>> +    memcpy(buf + 14, &magic, 4);  /* magic */
>> +
>> +    return 18; /* len */
>> +}
>> +
>> +void qemu_announce_self(void)
>> +{
>> +    int i, j, len;
>> +    VLANState *vlan;
>> +    VLANClientState *vc;
>> +    uint8_t buf[256];
>> +
>> +    for (i = 0; i < nb_nics; i++) {
>> +        len = announce_self_create(buf, nd_table[i].macaddr);
>> +        vlan = nd_table[i].vlan;
>> +        for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
>> +            if (vc->fd_read == tap_receive)  /* send only if tap */
>> +                for (j=0; j < SELF_ANNOUNCE_ROUNDS; j++)
>> +                    vc->fd_read(vc->opaque, buf, len);
>> +        }
>> +    }
>> +}
>> +
>>     
>
> This one is yucky, as the FIXME points out.  First, I'm guessing the point of
> this is to do an ARP poison on the switch?  If so, we probably want to do some
> kind of ARP packet, although I'm not an expert on this.  Also, why are we trying
>  SELF_ANNOUNCE_ROUNDS times?  Is there some issue where some of these get
> dropped, or is it just being safe about it?
>   

I didn't write this code originally.  Perhaps Uri or Avi know who did 
and can provide some insight?

Regards,

Anthony Liguori

> Chris Lalancette
>
>   

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 5/10] Add network announce function
  2008-09-10 13:27   ` Chris Lalancette
  2008-09-10 13:54     ` Anthony Liguori
@ 2008-09-10 14:00     ` Avi Kivity
  1 sibling, 0 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-10 14:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Chris Lalancette wrote:
> Anthony Liguori wrote:
>   
>> +static int announce_self_create(uint8_t *buf, 
>> +				uint8_t *mac_addr)
>> +{
>> +    uint32_t magic = EXPERIMENTAL_MAGIC;
>> +    uint16_t proto = htons(ETH_P_EXPERIMENTAL);
>> +
>> +    /* FIXME: should we send a different packet (arp/rarp/ping)? */
>> +
>> +    memset(buf, 0xff, 6);         /* h_dst */
>> +    memcpy(buf + 6, mac_addr, 6); /* h_src */
>> +    memcpy(buf + 12, &proto, 2);  /* h_proto */
>> +    memcpy(buf + 14, &magic, 4);  /* magic */
>> +
>> +    return 18; /* len */
>> +}
>> +
>> +void qemu_announce_self(void)
>> +{
>> +    int i, j, len;
>> +    VLANState *vlan;
>> +    VLANClientState *vc;
>> +    uint8_t buf[256];
>> +
>> +    for (i = 0; i < nb_nics; i++) {
>> +        len = announce_self_create(buf, nd_table[i].macaddr);
>> +        vlan = nd_table[i].vlan;
>> +        for(vc = vlan->first_client; vc != NULL; vc = vc->next) {
>> +            if (vc->fd_read == tap_receive)  /* send only if tap */
>> +                for (j=0; j < SELF_ANNOUNCE_ROUNDS; j++)
>> +                    vc->fd_read(vc->opaque, buf, len);
>> +        }
>> +    }
>> +}
>> +
>>     
>
> This one is yucky, as the FIXME points out.  First, I'm guessing the point of
> this is to do an ARP poison on the switch?  If so, we probably want to do some
> kind of ARP packet, although I'm not an expert on this.

ARP is to announce an IP address moving to a different MAC address.  
Here the MAC address stays the same, but we want to announce to the 
switch that the MAC address is moving to a different port.

There may not even be an IP stack running in the guest, and we don't 
know the IP address(es) the guest is using, so I don't see how ARP fits 
here.

Since switches learn the MAC address to port association, all we need is 
to send _some_ packet.

>   Also, why are we trying
>  SELF_ANNOUNCE_ROUNDS times?  Is there some issue where some of these get
> dropped, or is it just being safe about it

Some switches drop the first packet; and if those switches are chained, 
we need to send multiple packets.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [Qemu-devel] Re: [PATCH 1/10] Refactor QEMUFile for live migration
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
  2008-09-10 13:25   ` Chris Lalancette
@ 2008-09-10 14:38   ` Glauber Costa
  2008-09-10 15:05     ` Avi Kivity
  2008-09-10 15:16     ` Anthony Liguori
  2008-09-12 15:40   ` [Qemu-devel] " Blue Swirl
  2 siblings, 2 replies; 57+ messages in thread
From: Glauber Costa @ 2008-09-10 14:38 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

On Tue, Sep 9, 2008 at 4:49 PM, Anthony Liguori <aliguori@us.ibm.com> wrote:
> To support live migration, we override QEMUFile so that instead of writing to
> disk, the save/restore state happens over a network connection.
>
> This patch makes QEMUFile read/write operations function pointers so that we
> can override them for live migration.
>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>
> diff --git a/hw/hw.h b/hw/hw.h
> index b84ace0..771dbd4 100644
> --- a/hw/hw.h
> +++ b/hw/hw.h
> @@ -7,9 +7,36 @@
>
>  /* VM Load/Save */
>
> +/* This function writes a chunk of data to a file at the given position.
> + * The pos argument can be ignored if the file is only being used for
> + * streaming.  The handler should try to write all of the data it can.
> + */
> +typedef void (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
> +                                     int64_t pos, int size);
> +
> +/* Read a chunk of data from a file at the given position.  The pos argument
> + * can be ignored if the file is only be used for streaming.  The number of
> + * bytes actually read should be returned.
> + */
> +typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
> +                                    int64_t pos, int size);
> +
> +/* Close a file and return an error code */
> +typedef int (QEMUFileCloseFunc)(void *opaque);
> +
> +/* Called to determine if the file has exceeded it's bandwidth allocation.  The
> + * bandwidth capping is a soft limit, not a hard limit.
> + */
> +typedef int (QEMUFileRateLimit)(void *opaque);
> +
> +QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
> +                        QEMUFileGetBufferFunc *get_buffer,
> +                        QEMUFileCloseFunc *close,
> +                         QEMUFileRateLimit *rate_limit);
>  QEMUFile *qemu_fopen(const char *filename, const char *mode);
> +QEMUFile *qemu_fopen_fd(int fd);
>  void qemu_fflush(QEMUFile *f);
> -void qemu_fclose(QEMUFile *f);
> +int qemu_fclose(QEMUFile *f);
>  void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
>  void qemu_put_byte(QEMUFile *f, int v);
>  void qemu_put_be16(QEMUFile *f, unsigned int v);
> @@ -20,6 +47,12 @@ int qemu_get_byte(QEMUFile *f);
>  unsigned int qemu_get_be16(QEMUFile *f);
>  unsigned int qemu_get_be32(QEMUFile *f);
>  uint64_t qemu_get_be64(QEMUFile *f);
> +int qemu_file_rate_limit(QEMUFile *f);
> +
> +/* Try to send any outstanding data.  This function is useful when output is
> + * halted due to rate limiting or EAGAIN errors occur as it can be used to
> + * resume output. */
> +void qemu_file_put_notify(QEMUFile *f);
>
>  static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
>  {
> diff --git a/vl.c b/vl.c
> index 0a457a9..659fd95 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -6152,11 +6152,12 @@ void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
>  #define IO_BUF_SIZE 32768
>
>  struct QEMUFile {
> -    FILE *outfile;
> -    BlockDriverState *bs;
> -    int is_file;
> -    int is_writable;
> -    int64_t base_offset;
> +    QEMUFilePutBufferFunc *put_buffer;
> +    QEMUFileGetBufferFunc *get_buffer;
> +    QEMUFileCloseFunc *close;
> +    QEMUFileRateLimit *rate_limit;
> +    void *opaque;
> +
>     int64_t buf_offset; /* start of buffer when writing, end of buffer
>                            when reading */
>     int buf_index;
> @@ -6164,58 +6165,194 @@ struct QEMUFile {
>     uint8_t buf[IO_BUF_SIZE];
>  };
>
> +typedef struct QEMUFileFD
> +{
> +    int fd;
> +    QEMUFile *file;
> +} QEMUFileFD;
> +
> +static void fd_put_notify(void *opaque)
> +{
> +    QEMUFileFD *s = opaque;
> +
> +    /* Remove writable callback and do a put notify */
> +    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
> +    qemu_file_put_notify(s->file);
> +}
> +
> +static int fd_put_buffer(void *opaque, const uint8_t *buf,
> +                         int64_t pos, int size)
> +{
> +    QEMUFileFD *s = opaque;
> +    ssize_t len;
> +
> +    do {
> +        len = write(s->fd, buf, size);
> +    } while (len == -1 && errno == EINTR);

What about the len == size case ?

> +static int fd_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFileFD *s = opaque;
> +    ssize_t len;
> +
> +    do {
> +        len = read(s->fd, buf, size);
> +    } while (len == -1 && errno == EINTR);

ditto.

> +
> +    if (len == -1)
> +        len = -errno;
> +
> +    return len;
> +}
nitpicking, but... maybe "ret" is a better name here?

> +
> +static int fd_close(void *opaque)
> +{
> +    QEMUFileFD *s = opaque;
> +    qemu_free(s);
> +    return 0;
> +}
Why don't we need to do any specific callback for closing the file descriptor?
In the case of a socket, I imagine we may want to shut the socket down, for ex.

> +
> +QEMUFile *qemu_fopen_fd(int fd)
> +{
> +    QEMUFileFD *s = qemu_mallocz(sizeof(QEMUFileFD));

can't it fail?
> +    s->fd = fd;
> +    s->file = qemu_fopen_ops(s, fd_put_buffer, fd_get_buffer, fd_close, NULL);
> +    return s->file;
> +}


> +typedef struct QEMUFileUnix
> +{
> +    FILE *outfile;
> +} QEMUFileUnix;
> +
> +static void file_put_buffer(void *opaque, const uint8_t *buf,
> +                            int64_t pos, int size)
> +{
> +    QEMUFileUnix *s = opaque;
> +    fseek(s->outfile, pos, SEEK_SET);
> +    fwrite(buf, 1, size, s->outfile);
> +}
> +
> +static int file_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFileUnix *s = opaque;
> +    fseek(s->outfile, pos, SEEK_SET);
> +    return fread(buf, 1, size, s->outfile);
> +}
> +
> +static int file_close(void *opaque)
> +{
> +    QEMUFileUnix *s = opaque;
> +    fclose(s->outfile);
> +    qemu_free(s);
> +    return 0;
> +}
> +
>  QEMUFile *qemu_fopen(const char *filename, const char *mode)
>  {
> -    QEMUFile *f;
> +    QEMUFileUnix *s;
>
> -    f = qemu_mallocz(sizeof(QEMUFile));
> -    if (!f)
> +    s = qemu_mallocz(sizeof(QEMUFileUnix));
> +    if (!s)
>         return NULL;
> -    if (!strcmp(mode, "wb")) {
> -        f->is_writable = 1;
> -    } else if (!strcmp(mode, "rb")) {
> -        f->is_writable = 0;
> -    } else {
> -        goto fail;
> -    }
> -    f->outfile = fopen(filename, mode);
> -    if (!f->outfile)
> +
> +    s->outfile = fopen(filename, mode);
> +    if (!s->outfile)
>         goto fail;
> -    f->is_file = 1;
> -    return f;
> - fail:
> -    if (f->outfile)
> -        fclose(f->outfile);
> -    qemu_free(f);
> +
> +    if (!strcmp(mode, "wb"))
> +       return qemu_fopen_ops(s, file_put_buffer, NULL, file_close, NULL);
> +    else if (!strcmp(mode, "rb"))
> +       return qemu_fopen_ops(s, NULL, file_get_buffer, file_close, NULL);
> +
> +fail:
> +    if (s->outfile)
> +        fclose(s->outfile);
> +    qemu_free(s);
>     return NULL;
>  }
>
> -static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int64_t offset, int is_writable)
> +typedef struct QEMUFileBdrv
> +{
> +    BlockDriverState *bs;
> +    int64_t base_offset;
> +} QEMUFileBdrv;

Isn't it possible to abstract the differences between bdrv and file so
to have a common implementation
between them? Do you think it's worthwhile ?

> +
> +QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int64_t offset, int is_writable)
> +{
> +    QEMUFileBdrv *s;
> +
> +    s = qemu_mallocz(sizeof(QEMUFileBdrv));
> +    if (!s)
> +        return NULL;
> +
> +    s->bs = bs;
> +    s->base_offset = offset;
> +
> +    if (is_writable)
> +       return qemu_fopen_ops(s, bdrv_put_buffer, NULL, bdrv_fclose, NULL);
> +
> +    return qemu_fopen_ops(s, NULL, bdrv_get_buffer, bdrv_fclose, NULL);
> +}
> +
> +QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
> +                        QEMUFileGetBufferFunc *get_buffer,
> +                        QEMUFileCloseFunc *close,
> +                         QEMUFileRateLimit *rate_limit)
>  {
>     QEMUFile *f;
>
>     f = qemu_mallocz(sizeof(QEMUFile));
>     if (!f)
> -        return NULL;
> -    f->is_file = 0;
> -    f->bs = bs;
> -    f->is_writable = is_writable;
> -    f->base_offset = offset;
> +       return NULL;
> +
> +    f->opaque = opaque;
> +    f->put_buffer = put_buffer;
> +    f->get_buffer = get_buffer;
> +    f->close = close;
> +    f->rate_limit = rate_limit;
> +
>     return f;
>  }
>
>  void qemu_fflush(QEMUFile *f)
>  {
> -    if (!f->is_writable)
> +    if (!f->put_buffer)
>         return;
> +
>     if (f->buf_index > 0) {
> -        if (f->is_file) {
> -            fseek(f->outfile, f->buf_offset, SEEK_SET);
> -            fwrite(f->buf, 1, f->buf_index, f->outfile);
> -        } else {
> -            bdrv_pwrite(f->bs, f->base_offset + f->buf_offset,
> -                        f->buf, f->buf_index);
> -        }
> +       f->put_buffer(f->opaque, f->buf, f->buf_offset, f->buf_index);
>         f->buf_offset += f->buf_index;
>         f->buf_index = 0;
>     }
> @@ -6225,32 +6362,31 @@ static void qemu_fill_buffer(QEMUFile *f)
>  {
>     int len;
>
> -    if (f->is_writable)
> +    if (!f->get_buffer)
>         return;
> -    if (f->is_file) {
> -        fseek(f->outfile, f->buf_offset, SEEK_SET);
> -        len = fread(f->buf, 1, IO_BUF_SIZE, f->outfile);
> -        if (len < 0)
> -            len = 0;
> -    } else {
> -        len = bdrv_pread(f->bs, f->base_offset + f->buf_offset,
> -                         f->buf, IO_BUF_SIZE);
> -        if (len < 0)
> -            len = 0;
> -    }
> +
> +    len = f->get_buffer(f->opaque, f->buf, f->buf_offset, IO_BUF_SIZE);
> +    if (len < 0)
> +       len = 0;
> +
>     f->buf_index = 0;
>     f->buf_size = len;
>     f->buf_offset += len;
>  }
>
> -void qemu_fclose(QEMUFile *f)
> +int qemu_fclose(QEMUFile *f)
>  {
> -    if (f->is_writable)
> -        qemu_fflush(f);
> -    if (f->is_file) {
> -        fclose(f->outfile);
> -    }
> +    int ret = 0;
> +    qemu_fflush(f);
> +    if (f->close)
> +       ret = f->close(f->opaque);
>     qemu_free(f);
> +    return ret;
> +}
> +
> +void qemu_file_put_notify(QEMUFile *f)
> +{
> +    f->put_buffer(f->opaque, NULL, 0, 0);
>  }
>
>  void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size)
> @@ -6324,7 +6460,7 @@ int64_t qemu_fseek(QEMUFile *f, int64_t pos, int whence)
>         /* SEEK_END not supported */
>         return -1;
>     }
> -    if (f->is_writable) {
> +    if (f->put_buffer) {
>         qemu_fflush(f);
>         f->buf_offset = pos;
>     } else {
> @@ -6335,6 +6471,14 @@ int64_t qemu_fseek(QEMUFile *f, int64_t pos, int whence)
>     return pos;
>  }
>
> +int qemu_file_rate_limit(QEMUFile *f)
> +{
> +    if (f->rate_limit)
> +        return f->rate_limit(f->opaque);
> +
> +    return 0;
> +}
> +
>  void qemu_put_be16(QEMUFile *f, unsigned int v)
>  {
>     qemu_put_byte(f, v >> 8);
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>



-- 
Glauber Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-10 13:26   ` Chris Lalancette
@ 2008-09-10 14:46     ` Glauber Costa
  2008-09-10 15:19       ` Anthony Liguori
  0 siblings, 1 reply; 57+ messages in thread
From: Glauber Costa @ 2008-09-10 14:46 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On Wed, Sep 10, 2008 at 10:26 AM, Chris Lalancette <clalance@redhat.com> wrote:
> Anthony Liguori wrote:
>> This patch adds a bdrv_flush_all() function.  It's necessary to ensure that all
>> IO operations have been flushed to disk before completely a live migration.
>>
>> N.B. we don't actually use this now.  We really should flush the block drivers
>> using an live savevm callback to avoid unnecessary guest down time.
>
> Simple enough, and follows the pattern in the KVM migration.
>
> Chris Lalancette
>
sounds ok.

>



-- 
Glauber Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration Anthony Liguori
@ 2008-09-10 14:52   ` Glauber Costa
  2008-09-10 14:56     ` Anthony Liguori
  0 siblings, 1 reply; 57+ messages in thread
From: Glauber Costa @ 2008-09-10 14:52 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On Tue, Sep 9, 2008 at 4:49 PM, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch adds a dirty tracking bit for live migration.  We use 0x08 because
> kqemu uses 0x04.

For which purpose, and where is it? I think it deserves at least a
comment on the source itself for future generations.

>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>
> diff --git a/cpu-all.h b/cpu-all.h
> index d350b30..fdac353 100644
> --- a/cpu-all.h
> +++ b/cpu-all.h
> @@ -944,6 +944,7 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
>
>  #define VGA_DIRTY_FLAG  0x01
>  #define CODE_DIRTY_FLAG 0x02
> +#define MIGRATION_DIRTY_FLAG 0x08
>
>  /* read dirty bit (return 0 or 1) */
>  static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
> @@ -966,6 +967,10 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
>                                      int dirty_flags);
>  void cpu_tlb_update_dirty(CPUState *env);
>
> +int cpu_physical_memory_set_dirty_tracking(int enable);
> +
> +int cpu_physical_memory_get_dirty_tracking(void);
> +
>  void dump_exec_info(FILE *f,
>                     int (*cpu_fprintf)(FILE *f, const char *fmt, ...));
>
> diff --git a/exec.c b/exec.c
> index 3ab4ad0..9dba5c8 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -38,6 +38,7 @@
>  #include "qemu-common.h"
>  #include "tcg.h"
>  #include "hw/hw.h"
> +#include "osdep.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
>  #endif
> @@ -113,6 +114,7 @@ ram_addr_t phys_ram_size;
>  int phys_ram_fd;
>  uint8_t *phys_ram_base;
>  uint8_t *phys_ram_dirty;
> +static int in_migration;
>  static ram_addr_t phys_ram_alloc_offset = 0;
>  #endif
>
> @@ -1777,6 +1779,17 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
>     }
>  }
>
> +int cpu_physical_memory_set_dirty_tracking(int enable)
> +{
> +    in_migration = enable;
> +    return 0;
> +}
> +
> +int cpu_physical_memory_get_dirty_tracking(void)
> +{
> +    return in_migration;
> +}
> +
>  static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry)
>  {
>     ram_addr_t ram_addr;
> @@ -2932,9 +2945,19 @@ void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val)
>         io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
>         io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
>     } else {
> -        ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
> -            (addr & ~TARGET_PAGE_MASK);
> +        unsigned long addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
> +        ptr = phys_ram_base + addr1;
>         stl_p(ptr, val);
> +
> +        if (unlikely(in_migration)) {
> +            if (!cpu_physical_memory_is_dirty(addr1)) {
> +                /* invalidate code */
> +                tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
> +                /* set dirty bit */
> +                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
> +                    (0xff & ~CODE_DIRTY_FLAG);
> +            }
> +        }
>     }
>  }
did you mean MIGRATION_DIRTY_FLAG?

-- 
Glauber Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration
  2008-09-10 14:52   ` Glauber Costa
@ 2008-09-10 14:56     ` Anthony Liguori
  2008-09-10 15:01       ` Glauber Costa
  0 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 14:56 UTC (permalink / raw)
  To: Glauber Costa; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

Glauber Costa wrote:
> On Tue, Sep 9, 2008 at 4:49 PM, Anthony Liguori <aliguori@us.ibm.com> wrote:
>   
>> This patch adds a dirty tracking bit for live migration.  We use 0x08 because
>> kqemu uses 0x04.
>>     
>
> For which purpose, and where is it? I think it deserves at least a
> comment on the source itself for future generations.
>   

This patch was originally written before kqemu was Open Sourced.  I only 
knew that kqemu used 0x08 because Fabrice mentioned it in reviewing the 
earliest version of this series.

I'll add a comment to the code.

>> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>>
>> diff --git a/cpu-all.h b/cpu-all.h
>> index d350b30..fdac353 100644
>> --- a/cpu-all.h
>> +++ b/cpu-all.h
>> @@ -944,6 +944,7 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
>>
>>  #define VGA_DIRTY_FLAG  0x01
>>  #define CODE_DIRTY_FLAG 0x02
>> +#define MIGRATION_DIRTY_FLAG 0x08
>>
>>  /* read dirty bit (return 0 or 1) */
>>  static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
>> @@ -966,6 +967,10 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
>>                                      int dirty_flags);
>>  void cpu_tlb_update_dirty(CPUState *env);
>>
>> +int cpu_physical_memory_set_dirty_tracking(int enable);
>> +
>> +int cpu_physical_memory_get_dirty_tracking(void);
>> +
>>  void dump_exec_info(FILE *f,
>>                     int (*cpu_fprintf)(FILE *f, const char *fmt, ...));
>>
>> diff --git a/exec.c b/exec.c
>> index 3ab4ad0..9dba5c8 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -38,6 +38,7 @@
>>  #include "qemu-common.h"
>>  #include "tcg.h"
>>  #include "hw/hw.h"
>> +#include "osdep.h"
>>  #if defined(CONFIG_USER_ONLY)
>>  #include <qemu.h>
>>  #endif
>> @@ -113,6 +114,7 @@ ram_addr_t phys_ram_size;
>>  int phys_ram_fd;
>>  uint8_t *phys_ram_base;
>>  uint8_t *phys_ram_dirty;
>> +static int in_migration;
>>  static ram_addr_t phys_ram_alloc_offset = 0;
>>  #endif
>>
>> @@ -1777,6 +1779,17 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
>>     }
>>  }
>>
>> +int cpu_physical_memory_set_dirty_tracking(int enable)
>> +{
>> +    in_migration = enable;
>> +    return 0;
>> +}
>> +
>> +int cpu_physical_memory_get_dirty_tracking(void)
>> +{
>> +    return in_migration;
>> +}
>> +
>>  static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry)
>>  {
>>     ram_addr_t ram_addr;
>> @@ -2932,9 +2945,19 @@ void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val)
>>         io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
>>         io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
>>     } else {
>> -        ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
>> -            (addr & ~TARGET_PAGE_MASK);
>> +        unsigned long addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
>> +        ptr = phys_ram_base + addr1;
>>         stl_p(ptr, val);
>> +
>> +        if (unlikely(in_migration)) {
>> +            if (!cpu_physical_memory_is_dirty(addr1)) {
>> +                /* invalidate code */
>> +                tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
>> +                /* set dirty bit */
>> +                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
>> +                    (0xff & ~CODE_DIRTY_FLAG);
>> +            }
>> +        }
>>     }
>>  }
>>     
> did you mean MIGRATION_DIRTY_FLAG?
>   

No.  We want to set all of the dirty bits except for the 
CODE_DIRTY_FLAG.  If you look around the rest of the code, it's pretty 
much the standard thing to do.  Self-modifying code has to be handled 
specially.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration
  2008-09-10 14:56     ` Anthony Liguori
@ 2008-09-10 15:01       ` Glauber Costa
  0 siblings, 0 replies; 57+ messages in thread
From: Glauber Costa @ 2008-09-10 15:01 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

On Wed, Sep 10, 2008 at 11:56 AM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Glauber Costa wrote:
>>
>> On Tue, Sep 9, 2008 at 4:49 PM, Anthony Liguori <aliguori@us.ibm.com>
>> wrote:
>>
>>>
>>> This patch adds a dirty tracking bit for live migration.  We use 0x08
>>> because
>>> kqemu uses 0x04.
>>>
>>
>> For which purpose, and where is it? I think it deserves at least a
>> comment on the source itself for future generations.
>>
>
> This patch was originally written before kqemu was Open Sourced.  I only
> knew that kqemu used 0x08 because Fabrice mentioned it in reviewing the
> earliest version of this series.
>
> I'll add a comment to the code.
>
>>> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>>>
>>> diff --git a/cpu-all.h b/cpu-all.h
>>> index d350b30..fdac353 100644
>>> --- a/cpu-all.h
>>> +++ b/cpu-all.h
>>> @@ -944,6 +944,7 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong
>>> addr,
>>>
>>>  #define VGA_DIRTY_FLAG  0x01
>>>  #define CODE_DIRTY_FLAG 0x02
>>> +#define MIGRATION_DIRTY_FLAG 0x08
>>>
>>>  /* read dirty bit (return 0 or 1) */
>>>  static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
>>> @@ -966,6 +967,10 @@ void cpu_physical_memory_reset_dirty(ram_addr_t
>>> start, ram_addr_t end,
>>>                                     int dirty_flags);
>>>  void cpu_tlb_update_dirty(CPUState *env);
>>>
>>> +int cpu_physical_memory_set_dirty_tracking(int enable);
>>> +
>>> +int cpu_physical_memory_get_dirty_tracking(void);
>>> +
>>>  void dump_exec_info(FILE *f,
>>>                    int (*cpu_fprintf)(FILE *f, const char *fmt, ...));
>>>
>>> diff --git a/exec.c b/exec.c
>>> index 3ab4ad0..9dba5c8 100644
>>> --- a/exec.c
>>> +++ b/exec.c
>>> @@ -38,6 +38,7 @@
>>>  #include "qemu-common.h"
>>>  #include "tcg.h"
>>>  #include "hw/hw.h"
>>> +#include "osdep.h"
>>>  #if defined(CONFIG_USER_ONLY)
>>>  #include <qemu.h>
>>>  #endif
>>> @@ -113,6 +114,7 @@ ram_addr_t phys_ram_size;
>>>  int phys_ram_fd;
>>>  uint8_t *phys_ram_base;
>>>  uint8_t *phys_ram_dirty;
>>> +static int in_migration;
>>>  static ram_addr_t phys_ram_alloc_offset = 0;
>>>  #endif
>>>
>>> @@ -1777,6 +1779,17 @@ void cpu_physical_memory_reset_dirty(ram_addr_t
>>> start, ram_addr_t end,
>>>    }
>>>  }
>>>
>>> +int cpu_physical_memory_set_dirty_tracking(int enable)
>>> +{
>>> +    in_migration = enable;
>>> +    return 0;
>>> +}
>>> +
>>> +int cpu_physical_memory_get_dirty_tracking(void)
>>> +{
>>> +    return in_migration;
>>> +}
>>> +
>>>  static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry)
>>>  {
>>>    ram_addr_t ram_addr;
>>> @@ -2932,9 +2945,19 @@ void stl_phys_notdirty(target_phys_addr_t addr,
>>> uint32_t val)
>>>        io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
>>>        io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val);
>>>    } else {
>>> -        ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
>>> -            (addr & ~TARGET_PAGE_MASK);
>>> +        unsigned long addr1 = (pd & TARGET_PAGE_MASK) + (addr &
>>> ~TARGET_PAGE_MASK);
>>> +        ptr = phys_ram_base + addr1;
>>>        stl_p(ptr, val);
>>> +
>>> +        if (unlikely(in_migration)) {
>>> +            if (!cpu_physical_memory_is_dirty(addr1)) {
>>> +                /* invalidate code */
>>> +                tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
>>> +                /* set dirty bit */
>>> +                phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
>>> +                    (0xff & ~CODE_DIRTY_FLAG);
>>> +            }
>>> +        }
>>>    }
>>>  }
>>>
>>
>> did you mean MIGRATION_DIRTY_FLAG?
>>
>
> No.  We want to set all of the dirty bits except for the CODE_DIRTY_FLAG.
>  If you look around the rest of the code, it's pretty much the standard
> thing to do.  Self-modifying code has to be handled specially.

oh, you're right.

my bad.

> Regards,
>
> Anthony Liguori
>
>
>



-- 
Glauber Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 1/10] Refactor QEMUFile for live migration
  2008-09-10 14:38   ` [Qemu-devel] " Glauber Costa
@ 2008-09-10 15:05     ` Avi Kivity
  2008-09-10 15:16     ` Anthony Liguori
  1 sibling, 0 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-10 15:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Glauber Costa wrote:
>> +static int fd_put_buffer(void *opaque, const uint8_t *buf,
>> +                         int64_t pos, int size)
>> +{
>> +    QEMUFileFD *s = opaque;
>> +    ssize_t len;
>> +
>> +    do {
>> +        len = write(s->fd, buf, size);
>> +    } while (len == -1 && errno == EINTR);
>>     
>
> What about the len == size case ?
>
>   

Should work, no?

>> +
>> +QEMUFile *qemu_fopen_fd(int fd)
>> +{
>> +    QEMUFileFD *s = qemu_mallocz(sizeof(QEMUFileFD));
>>     
>
> can't it fail?
>   

If it does, the entire world will disappear in a black hole.  Qemu isn't 
prepared to handle allocation failures.

> Isn't it possible to abstract the differences between bdrv and file so
> to have a common implementation
> between them? Do you think it's worthwhile ?
>
>   

They're very different.  bdrvs are sector-granularity, random access, 
concurrent, and want dma.  QEMUFiles are byte-granularity, sequential 
access, serial, and aren't too worried about dma (though it could be nice).

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [Qemu-devel] Re: [PATCH 1/10] Refactor QEMUFile for live migration
  2008-09-10 14:38   ` [Qemu-devel] " Glauber Costa
  2008-09-10 15:05     ` Avi Kivity
@ 2008-09-10 15:16     ` Anthony Liguori
  1 sibling, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 15:16 UTC (permalink / raw)
  To: Glauber Costa; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

>> +static int fd_put_buffer(void *opaque, const uint8_t *buf,
>> +                         int64_t pos, int size)
>> +{
>> +    QEMUFileFD *s = opaque;
>> +    ssize_t len;
>> +
>> +    do {
>> +        len = write(s->fd, buf, size);
>> +    } while (len == -1 && errno == EINTR);
>>     
>
> What about the len == size case ?
>   

I don't follow what your concern is.

>> +
>> +static int fd_close(void *opaque)
>> +{
>> +    QEMUFileFD *s = opaque;
>> +    qemu_free(s);
>> +    return 0;
>> +}
>>     
> Why don't we need to do any specific callback for closing the file descriptor?
> In the case of a socket, I imagine we may want to shut the socket down, for ex.
>   

Since qemu_fopen_fd takes a previously open file descriptor, the 
expectation is that you're going to be able to close it yourself at some 
point.  This worked out fine for the migration code and I think it'll 
also work out okay for other code.  Plus, you would have to add 
callbacks to qemu_fopen_fd() which gets pretty nasty.

>> +
>> +QEMUFile *qemu_fopen_fd(int fd)
>> +{
>> +    QEMUFileFD *s = qemu_mallocz(sizeof(QEMUFileFD));
>>     
>
> can't it fail?
>   

Yeah, I should add error checking.

>> -static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int64_t offset, int is_writable)
>> +typedef struct QEMUFileBdrv
>> +{
>> +    BlockDriverState *bs;
>> +    int64_t base_offset;
>> +} QEMUFileBdrv;
>>     
>
> Isn't it possible to abstract the differences between bdrv and file so
> to have a common implementation
> between them? Do you think it's worthwhile ?
>   

It's a lot of work.  QEMUFile is optimized to batch short read/write 
operations whereas BlockDriverState is meant to be sector based.  
QEMUFile is also evolving into a stream mechanism where BlockDriver will 
always be random access.

It's certainly possible, but I don't think it's worth it at this stage.

Thanks for the review!

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-10 14:46     ` Glauber Costa
@ 2008-09-10 15:19       ` Anthony Liguori
  2008-09-10 15:32         ` Glauber Costa
                           ` (2 more replies)
  0 siblings, 3 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 15:19 UTC (permalink / raw)
  To: Glauber Costa; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Glauber Costa wrote:
> On Wed, Sep 10, 2008 at 10:26 AM, Chris Lalancette <clalance@redhat.com> wrote:
>   
>> Anthony Liguori wrote:
>>     
>>> This patch adds a bdrv_flush_all() function.  It's necessary to ensure that all
>>> IO operations have been flushed to disk before completely a live migration.
>>>
>>> N.B. we don't actually use this now.  We really should flush the block drivers
>>> using an live savevm callback to avoid unnecessary guest down time.
>>>       
>> Simple enough, and follows the pattern in the KVM migration.
>>
>> Chris Lalancette
>>
>>     
> sounds ok.
>   

I'm actually liking bdrv_flush_all() less and less.  If there are any 
outstanding IO requests, it will increase the down time associated with 
live migration.  I think we definitely need to add a live save handler 
that waits until there are no outstanding IO requests to converge.  I'm 
concerned though that it's not totally unreasonable to expect a guest to 
always have an IO request in flight.  That leads me to think that maybe 
we should be cancelling outstanding requests, and somehow saving their 
state?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-10 15:19       ` Anthony Liguori
@ 2008-09-10 15:32         ` Glauber Costa
  2008-09-10 15:39         ` Avi Kivity
  2008-09-10 16:37         ` Paul Brook
  2 siblings, 0 replies; 57+ messages in thread
From: Glauber Costa @ 2008-09-10 15:32 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

On Wed, Sep 10, 2008 at 12:19 PM, Anthony Liguori <aliguori@us.ibm.com> wrote:
> Glauber Costa wrote:
>>
>> On Wed, Sep 10, 2008 at 10:26 AM, Chris Lalancette <clalance@redhat.com>
>> wrote:
>>
>>>
>>> Anthony Liguori wrote:
>>>
>>>>
>>>> This patch adds a bdrv_flush_all() function.  It's necessary to ensure
>>>> that all
>>>> IO operations have been flushed to disk before completely a live
>>>> migration.
>>>>
>>>> N.B. we don't actually use this now.  We really should flush the block
>>>> drivers
>>>> using an live savevm callback to avoid unnecessary guest down time.
>>>>
>>>
>>> Simple enough, and follows the pattern in the KVM migration.
>>>
>>> Chris Lalancette
>>>
>>>
>>
>> sounds ok.
>>
>
> I'm actually liking bdrv_flush_all() less and less.  If there are any
> outstanding IO requests, it will increase the down time associated with live
> migration.  I think we definitely need to add a live save handler that waits
> until there are no outstanding IO requests to converge.  I'm concerned
> though that it's not totally unreasonable to expect a guest to always have
> an IO request in flight.  That leads me to think that maybe we should be
> cancelling outstanding requests, and somehow saving their state?

Maybe we can avoid new requests to start, and save their state. For
the in flight ones, we wait
for them to quiesce. That can probably make it less messy, but maybe not


-- 
Glauber Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 11:14         ` Daniel P. Berrange
@ 2008-09-10 15:36           ` Avi Kivity
  2008-09-10 15:40             ` Anthony Liguori
  0 siblings, 1 reply; 57+ messages in thread
From: Avi Kivity @ 2008-09-10 15:36 UTC (permalink / raw)
  To: Daniel P. Berrange
  Cc: Chris Wright, Uri Lublin, Anthony Liguori, qemu-devel, kvm

Daniel P. Berrange wrote:
>>
>> Or rather, such state should be part of the migration.  There's the 
>> question of whether to transform the path on the target, but "which 
>> media is in the drive" is part of the hardware state.
>>
>> (logically we would copy all of the data of all block devices, but 
>> that's not very practical, so we assume shared storage).
>>
>> What other commands are unsafe during migration? I exclude host device 
>> assignment which is obviously migration unfriendly.
>>     
>
> USB + virtio device attach/detach - well at least have the same need as
> media change - you'd need to propagate the change to the other side in
> some way. Currently migrate just assumes the management tool/admin has
> started QEMU on the destinations with the matching hardware config, and
> for libvirt to use the monitor to add USB /virtio devices at both ends
> has the race condition/synchronization problem .
>   

Any hotplug, for that matter.

The hardware topology should be part of the state; there is no other way 
to deal with hotplug.  Hotplug can create configurations that are 
impossible to recreate using the command line.

In general a hardware device has two parts: a guest visible part and a 
host interface part.  For networking, that's easily visible (-net nic 
and -net tap/user).  For block devices, the filename and caching mode is 
the host interface while, the interface type and index is the guest part.

Migration should transfer the guest part, and the host parts should be 
specified from the command line or monitor on the migration target, as 
they can change from host to host.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-10 15:19       ` Anthony Liguori
  2008-09-10 15:32         ` Glauber Costa
@ 2008-09-10 15:39         ` Avi Kivity
  2008-09-10 16:37         ` Paul Brook
  2 siblings, 0 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-10 15:39 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Glauber Costa, kvm, Uri Lublin

Anthony Liguori wrote:
>
> I'm actually liking bdrv_flush_all() less and less.  If there are any 
> outstanding IO requests, it will increase the down time associated 
> with live migration.  I think we definitely need to add a live save 
> handler that waits until there are no outstanding IO requests to 
> converge.  I'm concerned though that it's not totally unreasonable to 
> expect a guest to always have an IO request in flight.  

I/O requests should complete in milliseconds; I don't see them as 
increasing migration latency.

> That leads me to think that maybe we should be cancelling outstanding 
> requests, and somehow saving their state?

Migration is enough of a special case that we should avoid special cases 
within it as much as possible.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 15:36           ` Avi Kivity
@ 2008-09-10 15:40             ` Anthony Liguori
  0 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 15:40 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Anthony Liguori, kvm, qemu-devel, Uri Lublin

Avi Kivity wrote:
> Daniel P. Berrange wrote:
>>>
>>> Or rather, such state should be part of the migration.  There's the 
>>> question of whether to transform the path on the target, but "which 
>>> media is in the drive" is part of the hardware state.
>>>
>>> (logically we would copy all of the data of all block devices, but 
>>> that's not very practical, so we assume shared storage).
>>>
>>> What other commands are unsafe during migration? I exclude host 
>>> device assignment which is obviously migration unfriendly.
>>>     
>>
>> USB + virtio device attach/detach - well at least have the same need as
>> media change - you'd need to propagate the change to the other side in
>> some way. Currently migrate just assumes the management tool/admin has
>> started QEMU on the destinations with the matching hardware config, and
>> for libvirt to use the monitor to add USB /virtio devices at both ends
>> has the race condition/synchronization problem .
>>   
>
> Any hotplug, for that matter.
>
> The hardware topology should be part of the state; there is no other 
> way to deal with hotplug.  Hotplug can create configurations that are 
> impossible to recreate using the command line.

Yes, I agree.  I'd like to introduce a "machine" section that contains 
nothing but a version id.  v1 will simply be a placeholder.  If we add 
new hardware to the machine type, we'll bump to v2.  When we merge hot 
plug support, we need to add actual contents to the save section 
describing any hot plugged hardware.

Regards,

Anthony Liguori

> In general a hardware device has two parts: a guest visible part and a 
> host interface part.  For networking, that's easily visible (-net nic 
> and -net tap/user).  For block devices, the filename and caching mode 
> is the host interface while, the interface type and index is the guest 
> part.
>
> Migration should transfer the guest part, and the host parts should be 
> specified from the command line or monitor on the migration target, as 
> they can change from host to host.
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 11:11       ` Avi Kivity
  2008-09-10 11:14         ` Daniel P. Berrange
@ 2008-09-10 15:58         ` Jamie Lokier
  2008-09-11 10:16           ` Avi Kivity
  1 sibling, 1 reply; 57+ messages in thread
From: Jamie Lokier @ 2008-09-10 15:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Avi Kivity wrote:
> (logically we would copy all of the data of all block devices, but 
> that's not very practical, so we assume shared storage).

Speaking of that, if the guest RAM were a memory-mapped file, couldn't
that use shared storage too?

You'd have to be careful: it would need a distributed filesystem with
coherent mappings (i.e. not NFS), but they do exist.

I'm guessing that the bulk of time spent in migration/checkpointing is
saving the RAM image.  Using a memory-mapped file on shared storage
for RAM might make that faster.  (Or slower!).

-- Jamie

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-10 15:19       ` Anthony Liguori
  2008-09-10 15:32         ` Glauber Costa
  2008-09-10 15:39         ` Avi Kivity
@ 2008-09-10 16:37         ` Paul Brook
  2 siblings, 0 replies; 57+ messages in thread
From: Paul Brook @ 2008-09-10 16:37 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Glauber Costa, Anthony Liguori, kvm, Uri Lublin

> I'm actually liking bdrv_flush_all() less and less.  If there are any
> outstanding IO requests, it will increase the down time associated with
> live migration.  I think we definitely need to add a live save handler
> that waits until there are no outstanding IO requests to converge.  I'm
> concerned though that it's not totally unreasonable to expect a guest to
> always have an IO request in flight.  That leads me to think that maybe
> we should be cancelling outstanding requests, and somehow saving their
> state?

That's not possible with the current code because the IO callbacks 
(particularly when you start involving the SCSI layer) are generated 
dynamically.

Paul

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 10/10] TCP based live migration
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 10/10] TCP based " Anthony Liguori
@ 2008-09-10 16:46   ` Blue Swirl
  2008-09-10 16:51     ` Anthony Liguori
  0 siblings, 1 reply; 57+ messages in thread
From: Blue Swirl @ 2008-09-10 16:46 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On 9/9/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch introduces a tcp protocol for live migration.  It can be used as
>  follows:
>
>  qemu-system-x86_64 -hda ~/images/linux-test.img -monitor stdio
>   <vm runs for a while>
>  (qemu) migrate tcp:localhost:1025
>
>  On the same system:
>
>  qemu-system-x86_64 -hda ~/images/linux-test.img -incoming tcp:localhost:1025
>
>  The monitor can be interacted with while waiting for an incoming live migration.

With command line switches, you have to know when starting the VM that
it is going to accept an incoming migration. Would it be possible to
change this so that the incoming migration could be triggered from
monitor? For example to migrate back to original host?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 10/10] TCP based live migration
  2008-09-10 16:46   ` Blue Swirl
@ 2008-09-10 16:51     ` Anthony Liguori
  0 siblings, 0 replies; 57+ messages in thread
From: Anthony Liguori @ 2008-09-10 16:51 UTC (permalink / raw)
  To: Blue Swirl; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Blue Swirl wrote:
> On 9/9/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>   
>> This patch introduces a tcp protocol for live migration.  It can be used as
>>  follows:
>>
>>  qemu-system-x86_64 -hda ~/images/linux-test.img -monitor stdio
>>   <vm runs for a while>
>>  (qemu) migrate tcp:localhost:1025
>>
>>  On the same system:
>>
>>  qemu-system-x86_64 -hda ~/images/linux-test.img -incoming tcp:localhost:1025
>>
>>  The monitor can be interacted with while waiting for an incoming live migration.
>>     
>
> With command line switches, you have to know when starting the VM that
> it is going to accept an incoming migration. Would it be possible to
> change this so that the incoming migration could be triggered from
> monitor? For example to migrate back to original host?
>   

Yup.  The main reason for an -incoming flag is it makes the ssh: 
protocol easier to implement (not included here).

I'll add a monitor command too.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-10 15:58         ` Jamie Lokier
@ 2008-09-11 10:16           ` Avi Kivity
  2008-09-11 11:59             ` Jamie Lokier
  0 siblings, 1 reply; 57+ messages in thread
From: Avi Kivity @ 2008-09-11 10:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Jamie Lokier wrote:
> Avi Kivity wrote:
>   
>> (logically we would copy all of the data of all block devices, but 
>> that's not very practical, so we assume shared storage).
>>     
>
> Speaking of that, if the guest RAM were a memory-mapped file, couldn't
> that use shared storage too?
>
>   

You would need a clustered filesystem that supports coherent mmap()s.

> You'd have to be careful: it would need a distributed filesystem with
> coherent mappings (i.e. not NFS), but they do exist.
>
> I'm guessing that the bulk of time spent in migration/checkpointing is
> saving the RAM image.  Using a memory-mapped file on shared storage
> for RAM might make that faster.  (Or slower!).
>   

The memory needs to be transferred anyway, so total time would not
change.  You could start running on the target sooner, though.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op
  2008-09-11 10:16           ` Avi Kivity
@ 2008-09-11 11:59             ` Jamie Lokier
  0 siblings, 0 replies; 57+ messages in thread
From: Jamie Lokier @ 2008-09-11 11:59 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Avi Kivity wrote:
> Jamie Lokier wrote:
> > Avi Kivity wrote:
> >   
> >> (logically we would copy all of the data of all block devices, but 
> >> that's not very practical, so we assume shared storage).
> >>     
> >
> > Speaking of that, if the guest RAM were a memory-mapped file, couldn't
> > that use shared storage too?
> 
> You would need a clustered filesystem that supports coherent mmap()s.

Yes, something like GFS.

> > You'd have to be careful: it would need a distributed filesystem with
> > coherent mappings (i.e. not NFS), but they do exist.
> >
> > I'm guessing that the bulk of time spent in migration/checkpointing is
> > saving the RAM image.  Using a memory-mapped file on shared storage
> > for RAM might make that faster.  (Or slower!).
> >   
> The memory needs to be transferred anyway, so total time would not
> change.  You could start running on the target sooner, though.

Actually the memory doesn't need to be tranferred.  Only actively used
pages need to be.  If you have an 8GB guest, 7.75GB of which is the
guest's filesystem cache from something you did earlier and is no
longer used, you just need to transfer 250MB and it can continue
running on the target.

-- Jamie

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
                   ` (9 preceding siblings ...)
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 10/10] TCP based " Anthony Liguori
@ 2008-09-11 12:13 ` Atsushi SAKAI
  2008-09-11 13:06   ` Anthony Liguori
  10 siblings, 1 reply; 57+ messages in thread
From: Atsushi SAKAI @ 2008-09-11 12:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Hi,

It may be stupid question.
This migration is SMP aware?

Thanks
Atsushi SAKAI

Anthony Liguori <aliguori@us.ibm.com> wrote:

> This series adds live migration support to QEMU.  It's inspired by the
> implementation of live migration in KVM, but at this point, is almost
> a full rewrite.  Uri Lublin did a large amount of the work on the live
> migration implementation in KVM.
> 
> This patch series is not yet ready to apply.  There are a few FIXMEs
> and I have to add back support for restoring v2 saved images.  I wanted
> to get these patches out on the list though for review since it's a rather
> large series.
> 
> Live migration will work with any target that supports save/restore.
> 
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-11 12:13 ` [Qemu-devel] [PATCH 0/10] Live migration for QEMU Atsushi SAKAI
@ 2008-09-11 13:06   ` Anthony Liguori
  2008-09-11 13:30     ` Jamie Lokier
  0 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-11 13:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Atsushi SAKAI wrote:
> Hi,
>
> It may be stupid question.
> This migration is SMP aware?
>   

Yes it is (SMP aware).

Regards,

Anthony Liguori

> Thanks
> Atsushi SAKAI
>
> Anthony Liguori <aliguori@us.ibm.com> wrote:
>
>   
>> This series adds live migration support to QEMU.  It's inspired by the
>> implementation of live migration in KVM, but at this point, is almost
>> a full rewrite.  Uri Lublin did a large amount of the work on the live
>> migration implementation in KVM.
>>
>> This patch series is not yet ready to apply.  There are a few FIXMEs
>> and I have to add back support for restoring v2 saved images.  I wanted
>> to get these patches out on the list though for review since it's a rather
>> large series.
>>
>> Live migration will work with any target that supports save/restore.
>>
>>
>>     
>
>
>
>
>   

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-11 13:06   ` Anthony Liguori
@ 2008-09-11 13:30     ` Jamie Lokier
  2008-09-11 14:12       ` Anthony Liguori
  0 siblings, 1 reply; 57+ messages in thread
From: Jamie Lokier @ 2008-09-11 13:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

Anthony Liguori wrote:
> >It may be stupid question.
> >This migration is SMP aware?
> 
> Yes it is (SMP aware).

Another stupid question from me.

How stable is it likely to be between versions of QEMU and KVM?  Also,
is there any hope of migrating between QEMU and KVM, so long as the
same device config is used, it's not SMP, etc.?

-- Jamie

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-11 13:30     ` Jamie Lokier
@ 2008-09-11 14:12       ` Anthony Liguori
  2008-09-11 15:32         ` Avi Kivity
  0 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-11 14:12 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>>> It may be stupid question.
>>> This migration is SMP aware?
>>>       
>> Yes it is (SMP aware).
>>     
>
> Another stupid question from me.
>
> How stable is it likely to be between versions of QEMU and KVM?

Once I merge the migration patches, I'll help Avi merge this into KVM.  
The whole point of this exercise is to merge the KVM bits into QEMU but 
in the process, I decided the bits needed to be written :-)

>   Also,
> is there any hope of migrating between QEMU and KVM, so long as the
> same device config is used, it's not SMP, etc.?
>   

Yes.  The primary reason that hasn't been possible in the past was 
because of how memory was migrated.  The new memory migration protocol 
happens to make it easier to let QEMU and KVM be compatible.  That 
wasn't an accident :-)

Regards,

Anthony Liguori

> -- Jamie
>   

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-11 14:12       ` Anthony Liguori
@ 2008-09-11 15:32         ` Avi Kivity
  2008-09-11 16:22           ` Anthony Liguori
  0 siblings, 1 reply; 57+ messages in thread
From: Avi Kivity @ 2008-09-11 15:32 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, kvm

Anthony Liguori wrote:
>
> Yes.  The primary reason that hasn't been possible in the past was 
> because of how memory was migrated.  The new memory migration protocol 
> happens to make it easier to let QEMU and KVM be compatible.  That 
> wasn't an accident :-)
>

Well, it's still broken IMO (migration ram_addr_t rather than physical 
addresses).

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-11 15:32         ` Avi Kivity
@ 2008-09-11 16:22           ` Anthony Liguori
  2008-09-11 16:32             ` Avi Kivity
  0 siblings, 1 reply; 57+ messages in thread
From: Anthony Liguori @ 2008-09-11 16:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Avi Kivity wrote:
> Anthony Liguori wrote:
>>
>> Yes.  The primary reason that hasn't been possible in the past was 
>> because of how memory was migrated.  The new memory migration 
>> protocol happens to make it easier to let QEMU and KVM be 
>> compatible.  That wasn't an accident :-)
>>
>
> Well, it's still broken IMO (migration ram_addr_t rather than physical 
> addresses).

Have you thought of a solution other than make "mem" only save physical 
memory and have everything else save their own memory?

That gets really funky because then everything needs live save/restore 
tracking.  It's quite messy.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 0/10] Live migration for QEMU
  2008-09-11 16:22           ` Anthony Liguori
@ 2008-09-11 16:32             ` Avi Kivity
  0 siblings, 0 replies; 57+ messages in thread
From: Avi Kivity @ 2008-09-11 16:32 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Chris Wright, Uri Lublin, qemu-devel, kvm

Anthony Liguori wrote:
>>>
>>> Yes.  The primary reason that hasn't been possible in the past was 
>>> because of how memory was migrated.  The new memory migration 
>>> protocol happens to make it easier to let QEMU and KVM be 
>>> compatible.  That wasn't an accident :-)
>>>
>>
>> Well, it's still broken IMO (migration ram_addr_t rather than 
>> physical addresses).
>
> Have you thought of a solution other than make "mem" only save 
> physical memory and have everything else save their own memory?
>

Even worse, have each slot (0-640K, 1M-pci, 4GB-eom, hotplug slots, 
writeable option roms) be an independent save/restore entity.

> That gets really funky because then everything needs live save/restore 
> tracking.  It's quite messy.

Why? they can all reuse the same code.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 8/10] Introduce a buffered QEMUFile wrapper
  2008-09-09 19:50 ` [Qemu-devel] [PATCH 8/10] Introduce a buffered QEMUFile wrapper Anthony Liguori
@ 2008-09-12 15:16   ` Blue Swirl
  0 siblings, 0 replies; 57+ messages in thread
From: Blue Swirl @ 2008-09-12 15:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On 9/9/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch introduces a buffered QEMUFile wrapper.  This allows QEMUFile's to be
>  rate limited.  It also allows makes it easier to implement a QEMUFile that is
>  asynchronous.
>
>  The only real non-obvious part of the API is the "frozen" concept.  If the backend
>  returns EAGAIN, the QEMUFile is said to be "frozen".  This means no additional
>  output will be sent to the backend until the file is unfrozen.  qemu_file_put_notify
>  can be used to unfreeze a frozen file.
>
>  A synchronous interface is also provided to wait for an unfreeze event.  This is
>  used during the final part of live migration when the VM is no longer running.

>  +static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
>  +{
>  +    QEMUFileBuffered *s = opaque;
>  +    size_t offset = 0;
>  +    ssize_t ret;
>  +
>  +    if (s->has_error)
>  +        return -EINVAL;
>  +
>  +    s->freeze_output = 0;
>  +
>  +    buffered_flush(s);
>  +
>  +    while (offset < size) {
>  +        if (s->bytes_xfer > s->xfer_limit)
>  +            break;
>  +
>  +        ret = s->put_buffer(s->opaque, buf + offset, size - offset);
>  +        if (ret == -EAGAIN) {
>  +            s->freeze_output = 1;
>  +            break;
>  +        }
>  +
>  +        if (ret <= 0) {
>  +            s->has_error = 1;
>  +            break;
>  +        }
>  +
>  +        offset += ret;
>  +        s->bytes_xfer += ret;
>  +    }
>  +
>  +    buffered_append(s, buf + offset, size - offset);
>  +
>  +    return offset;
>  +}

I'd change the types of the return value and parameter "size" to ssize_t.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
  2008-09-10 13:25   ` Chris Lalancette
  2008-09-10 14:38   ` [Qemu-devel] " Glauber Costa
@ 2008-09-12 15:40   ` Blue Swirl
  2 siblings, 0 replies; 57+ messages in thread
From: Blue Swirl @ 2008-09-12 15:40 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On 9/9/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> To support live migration, we override QEMUFile so that instead of writing to
>  disk, the save/restore state happens over a network connection.
>
>  This patch makes QEMUFile read/write operations function pointers so that we
>  can override them for live migration.

>  +typedef struct QEMUFileUnix
>  +{
>  +    FILE *outfile;
>  +} QEMUFileUnix;

I'd rather use name stdio/StdIO instead of Unix, this code is not Unix
dependent.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all()
  2008-09-09 19:49 ` [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all() Anthony Liguori
  2008-09-10 13:26   ` Chris Lalancette
@ 2008-09-12 15:43   ` Blue Swirl
  1 sibling, 0 replies; 57+ messages in thread
From: Blue Swirl @ 2008-09-12 15:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chris Wright, Uri Lublin, Anthony Liguori, kvm

On 9/9/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch adds a bdrv_flush_all() function.  It's necessary to ensure that all
>  IO operations have been flushed to disk before completely a live migration.
>
>  N.B. we don't actually use this now.  We really should flush the block drivers
>  using an live savevm callback to avoid unnecessary guest down time.
>
>  Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>
>  diff --git a/block.c b/block.c
>  index 544176f..921d382 100644
>  --- a/block.c
>  +++ b/block.c
>  @@ -884,6 +884,21 @@ void bdrv_flush(BlockDriverState *bs)
>          bdrv_flush(bs->backing_hd);
>   }
>
>  +void bdrv_iterate_writeable(void (*it)(BlockDriverState *bs))

Forgot "static"?

>  +{
>  +    BlockDriverState *bs;
>  +
>  +    for (bs = bdrv_first; bs != NULL; bs = bs->next)
>  +        if (bs->drv && !bdrv_is_read_only(bs) &&
>  +            (!bdrv_is_removable(bs) || bdrv_is_inserted(bs)))
>  +           it(bs);
>  +}

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2008-09-12 15:43 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-09 19:49 [Qemu-devel] [PATCH 0/10] Live migration for QEMU Anthony Liguori
2008-09-09 19:49 ` [Qemu-devel] [PATCH 1/10] Refactor QEMUFile for live migration Anthony Liguori
2008-09-10 13:25   ` Chris Lalancette
2008-09-10 14:38   ` [Qemu-devel] " Glauber Costa
2008-09-10 15:05     ` Avi Kivity
2008-09-10 15:16     ` Anthony Liguori
2008-09-12 15:40   ` [Qemu-devel] " Blue Swirl
2008-09-09 19:49 ` [Qemu-devel] [PATCH 2/10] Allow the monitor to be suspended during non-blocking op Anthony Liguori
2008-09-10  6:52   ` Avi Kivity
2008-09-10 10:05     ` Daniel P. Berrange
2008-09-10 11:11       ` Avi Kivity
2008-09-10 11:14         ` Daniel P. Berrange
2008-09-10 15:36           ` Avi Kivity
2008-09-10 15:40             ` Anthony Liguori
2008-09-10 15:58         ` Jamie Lokier
2008-09-11 10:16           ` Avi Kivity
2008-09-11 11:59             ` Jamie Lokier
2008-09-10 13:07     ` Anthony Liguori
2008-09-10 13:26     ` Chris Lalancette
2008-09-10 10:01   ` Daniel P. Berrange
2008-09-10 13:11     ` Anthony Liguori
2008-09-09 19:49 ` [Qemu-devel] [PATCH 3/10] Add bdrv_flush_all() Anthony Liguori
2008-09-10 13:26   ` Chris Lalancette
2008-09-10 14:46     ` Glauber Costa
2008-09-10 15:19       ` Anthony Liguori
2008-09-10 15:32         ` Glauber Costa
2008-09-10 15:39         ` Avi Kivity
2008-09-10 16:37         ` Paul Brook
2008-09-12 15:43   ` Blue Swirl
2008-09-09 19:49 ` [Qemu-devel] [PATCH 4/10] Add dirty tracking for live migration Anthony Liguori
2008-09-10 14:52   ` Glauber Costa
2008-09-10 14:56     ` Anthony Liguori
2008-09-10 15:01       ` Glauber Costa
2008-09-09 19:49 ` [Qemu-devel] [PATCH 5/10] Add network announce function Anthony Liguori
2008-09-10 13:27   ` Chris Lalancette
2008-09-10 13:54     ` Anthony Liguori
2008-09-10 14:00     ` Avi Kivity
2008-09-09 19:49 ` [Qemu-devel] [PATCH 6/10] Introduce v3 of savevm protocol Anthony Liguori
2008-09-10  7:09   ` Avi Kivity
2008-09-09 19:49 ` [Qemu-devel] [PATCH 7/10] Switch the memory savevm handler to be "live" Anthony Liguori
2008-09-09 22:25   ` Jamie Lokier
2008-09-09 22:49     ` Anthony Liguori
2008-09-10  7:17   ` Avi Kivity
2008-09-10 13:10     ` Anthony Liguori
2008-09-09 19:50 ` [Qemu-devel] [PATCH 8/10] Introduce a buffered QEMUFile wrapper Anthony Liguori
2008-09-12 15:16   ` Blue Swirl
2008-09-09 19:50 ` [Qemu-devel] [PATCH 9/10] Introduce the UI components for live migration Anthony Liguori
2008-09-09 19:50 ` [Qemu-devel] [PATCH 10/10] TCP based " Anthony Liguori
2008-09-10 16:46   ` Blue Swirl
2008-09-10 16:51     ` Anthony Liguori
2008-09-11 12:13 ` [Qemu-devel] [PATCH 0/10] Live migration for QEMU Atsushi SAKAI
2008-09-11 13:06   ` Anthony Liguori
2008-09-11 13:30     ` Jamie Lokier
2008-09-11 14:12       ` Anthony Liguori
2008-09-11 15:32         ` Avi Kivity
2008-09-11 16:22           ` Anthony Liguori
2008-09-11 16:32             ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).