public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH v3 0/4] Separate thread for VM migration
@ 2011-08-11 15:32 Umesh Deshpande
  2011-08-11 15:32 ` [RFC PATCH v3 1/4] separate " Umesh Deshpande
                   ` (5 more replies)
  0 siblings, 6 replies; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-11 15:32 UTC (permalink / raw)
  To: kvm; +Cc: quintela, mtosatti, pbonzini, Umesh Deshpande

The following patch series deals with VCPU and iothread starvation during the
migration of a guest. Currently the iothread is responsible for performing the
guest migration. It holds qemu_mutex during the migration, which prevents the
VCPU from entering qemu mode and delays its return to the guest. The guest
migration, executed as an iohandler, also delays the execution of other
iohandlers.

In this patch series, the migration has been moved to a separate thread to
reduce qemu_mutex contention and iohandler starvation.

Umesh Deshpande (4):
  separate thread for VM migration
  synchronous migrate_cancel
  lock to protect memslots
  separate migration bitmap

 arch_init.c         |   26 ++++++++++----
 buffered_file.c     |  100 +++++++++++++++++++++++++++++++++------------------
 buffered_file.h     |    4 ++
 cpu-all.h           |   39 ++++++++++++++++++++
 cpus.c              |   12 ++++++
 exec.c              |   74 +++++++++++++++++++++++++++++++++++++
 hw/hw.h             |    5 ++-
 migration.c         |   50 ++++++++++++--------------
 migration.h         |    6 +++
 qemu-common.h       |    2 +
 qemu-thread-posix.c |   10 +++++
 qemu-thread.h       |    1 +
 savevm.c            |   31 +++++++++++-----
 13 files changed, 280 insertions(+), 80 deletions(-)

-- 
1.7.4.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [RFC PATCH v3 1/4] separate thread for VM migration
  2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
@ 2011-08-11 15:32 ` Umesh Deshpande
  2011-08-11 16:18   ` Paolo Bonzini
  2011-08-11 15:32 ` [RFC PATCH v3 2/4] Making iothread block for migrate_cancel Umesh Deshpande
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-11 15:32 UTC (permalink / raw)
  To: kvm; +Cc: quintela, mtosatti, pbonzini, Umesh Deshpande

This patch creates a separate thread for the guest migration on the source side.
A migrate_cancel request from the iothread is handled asynchronously. That is,
the iothread submits migrate_cancel to the migration thread and returns, while
the migration thread attends to this request at the next iteration and
terminates its execution.

Signed-off-by: Umesh Deshpande <udeshpan@redhat.com>
---
 buffered_file.c |   85 ++++++++++++++++++++++++++++++++----------------------
 buffered_file.h |    4 ++
 migration.c     |   49 ++++++++++++++-----------------
 migration.h     |    6 ++++
 4 files changed, 82 insertions(+), 62 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 41b42c3..19932b6 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -16,6 +16,8 @@
 #include "qemu-timer.h"
 #include "qemu-char.h"
 #include "buffered_file.h"
+#include "migration.h"
+#include "qemu-thread.h"
 
 //#define DEBUG_BUFFERED_FILE
 
@@ -28,13 +30,14 @@ typedef struct QEMUFileBuffered
     void *opaque;
     QEMUFile *file;
     int has_error;
+    int closed;
     int freeze_output;
     size_t bytes_xfer;
     size_t xfer_limit;
     uint8_t *buffer;
     size_t buffer_size;
     size_t buffer_capacity;
-    QEMUTimer *timer;
+    QemuThread thread;
 } QEMUFileBuffered;
 
 #ifdef DEBUG_BUFFERED_FILE
@@ -155,14 +158,6 @@ static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, in
         offset = size;
     }
 
-    if (pos == 0 && size == 0) {
-        DPRINTF("file is ready\n");
-        if (s->bytes_xfer <= s->xfer_limit) {
-            DPRINTF("notifying client\n");
-            s->put_ready(s->opaque);
-        }
-    }
-
     return offset;
 }
 
@@ -175,20 +170,20 @@ static int buffered_close(void *opaque)
 
     while (!s->has_error && s->buffer_size) {
         buffered_flush(s);
-        if (s->freeze_output)
+        if (s->freeze_output) {
             s->wait_for_unfreeze(s);
+        }
     }
 
-    ret = s->close(s->opaque);
+    s->closed = 1;
 
-    qemu_del_timer(s->timer);
-    qemu_free_timer(s->timer);
+    ret = s->close(s->opaque);
     qemu_free(s->buffer);
-    qemu_free(s);
 
     return ret;
 }
 
+
 static int buffered_rate_limit(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
@@ -228,34 +223,55 @@ static int64_t buffered_get_rate_limit(void *opaque)
     return s->xfer_limit;
 }
 
-static void buffered_rate_tick(void *opaque)
+static void *migrate_vm(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
+    int64_t current_time, expire_time = qemu_get_clock_ms(rt_clock) + 100;
+    struct timeval tv = { .tv_sec = 0, .tv_usec = 100000};
 
-    if (s->has_error) {
-        buffered_close(s);
-        return;
-    }
+    qemu_mutex_lock_iothread();
 
-    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
+    while (!s->closed) {
+        if (s->freeze_output) {
+            s->wait_for_unfreeze(s);
+            s->freeze_output = 0;
+            continue;
+        }
 
-    if (s->freeze_output)
-        return;
+        if (s->has_error) {
+            break;
+        }
+
+        current_time = qemu_get_clock_ms(rt_clock);
+        if (!s->closed && (expire_time > current_time)) {
+            tv.tv_usec = 1000 * (expire_time - current_time);
+            select(0, NULL, NULL, NULL, &tv);
+            continue;
+        }
 
-    s->bytes_xfer = 0;
+        s->bytes_xfer = 0;
+        buffered_flush(s);
 
-    buffered_flush(s);
+        expire_time = qemu_get_clock_ms(rt_clock) + 100;
+        s->put_ready(s->opaque);
+    }
 
-    /* Add some checks around this */
-    s->put_ready(s->opaque);
+    if (s->has_error) {
+        buffered_close(s);
+    }
+    qemu_free(s);
+
+    qemu_mutex_unlock_iothread();
+
+    return NULL;
 }
 
 QEMUFile *qemu_fopen_ops_buffered(void *opaque,
-                                  size_t bytes_per_sec,
-                                  BufferedPutFunc *put_buffer,
-                                  BufferedPutReadyFunc *put_ready,
-                                  BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
-                                  BufferedCloseFunc *close)
+        size_t bytes_per_sec,
+        BufferedPutFunc *put_buffer,
+        BufferedPutReadyFunc *put_ready,
+        BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
+        BufferedCloseFunc *close)
 {
     QEMUFileBuffered *s;
 
@@ -267,15 +283,14 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
     s->put_ready = put_ready;
     s->wait_for_unfreeze = wait_for_unfreeze;
     s->close = close;
+    s->closed = 0;
 
     s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
                              buffered_close, buffered_rate_limit,
                              buffered_set_rate_limit,
-			     buffered_get_rate_limit);
-
-    s->timer = qemu_new_timer_ms(rt_clock, buffered_rate_tick, s);
+                             buffered_get_rate_limit);
 
-    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
+    qemu_thread_create(&s->thread, migrate_vm, s);
 
     return s->file;
 }
diff --git a/buffered_file.h b/buffered_file.h
index 98d358b..477bf7c 100644
--- a/buffered_file.h
+++ b/buffered_file.h
@@ -17,9 +17,13 @@
 #include "hw/hw.h"
 
 typedef ssize_t (BufferedPutFunc)(void *opaque, const void *data, size_t size);
+typedef void (BufferedBeginFunc)(void *opaque);
 typedef void (BufferedPutReadyFunc)(void *opaque);
 typedef void (BufferedWaitForUnfreezeFunc)(void *opaque);
 typedef int (BufferedCloseFunc)(void *opaque);
+typedef void (BufferedWaitForCancelFunc)(void *opaque);
+
+void wait_for_cancel(void *opaque);
 
 QEMUFile *qemu_fopen_ops_buffered(void *opaque, size_t xfer_limit,
                                   BufferedPutFunc *put_buffer,
diff --git a/migration.c b/migration.c
index af3a1f2..d8a0abb 100644
--- a/migration.c
+++ b/migration.c
@@ -284,8 +284,6 @@ int migrate_fd_cleanup(FdMigrationState *s)
 {
     int ret = 0;
 
-    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
-
     if (s->file) {
         DPRINTF("closing file\n");
         if (qemu_fclose(s->file) != 0) {
@@ -307,14 +305,6 @@ int migrate_fd_cleanup(FdMigrationState *s)
     return ret;
 }
 
-void migrate_fd_put_notify(void *opaque)
-{
-    FdMigrationState *s = opaque;
-
-    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
-    qemu_file_put_notify(s->file);
-}
-
 ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
 {
     FdMigrationState *s = opaque;
@@ -327,9 +317,7 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
     if (ret == -1)
         ret = -(s->get_error(s));
 
-    if (ret == -EAGAIN) {
-        qemu_set_fd_handler2(s->fd, NULL, NULL, migrate_fd_put_notify, s);
-    } else if (ret < 0) {
+    if (ret < 0 && ret != -EAGAIN) {
         if (s->mon) {
             monitor_resume(s->mon);
         }
@@ -342,36 +330,40 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
 
 void migrate_fd_connect(FdMigrationState *s)
 {
-    int ret;
-
+    s->begin = 1;
     s->file = qemu_fopen_ops_buffered(s,
                                       s->bandwidth_limit,
                                       migrate_fd_put_buffer,
                                       migrate_fd_put_ready,
                                       migrate_fd_wait_for_unfreeze,
                                       migrate_fd_close);
-
-    DPRINTF("beginning savevm\n");
-    ret = qemu_savevm_state_begin(s->mon, s->file, s->mig_state.blk,
-                                  s->mig_state.shared);
-    if (ret < 0) {
-        DPRINTF("failed, %d\n", ret);
-        migrate_fd_error(s);
-        return;
-    }
-    
-    migrate_fd_put_ready(s);
 }
 
 void migrate_fd_put_ready(void *opaque)
 {
     FdMigrationState *s = opaque;
+    int ret;
 
     if (s->state != MIG_STATE_ACTIVE) {
         DPRINTF("put_ready returning because of non-active state\n");
+        if (s->state == MIG_STATE_CANCELLED) {
+            migrate_fd_terminate(s);
+        }
         return;
     }
 
+    if (s->begin) {
+        DPRINTF("beginning savevm\n");
+        ret = qemu_savevm_state_begin(s->mon, s->file, s->mig_state.blk,
+                s->mig_state.shared);
+        if (ret < 0) {
+            DPRINTF("failed, %d\n", ret);
+            migrate_fd_error(s);
+            return;
+        }
+        s->begin = 0;
+    }
+
     DPRINTF("iterate\n");
     if (qemu_savevm_state_iterate(s->mon, s->file) == 1) {
         int state;
@@ -415,6 +407,10 @@ void migrate_fd_cancel(MigrationState *mig_state)
     DPRINTF("cancelling migration\n");
 
     s->state = MIG_STATE_CANCELLED;
+}
+
+void migrate_fd_terminate(FdMigrationState *s)
+{
     notifier_list_notify(&migration_state_notifiers);
     qemu_savevm_state_cancel(s->mon, s->file);
 
@@ -458,7 +454,6 @@ int migrate_fd_close(void *opaque)
 {
     FdMigrationState *s = opaque;
 
-    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
     return s->close(s);
 }
 
diff --git a/migration.h b/migration.h
index 050c56c..887f84c 100644
--- a/migration.h
+++ b/migration.h
@@ -45,9 +45,11 @@ struct FdMigrationState
     int fd;
     Monitor *mon;
     int state;
+    int begin;
     int (*get_error)(struct FdMigrationState*);
     int (*close)(struct FdMigrationState*);
     int (*write)(struct FdMigrationState*, const void *, size_t);
+    void (*callback)(void *);
     void *opaque;
 };
 
@@ -118,12 +120,16 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size);
 
 void migrate_fd_connect(FdMigrationState *s);
 
+void migrate_fd_begin(void *opaque);
+
 void migrate_fd_put_ready(void *opaque);
 
 int migrate_fd_get_status(MigrationState *mig_state);
 
 void migrate_fd_cancel(MigrationState *mig_state);
 
+void migrate_fd_terminate(FdMigrationState *s);
+
 void migrate_fd_release(MigrationState *mig_state);
 
 void migrate_fd_wait_for_unfreeze(void *opaque);
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC PATCH v3 2/4] Making iothread block for migrate_cancel
  2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
  2011-08-11 15:32 ` [RFC PATCH v3 1/4] separate " Umesh Deshpande
@ 2011-08-11 15:32 ` Umesh Deshpande
  2011-08-11 15:32 ` [RFC PATCH v3 3/4] lock to protect memslots Umesh Deshpande
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-11 15:32 UTC (permalink / raw)
  To: kvm; +Cc: quintela, mtosatti, pbonzini, Umesh Deshpande

The following patch makes the iothread wait until the migration thread has
responded to the migrate_cancel request and terminated its execution.

Signed-off-by: Umesh Deshpande <udeshpan@redhat.com>
---
 buffered_file.c     |   13 ++++++++++++-
 hw/hw.h             |    5 ++++-
 migration.c         |    1 +
 qemu-thread-posix.c |   10 ++++++++++
 qemu-thread.h       |    1 +
 savevm.c            |   31 +++++++++++++++++++++----------
 6 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 19932b6..b64ada7 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -223,6 +223,16 @@ static int64_t buffered_get_rate_limit(void *opaque)
     return s->xfer_limit;
 }
 
+static void buffered_wait_for_cancel(void *opaque)
+{
+    QEMUFileBuffered *s = opaque;
+    QemuThread thread = s->thread;
+
+    qemu_mutex_unlock_iothread();
+    qemu_thread_join(thread);
+    qemu_mutex_lock_iothread();
+}
+
 static void *migrate_vm(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
@@ -288,7 +298,8 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
     s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
                              buffered_close, buffered_rate_limit,
                              buffered_set_rate_limit,
-                             buffered_get_rate_limit);
+                             buffered_get_rate_limit,
+                             buffered_wait_for_cancel);
 
     qemu_thread_create(&s->thread, migrate_vm, s);
 
diff --git a/hw/hw.h b/hw/hw.h
index 9dd7096..e1d5ea8 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -41,13 +41,15 @@ typedef int (QEMUFileRateLimit)(void *opaque);
  */
 typedef int64_t (QEMUFileSetRateLimit)(void *opaque, int64_t new_rate);
 typedef int64_t (QEMUFileGetRateLimit)(void *opaque);
+typedef void (QEMUFileWaitForCancel)(void *opaque);
 
 QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
                          QEMUFileGetBufferFunc *get_buffer,
                          QEMUFileCloseFunc *close,
                          QEMUFileRateLimit *rate_limit,
                          QEMUFileSetRateLimit *set_rate_limit,
-			 QEMUFileGetRateLimit *get_rate_limit);
+                         QEMUFileGetRateLimit *get_rate_limit,
+                         QEMUFileWaitForCancel *wait_for_cancel);
 QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd);
@@ -56,6 +58,7 @@ QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
 int qemu_stdio_fd(QEMUFile *f);
 void qemu_fflush(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
+void qemu_wait_for_cancel(QEMUFile *f);
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
 void qemu_put_byte(QEMUFile *f, int v);
 
diff --git a/migration.c b/migration.c
index d8a0abb..c19a206 100644
--- a/migration.c
+++ b/migration.c
@@ -407,6 +407,7 @@ void migrate_fd_cancel(MigrationState *mig_state)
     DPRINTF("cancelling migration\n");
 
     s->state = MIG_STATE_CANCELLED;
+    qemu_wait_for_cancel(s->file);
 }
 
 void migrate_fd_terminate(FdMigrationState *s)
diff --git a/qemu-thread-posix.c b/qemu-thread-posix.c
index 2bd02ef..0d18b35 100644
--- a/qemu-thread-posix.c
+++ b/qemu-thread-posix.c
@@ -115,6 +115,16 @@ void qemu_cond_wait(QemuCond *cond, QemuMutex *mutex)
         error_exit(err, __func__);
 }
 
+void qemu_thread_join(QemuThread thread)
+{
+    int err;
+
+    err = pthread_join(thread.thread, NULL);
+    if (err) {
+        error_exit(err, __func__);
+    }
+}
+
 void qemu_thread_create(QemuThread *thread,
                        void *(*start_routine)(void*),
                        void *arg)
diff --git a/qemu-thread.h b/qemu-thread.h
index 0a73d50..909529f 100644
--- a/qemu-thread.h
+++ b/qemu-thread.h
@@ -30,6 +30,7 @@ void qemu_cond_destroy(QemuCond *cond);
 void qemu_cond_signal(QemuCond *cond);
 void qemu_cond_broadcast(QemuCond *cond);
 void qemu_cond_wait(QemuCond *cond, QemuMutex *mutex);
+void qemu_thread_join(QemuThread thread);
 
 void qemu_thread_create(QemuThread *thread,
                        void *(*start_routine)(void*),
diff --git a/savevm.c b/savevm.c
index 8139bc7..6bebf7e 100644
--- a/savevm.c
+++ b/savevm.c
@@ -164,6 +164,7 @@ struct QEMUFile {
     QEMUFileRateLimit *rate_limit;
     QEMUFileSetRateLimit *set_rate_limit;
     QEMUFileGetRateLimit *get_rate_limit;
+    QEMUFileWaitForCancel *wait_for_cancel;
     void *opaque;
     int is_write;
 
@@ -261,10 +262,10 @@ QEMUFile *qemu_popen(FILE *stdio_file, const char *mode)
 
     if(mode[0] == 'r') {
         s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_pclose, 
-				 NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
     } else {
         s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_pclose, 
-				 NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
     }
     return s->file;
 }
@@ -310,10 +311,10 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
 
     if(mode[0] == 'r') {
         s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_fclose, 
-				 NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
     } else {
         s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_fclose, 
-				 NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
     }
     return s->file;
 
@@ -328,7 +329,7 @@ QEMUFile *qemu_fopen_socket(int fd)
 
     s->fd = fd;
     s->file = qemu_fopen_ops(s, NULL, socket_get_buffer, socket_close, 
-			     NULL, NULL, NULL);
+            NULL, NULL, NULL, NULL);
     return s->file;
 }
 
@@ -366,10 +367,10 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
     
     if(mode[0] == 'w') {
         s->file = qemu_fopen_ops(s, file_put_buffer, NULL, stdio_fclose, 
-				 NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
     } else {
         s->file = qemu_fopen_ops(s, NULL, file_get_buffer, stdio_fclose, 
-			       NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
     }
     return s->file;
 fail:
@@ -398,8 +399,9 @@ static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 {
     if (is_writable)
         return qemu_fopen_ops(bs, block_put_buffer, NULL, bdrv_fclose, 
-			      NULL, NULL, NULL);
-    return qemu_fopen_ops(bs, NULL, block_get_buffer, bdrv_fclose, NULL, NULL, NULL);
+                NULL, NULL, NULL, NULL);
+    return qemu_fopen_ops(bs, NULL, block_get_buffer, bdrv_fclose, NULL, NULL,
+            NULL, NULL);
 }
 
 QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
@@ -407,7 +409,8 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
                          QEMUFileCloseFunc *close,
                          QEMUFileRateLimit *rate_limit,
                          QEMUFileSetRateLimit *set_rate_limit,
-                         QEMUFileGetRateLimit *get_rate_limit)
+                         QEMUFileGetRateLimit *get_rate_limit,
+                         QEMUFileWaitForCancel *wait_for_cancel)
 {
     QEMUFile *f;
 
@@ -420,6 +423,7 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
     f->rate_limit = rate_limit;
     f->set_rate_limit = set_rate_limit;
     f->get_rate_limit = get_rate_limit;
+    f->wait_for_cancel = wait_for_cancel;
     f->is_write = 0;
 
     return f;
@@ -481,6 +485,13 @@ int qemu_fclose(QEMUFile *f)
     return ret;
 }
 
+void qemu_wait_for_cancel(QEMUFile *f)
+{
+    if (f->wait_for_cancel) {
+        f->wait_for_cancel(f->opaque);
+    }
+}
+
 void qemu_file_put_notify(QEMUFile *f)
 {
     f->put_buffer(f->opaque, NULL, 0, 0);
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
  2011-08-11 15:32 ` [RFC PATCH v3 1/4] separate " Umesh Deshpande
  2011-08-11 15:32 ` [RFC PATCH v3 2/4] Making iothread block for migrate_cancel Umesh Deshpande
@ 2011-08-11 15:32 ` Umesh Deshpande
  2011-08-11 16:20   ` Paolo Bonzini
  2011-08-11 15:32 ` [RFC PATCH v3 4/4] Separate migration bitmap Umesh Deshpande
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-11 15:32 UTC (permalink / raw)
  To: kvm; +Cc: quintela, mtosatti, pbonzini, Umesh Deshpande

The following patch introduces a mutex that protects the migration thread
against the removal of memslots during a guest migration iteration.

Signed-off-by: Umesh Deshpande <udeshpan@redhat.com>
---
 arch_init.c     |   10 ++++++++++
 buffered_file.c |    4 ++++
 cpu-all.h       |    2 ++
 cpus.c          |   12 ++++++++++++
 exec.c          |   10 ++++++++++
 qemu-common.h   |    2 ++
 6 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 484b39d..f0ddda6 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -298,6 +298,11 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     bytes_transferred_last = bytes_transferred;
     bwidth = qemu_get_clock_ns(rt_clock);
 
+    if (stage != 3) {
+        qemu_mutex_lock_ramlist();
+        qemu_mutex_unlock_iothread();
+    }
+
     while (!qemu_file_rate_limit(f)) {
         int bytes_sent;
 
@@ -308,6 +313,11 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
         }
     }
 
+    if (stage != 3) {
+        qemu_mutex_lock_iothread();
+        qemu_mutex_unlock_ramlist();
+    }
+
     bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
     bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
 
diff --git a/buffered_file.c b/buffered_file.c
index b64ada7..5735e18 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -243,7 +243,9 @@ static void *migrate_vm(void *opaque)
 
     while (!s->closed) {
         if (s->freeze_output) {
+            qemu_mutex_unlock_iothread();
             s->wait_for_unfreeze(s);
+            qemu_mutex_lock_iothread();
             s->freeze_output = 0;
             continue;
         }
@@ -255,7 +257,9 @@ static void *migrate_vm(void *opaque)
         current_time = qemu_get_clock_ms(rt_clock);
         if (!s->closed && (expire_time > current_time)) {
             tv.tv_usec = 1000 * (expire_time - current_time);
+            qemu_mutex_unlock_iothread();
             select(0, NULL, NULL, NULL, &tv);
+            qemu_mutex_lock_iothread();
             continue;
         }
 
diff --git a/cpu-all.h b/cpu-all.h
index e839100..6a5dbb3 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -21,6 +21,7 @@
 
 #include "qemu-common.h"
 #include "cpu-common.h"
+#include "qemu-thread.h"
 
 /* some important defines:
  *
@@ -931,6 +932,7 @@ typedef struct RAMBlock {
 } RAMBlock;
 
 typedef struct RAMList {
+    QemuMutex mutex;
     uint8_t *phys_dirty;
     QLIST_HEAD(ram, RAMBlock) blocks;
 } RAMList;
diff --git a/cpus.c b/cpus.c
index de70e02..6090c44 100644
--- a/cpus.c
+++ b/cpus.c
@@ -666,6 +666,7 @@ int qemu_init_main_loop(void)
     qemu_cond_init(&qemu_work_cond);
     qemu_mutex_init(&qemu_fair_mutex);
     qemu_mutex_init(&qemu_global_mutex);
+    qemu_mutex_init(&ram_list.mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
     qemu_thread_get_self(&io_thread);
@@ -919,6 +920,17 @@ void qemu_mutex_unlock_iothread(void)
     qemu_mutex_unlock(&qemu_global_mutex);
 }
 
+void qemu_mutex_lock_ramlist(void)
+{
+    qemu_mutex_lock(&ram_list.mutex);
+}
+
+void qemu_mutex_unlock_ramlist(void)
+{
+    qemu_mutex_unlock(&ram_list.mutex);
+}
+
+
 static int all_vcpus_paused(void)
 {
     CPUState *penv = first_cpu;
diff --git a/exec.c b/exec.c
index 0e2ce57..7bfb36f 100644
--- a/exec.c
+++ b/exec.c
@@ -2972,6 +2972,8 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
     }
     new_block->length = size;
 
+    qemu_mutex_lock_ramlist();
+
     QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
 
     ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
@@ -2979,6 +2981,8 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
     memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
            0xff, size >> TARGET_PAGE_BITS);
 
+    qemu_mutex_unlock_ramlist();
+
     if (kvm_enabled())
         kvm_setup_guest_memory(new_block->host, size);
 
@@ -2996,7 +3000,9 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (addr == block->offset) {
+            qemu_mutex_lock_ramlist();
             QLIST_REMOVE(block, next);
+            qemu_mutex_unlock_ramlist();
             qemu_free(block);
             return;
         }
@@ -3009,7 +3015,9 @@ void qemu_ram_free(ram_addr_t addr)
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (addr == block->offset) {
+            qemu_mutex_lock_ramlist();
             QLIST_REMOVE(block, next);
+            qemu_mutex_unlock_ramlist();
             if (block->flags & RAM_PREALLOC_MASK) {
                 ;
             } else if (mem_path) {
@@ -3117,8 +3125,10 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
         if (addr - block->offset < block->length) {
             /* Move this entry to to start of the list.  */
             if (block != QLIST_FIRST(&ram_list.blocks)) {
+                qemu_mutex_lock_ramlist();
                 QLIST_REMOVE(block, next);
                 QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
+                qemu_mutex_unlock_ramlist();
             }
             if (xen_mapcache_enabled()) {
                 /* We need to check if the requested address is in the RAM
diff --git a/qemu-common.h b/qemu-common.h
index abd7a75..b802883 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -212,6 +212,8 @@ char *qemu_strndup(const char *str, size_t size);
 
 void qemu_mutex_lock_iothread(void);
 void qemu_mutex_unlock_iothread(void);
+void qemu_mutex_lock_ramlist(void);
+void qemu_mutex_unlock_ramlist(void);
 
 int qemu_open(const char *name, int flags, ...);
 ssize_t qemu_write_full(int fd, const void *buf, size_t count)
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC PATCH v3 4/4] Separate migration bitmap
  2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
                   ` (2 preceding siblings ...)
  2011-08-11 15:32 ` [RFC PATCH v3 3/4] lock to protect memslots Umesh Deshpande
@ 2011-08-11 15:32 ` Umesh Deshpande
  2011-08-11 16:23 ` [RFC PATCH v3 0/4] Separate thread for VM migration Paolo Bonzini
  2011-08-11 18:25 ` Anthony Liguori
  5 siblings, 0 replies; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-11 15:32 UTC (permalink / raw)
  To: kvm; +Cc: quintela, mtosatti, pbonzini, Umesh Deshpande

This patch creates a migration bitmap, which is periodically kept in sync with
the qemu dirty bitmap. A separate copy of the dirty bitmap for the migration
avoids concurrent access to the qemu bitmap from the iothread and the
migration thread.

Signed-off-by: Umesh Deshpande <udeshpan@redhat.com>
---
 arch_init.c |   16 ++++++++------
 cpu-all.h   |   37 ++++++++++++++++++++++++++++++++++
 exec.c      |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 7 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index f0ddda6..296b7d6 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -123,13 +123,13 @@ static int ram_save_block(QEMUFile *f)
     current_addr = block->offset + offset;
 
     do {
-        if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) {
+        if (migration_bitmap_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) {
             uint8_t *p;
             int cont = (block == last_block) ? RAM_SAVE_FLAG_CONTINUE : 0;
 
-            cpu_physical_memory_reset_dirty(current_addr,
-                                            current_addr + TARGET_PAGE_SIZE,
-                                            MIGRATION_DIRTY_FLAG);
+            migration_bitmap_reset_dirty(current_addr,
+                                         current_addr + TARGET_PAGE_SIZE,
+                                         MIGRATION_DIRTY_FLAG);
 
             p = block->host + offset;
 
@@ -185,7 +185,7 @@ static ram_addr_t ram_save_remaining(void)
         ram_addr_t addr;
         for (addr = block->offset; addr < block->offset + block->length;
              addr += TARGET_PAGE_SIZE) {
-            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
+            if (migration_bitmap_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
                 count++;
             }
         }
@@ -265,6 +265,8 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
         return 0;
     }
 
+    sync_migration_bitmap(0, TARGET_PHYS_ADDR_MAX);
+
     if (stage == 1) {
         RAMBlock *block;
         bytes_transferred = 0;
@@ -276,9 +278,9 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
         QLIST_FOREACH(block, &ram_list.blocks, next) {
             for (addr = block->offset; addr < block->offset + block->length;
                  addr += TARGET_PAGE_SIZE) {
-                if (!cpu_physical_memory_get_dirty(addr,
+                if (!migration_bitmap_get_dirty(addr,
                                                    MIGRATION_DIRTY_FLAG)) {
-                    cpu_physical_memory_set_dirty(addr);
+                    migration_bitmap_set_dirty(addr);
                 }
             }
         }
diff --git a/cpu-all.h b/cpu-all.h
index 6a5dbb3..34a225b 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -934,6 +934,7 @@ typedef struct RAMBlock {
 typedef struct RAMList {
     QemuMutex mutex;
     uint8_t *phys_dirty;
+    uint8_t *migration_bitmap;
     QLIST_HEAD(ram, RAMBlock) blocks;
 } RAMList;
 extern RAMList ram_list;
@@ -1006,8 +1007,44 @@ static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
     }
 }
 
+
+
 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
                                      int dirty_flags);
+
+static inline int migration_bitmap_get_dirty(ram_addr_t addr,
+                                             int dirty_flags)
+{
+    return ram_list.migration_bitmap[addr >> TARGET_PAGE_BITS] & dirty_flags;
+}
+
+static inline void migration_bitmap_set_dirty(ram_addr_t addr)
+{
+    ram_list.migration_bitmap[addr >> TARGET_PAGE_BITS] = 0xff;
+}
+
+static inline void migration_bitmap_mask_dirty_range(ram_addr_t start,
+                                                     int length,
+                                                     int dirty_flags)
+{
+    int i, mask, len;
+    uint8_t *p;
+
+    len = length >> TARGET_PAGE_BITS;
+    mask = ~dirty_flags;
+    p = ram_list.migration_bitmap + (start >> TARGET_PAGE_BITS);
+    for (i = 0; i < len; i++) {
+        p[i] &= mask;
+    }
+}
+
+
+void migration_bitmap_reset_dirty(ram_addr_t start,
+                                  ram_addr_t end,
+                                  int dirty_flags);
+
+void sync_migration_bitmap(ram_addr_t start, ram_addr_t end);
+
 void cpu_tlb_update_dirty(CPUState *env);
 
 int cpu_physical_memory_set_dirty_tracking(int enable);
diff --git a/exec.c b/exec.c
index 7bfb36f..f758c4e 100644
--- a/exec.c
+++ b/exec.c
@@ -2106,6 +2106,10 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
         abort();
     }
 
+    if (kvm_enabled()) {
+        return;
+    }
+
     for(env = first_cpu; env != NULL; env = env->next_cpu) {
         int mmu_idx;
         for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
@@ -2114,8 +2118,61 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
                                       start1, length);
         }
     }
+
+}
+
+void migration_bitmap_reset_dirty(ram_addr_t start, ram_addr_t end,
+                                  int dirty_flags)
+{
+    unsigned long length, start1;
+
+    start &= TARGET_PAGE_MASK;
+    end = TARGET_PAGE_ALIGN(end);
+
+    length = end - start;
+    if (length == 0) {
+        return;
+    }
+
+    migration_bitmap_mask_dirty_range(start, length, dirty_flags);
+
+    /* we modify the TLB cache so that the dirty bit will be set again
+       when accessing the range */
+    start1 = (unsigned long)qemu_safe_ram_ptr(start);
+    /* Check that we don't span multiple blocks - this breaks the
+       address comparisons below.  */
+    if ((unsigned long)qemu_safe_ram_ptr(end - 1) - start1
+            != (end - 1) - start) {
+        abort();
+    }
+}
+
+void sync_migration_bitmap(ram_addr_t start, ram_addr_t end)
+{
+    unsigned long length, len, i;
+    ram_addr_t addr;
+    start &= TARGET_PAGE_MASK;
+    end = TARGET_PAGE_ALIGN(end);
+
+    length = end - start;
+    if (length == 0) {
+        return;
+    }
+
+    len = length >> TARGET_PAGE_BITS;
+    for (i = 0; i < len; i++) {
+        addr = i << TARGET_PAGE_BITS;
+        if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
+            migration_bitmap_set_dirty(addr);
+            cpu_physical_memory_reset_dirty(addr, addr + TARGET_PAGE_SIZE,
+                                            MIGRATION_DIRTY_FLAG);
+        }
+    }
+
 }
 
+
+
 int cpu_physical_memory_set_dirty_tracking(int enable)
 {
     int ret = 0;
@@ -2981,6 +3038,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
     memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
            0xff, size >> TARGET_PAGE_BITS);
 
+    ram_list.migration_bitmap = qemu_realloc(ram_list.phys_dirty,
+                                             last_ram_offset() >>
+                                             TARGET_PAGE_BITS);
+
+    memset(ram_list.migration_bitmap + (new_block->offset >> TARGET_PAGE_BITS),
+           0xff, size >> TARGET_PAGE_BITS);
+
     qemu_mutex_unlock_ramlist();
 
     if (kvm_enabled())
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 1/4] separate thread for VM migration
  2011-08-11 15:32 ` [RFC PATCH v3 1/4] separate " Umesh Deshpande
@ 2011-08-11 16:18   ` Paolo Bonzini
  2011-08-11 17:36     ` Umesh Deshpande
  0 siblings, 1 reply; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-11 16:18 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: kvm, quintela, mtosatti

On 08/11/2011 05:32 PM, Umesh Deshpande wrote:
> This patch creates a separate thread for the guest migration on the source side.
> migrate_cancel request from the iothread is handled asynchronously. That is,
> iothread submits migrate_cancel to the migration thread and returns, while the
> migration thread attends this request at the next iteration to terminate its
> execution.

Looks pretty good!  I hope you agree. :)  Just one note inside.

> Signed-off-by: Umesh Deshpande<udeshpan@redhat.com>
> ---
>   buffered_file.c |   85 ++++++++++++++++++++++++++++++++----------------------
>   buffered_file.h |    4 ++
>   migration.c     |   49 ++++++++++++++-----------------
>   migration.h     |    6 ++++
>   4 files changed, 82 insertions(+), 62 deletions(-)
>
> diff --git a/buffered_file.c b/buffered_file.c
> index 41b42c3..19932b6 100644
> --- a/buffered_file.c
> +++ b/buffered_file.c
> @@ -16,6 +16,8 @@
>   #include "qemu-timer.h"
>   #include "qemu-char.h"
>   #include "buffered_file.h"
> +#include "migration.h"
> +#include "qemu-thread.h"
>
>   //#define DEBUG_BUFFERED_FILE
>
> @@ -28,13 +30,14 @@ typedef struct QEMUFileBuffered
>       void *opaque;
>       QEMUFile *file;
>       int has_error;
> +    int closed;
>       int freeze_output;
>       size_t bytes_xfer;
>       size_t xfer_limit;
>       uint8_t *buffer;
>       size_t buffer_size;
>       size_t buffer_capacity;
> -    QEMUTimer *timer;
> +    QemuThread thread;
>   } QEMUFileBuffered;
>
>   #ifdef DEBUG_BUFFERED_FILE
> @@ -155,14 +158,6 @@ static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, in
>           offset = size;
>       }
>
> -    if (pos == 0&&  size == 0) {
> -        DPRINTF("file is ready\n");
> -        if (s->bytes_xfer<= s->xfer_limit) {
> -            DPRINTF("notifying client\n");
> -            s->put_ready(s->opaque);
> -        }
> -    }
> -
>       return offset;
>   }
>
> @@ -175,20 +170,20 @@ static int buffered_close(void *opaque)
>
>       while (!s->has_error&&  s->buffer_size) {
>           buffered_flush(s);
> -        if (s->freeze_output)
> +        if (s->freeze_output) {
>               s->wait_for_unfreeze(s);
> +        }
>       }

This is racy; you might end up calling buffered_put_buffer twice from 
two different threads.

> -    ret = s->close(s->opaque);
> +    s->closed = 1;
>
> -    qemu_del_timer(s->timer);
> -    qemu_free_timer(s->timer);
> +    ret = s->close(s->opaque);
>       qemu_free(s->buffer);
> -    qemu_free(s);

... similarly, here the migration thread might end up using the buffer. 
  Just set s->closed here and wait for thread completion; the migration 
thread can handle the flushes free the buffer etc.  Let the migration 
thread do as much as possible, it will simplify your life.

>       return ret;
>   }
>
> +
>   static int buffered_rate_limit(void *opaque)
>   {
>       QEMUFileBuffered *s = opaque;
> @@ -228,34 +223,55 @@ static int64_t buffered_get_rate_limit(void *opaque)
>       return s->xfer_limit;
>   }
>
> -static void buffered_rate_tick(void *opaque)
> +static void *migrate_vm(void *opaque)
>   {
>       QEMUFileBuffered *s = opaque;
> +    int64_t current_time, expire_time = qemu_get_clock_ms(rt_clock) + 100;
> +    struct timeval tv = { .tv_sec = 0, .tv_usec = 100000};
>
> -    if (s->has_error) {
> -        buffered_close(s);
> -        return;
> -    }
> +    qemu_mutex_lock_iothread();
>
> -    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
> +    while (!s->closed) {

... This can be in fact

     while (!s->closed || s->buffered_size)

and that alone will subsume the loop in buffered_close, no?

> +        if (s->freeze_output) {
> +            s->wait_for_unfreeze(s);
> +            s->freeze_output = 0;
> +            continue;
> +        }
>
> -    if (s->freeze_output)
> -        return;
> +        if (s->has_error) {
> +            break;
> +        }
> +
> +        current_time = qemu_get_clock_ms(rt_clock);
> +        if (!s->closed&&  (expire_time>  current_time)) {
> +            tv.tv_usec = 1000 * (expire_time - current_time);
> +            select(0, NULL, NULL, NULL,&tv);
> +            continue;
> +        }
>
> -    s->bytes_xfer = 0;
> +        s->bytes_xfer = 0;
> +        buffered_flush(s);
>
> -    buffered_flush(s);
> +        expire_time = qemu_get_clock_ms(rt_clock) + 100;
> +        s->put_ready(s->opaque);
> +    }
>
> -    /* Add some checks around this */
> -    s->put_ready(s->opaque);
> +    if (s->has_error) {
> +        buffered_close(s);
> +    }
> +    qemu_free(s);
> +
> +    qemu_mutex_unlock_iothread();
> +
> +    return NULL;
>   }
>
>   QEMUFile *qemu_fopen_ops_buffered(void *opaque,
> -                                  size_t bytes_per_sec,
> -                                  BufferedPutFunc *put_buffer,
> -                                  BufferedPutReadyFunc *put_ready,
> -                                  BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
> -                                  BufferedCloseFunc *close)
> +        size_t bytes_per_sec,
> +        BufferedPutFunc *put_buffer,
> +        BufferedPutReadyFunc *put_ready,
> +        BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
> +        BufferedCloseFunc *close)
>   {
>       QEMUFileBuffered *s;
>
> @@ -267,15 +283,14 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
>       s->put_ready = put_ready;
>       s->wait_for_unfreeze = wait_for_unfreeze;
>       s->close = close;
> +    s->closed = 0;
>
>       s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
>                                buffered_close, buffered_rate_limit,
>                                buffered_set_rate_limit,
> -			     buffered_get_rate_limit);
> -
> -    s->timer = qemu_new_timer_ms(rt_clock, buffered_rate_tick, s);
> +                             buffered_get_rate_limit);
>
> -    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
> +    qemu_thread_create(&s->thread, migrate_vm, s);
>
>       return s->file;
>   }
> diff --git a/buffered_file.h b/buffered_file.h
> index 98d358b..477bf7c 100644
> --- a/buffered_file.h
> +++ b/buffered_file.h
> @@ -17,9 +17,13 @@
>   #include "hw/hw.h"
>
>   typedef ssize_t (BufferedPutFunc)(void *opaque, const void *data, size_t size);
> +typedef void (BufferedBeginFunc)(void *opaque);

Unused typedef.

>   typedef void (BufferedPutReadyFunc)(void *opaque);
>   typedef void (BufferedWaitForUnfreezeFunc)(void *opaque);
>   typedef int (BufferedCloseFunc)(void *opaque);
> +typedef void (BufferedWaitForCancelFunc)(void *opaque);
> +
> +void wait_for_cancel(void *opaque);

BufferedWaitForCancelFunc should go in patch 2; wait_for_cancel is unused.

>   QEMUFile *qemu_fopen_ops_buffered(void *opaque, size_t xfer_limit,
>                                     BufferedPutFunc *put_buffer,
> diff --git a/migration.c b/migration.c
> index af3a1f2..d8a0abb 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -284,8 +284,6 @@ int migrate_fd_cleanup(FdMigrationState *s)
>   {
>       int ret = 0;
>
> -    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
> -
>       if (s->file) {
>           DPRINTF("closing file\n");
>           if (qemu_fclose(s->file) != 0) {
> @@ -307,14 +305,6 @@ int migrate_fd_cleanup(FdMigrationState *s)
>       return ret;
>   }
>
> -void migrate_fd_put_notify(void *opaque)
> -{
> -    FdMigrationState *s = opaque;
> -
> -    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
> -    qemu_file_put_notify(s->file);
> -}
> -

qemu_file_put_notify is also unused now.

>   ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
>   {
>       FdMigrationState *s = opaque;
> @@ -327,9 +317,7 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
>       if (ret == -1)
>           ret = -(s->get_error(s));
>
> -    if (ret == -EAGAIN) {
> -        qemu_set_fd_handler2(s->fd, NULL, NULL, migrate_fd_put_notify, s);
> -    } else if (ret<  0) {
> +    if (ret<  0&&  ret != -EAGAIN) {
>           if (s->mon) {
>               monitor_resume(s->mon);
>           }
> @@ -342,36 +330,40 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
>
>   void migrate_fd_connect(FdMigrationState *s)
>   {
> -    int ret;
> -
> +    s->begin = 1;
>       s->file = qemu_fopen_ops_buffered(s,
>                                         s->bandwidth_limit,
>                                         migrate_fd_put_buffer,
>                                         migrate_fd_put_ready,
>                                         migrate_fd_wait_for_unfreeze,
>                                         migrate_fd_close);
> -
> -    DPRINTF("beginning savevm\n");
> -    ret = qemu_savevm_state_begin(s->mon, s->file, s->mig_state.blk,
> -                                  s->mig_state.shared);
> -    if (ret<  0) {
> -        DPRINTF("failed, %d\n", ret);
> -        migrate_fd_error(s);
> -        return;
> -    }
> -
> -    migrate_fd_put_ready(s);
>   }
>
>   void migrate_fd_put_ready(void *opaque)
>   {
>       FdMigrationState *s = opaque;
> +    int ret;
>
>       if (s->state != MIG_STATE_ACTIVE) {
>           DPRINTF("put_ready returning because of non-active state\n");
> +        if (s->state == MIG_STATE_CANCELLED) {
> +            migrate_fd_terminate(s);
> +        }
>           return;
>       }
>
> +    if (s->begin) {
> +        DPRINTF("beginning savevm\n");
> +        ret = qemu_savevm_state_begin(s->mon, s->file, s->mig_state.blk,
> +                s->mig_state.shared);
> +        if (ret<  0) {
> +            DPRINTF("failed, %d\n", ret);
> +            migrate_fd_error(s);
> +            return;
> +        }
> +        s->begin = 0;
> +    }
> +
>       DPRINTF("iterate\n");
>       if (qemu_savevm_state_iterate(s->mon, s->file) == 1) {
>           int state;
> @@ -415,6 +407,10 @@ void migrate_fd_cancel(MigrationState *mig_state)
>       DPRINTF("cancelling migration\n");
>
>       s->state = MIG_STATE_CANCELLED;
> +}
> +
> +void migrate_fd_terminate(FdMigrationState *s)
> +{
>       notifier_list_notify(&migration_state_notifiers);
>       qemu_savevm_state_cancel(s->mon, s->file);
>
> @@ -458,7 +454,6 @@ int migrate_fd_close(void *opaque)
>   {
>       FdMigrationState *s = opaque;
>
> -    qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
>       return s->close(s);
>   }
>
> diff --git a/migration.h b/migration.h
> index 050c56c..887f84c 100644
> --- a/migration.h
> +++ b/migration.h
> @@ -45,9 +45,11 @@ struct FdMigrationState
>       int fd;
>       Monitor *mon;
>       int state;
> +    int begin;
>       int (*get_error)(struct FdMigrationState*);
>       int (*close)(struct FdMigrationState*);
>       int (*write)(struct FdMigrationState*, const void *, size_t);
> +    void (*callback)(void *);
>       void *opaque;
>   };
>
> @@ -118,12 +120,16 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size);
>
>   void migrate_fd_connect(FdMigrationState *s);
>
> +void migrate_fd_begin(void *opaque);
> +
>   void migrate_fd_put_ready(void *opaque);
>
>   int migrate_fd_get_status(MigrationState *mig_state);
>
>   void migrate_fd_cancel(MigrationState *mig_state);
>
> +void migrate_fd_terminate(FdMigrationState *s);
> +
>   void migrate_fd_release(MigrationState *mig_state);
>
>   void migrate_fd_wait_for_unfreeze(void *opaque);


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-11 15:32 ` [RFC PATCH v3 3/4] lock to protect memslots Umesh Deshpande
@ 2011-08-11 16:20   ` Paolo Bonzini
  2011-08-12  6:45     ` Paolo Bonzini
  0 siblings, 1 reply; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-11 16:20 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: kvm, quintela, mtosatti

> diff --git a/buffered_file.c b/buffered_file.c
> index b64ada7..5735e18 100644
> --- a/buffered_file.c
> +++ b/buffered_file.c
> @@ -243,7 +243,9 @@ static void *migrate_vm(void *opaque)
>
>       while (!s->closed) {
>           if (s->freeze_output) {
> +            qemu_mutex_unlock_iothread();
>               s->wait_for_unfreeze(s);
> +            qemu_mutex_lock_iothread();
>               s->freeze_output = 0;
>               continue;
>           }
> @@ -255,7 +257,9 @@ static void *migrate_vm(void *opaque)
>           current_time = qemu_get_clock_ms(rt_clock);
>           if (!s->closed&&  (expire_time>  current_time)) {
>               tv.tv_usec = 1000 * (expire_time - current_time);
> +            qemu_mutex_unlock_iothread();
>               select(0, NULL, NULL, NULL,&tv);
> +            qemu_mutex_lock_iothread();
>               continue;
>           }
>

I believe these should be already in patch 1 or anyway in a separate patch.

> diff --git a/cpus.c b/cpus.c
> index de70e02..6090c44 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -666,6 +666,7 @@ int qemu_init_main_loop(void)
>       qemu_cond_init(&qemu_work_cond);
>       qemu_mutex_init(&qemu_fair_mutex);
>       qemu_mutex_init(&qemu_global_mutex);
> +    qemu_mutex_init(&ram_list.mutex);
>       qemu_mutex_lock(&qemu_global_mutex);
>
>       qemu_thread_get_self(&io_thread);

These are only executed if CONFIG_IOTHREAD; you can probably move it 
somewhere in exec.c.

> @@ -919,6 +920,17 @@ void qemu_mutex_unlock_iothread(void)
>       qemu_mutex_unlock(&qemu_global_mutex);
>   }
>
> +void qemu_mutex_lock_ramlist(void)
> +{
> +    qemu_mutex_lock(&ram_list.mutex);
> +}
> +
> +void qemu_mutex_unlock_ramlist(void)
> +{
> +    qemu_mutex_unlock(&ram_list.mutex);
> +}
> +
> +

And these too.

>   static int all_vcpus_paused(void)
>   {
>       CPUState *penv = first_cpu;
> diff --git a/exec.c b/exec.c
> index 0e2ce57..7bfb36f 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -2972,6 +2972,8 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>       }
>       new_block->length = size;
>
> +    qemu_mutex_lock_ramlist();
> +
>       QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
>
>       ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
> @@ -2979,6 +2981,8 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>       memset(ram_list.phys_dirty + (new_block->offset>>  TARGET_PAGE_BITS),
>              0xff, size>>  TARGET_PAGE_BITS);
>
> +    qemu_mutex_unlock_ramlist();
> +
>       if (kvm_enabled())
>           kvm_setup_guest_memory(new_block->host, size);
>
> @@ -2996,7 +3000,9 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
>
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (addr == block->offset) {
> +            qemu_mutex_lock_ramlist();
>               QLIST_REMOVE(block, next);
> +            qemu_mutex_unlock_ramlist();
>               qemu_free(block);
>               return;
>           }
> @@ -3009,7 +3015,9 @@ void qemu_ram_free(ram_addr_t addr)
>
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (addr == block->offset) {
> +            qemu_mutex_lock_ramlist();
>               QLIST_REMOVE(block, next);
> +            qemu_mutex_unlock_ramlist();
>               if (block->flags&  RAM_PREALLOC_MASK) {
>                   ;
>               } else if (mem_path) {
> @@ -3117,8 +3125,10 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>           if (addr - block->offset<  block->length) {
>               /* Move this entry to to start of the list.  */
>               if (block != QLIST_FIRST(&ram_list.blocks)) {
> +                qemu_mutex_lock_ramlist();
>                   QLIST_REMOVE(block, next);
>                   QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
> +                qemu_mutex_unlock_ramlist();

Theoretically qemu_get_ram_ptr should be protected.  The problem is not 
just accessing the ramlist, it is accessing the data underneath it 
before anyone frees it.  Luckily we can set aside that problem for now, 
because qemu_ram_free_from_ptr is only used by device assignment and 
device assignment makes VMs unmigratable.

If we support generic RAM hotplug/hotunplug, we would probably need 
something RCU-like to protect accesses, since protecting all uses of 
qemu_get_ram_ptr is not practical.

>               }
>               if (xen_mapcache_enabled()) {
>                   /* We need to check if the requested address is in the RAM
> diff --git a/qemu-common.h b/qemu-common.h
> index abd7a75..b802883 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -212,6 +212,8 @@ char *qemu_strndup(const char *str, size_t size);
>
>   void qemu_mutex_lock_iothread(void);
>   void qemu_mutex_unlock_iothread(void);
> +void qemu_mutex_lock_ramlist(void);
> +void qemu_mutex_unlock_ramlist(void);
>
>   int qemu_open(const char *name, int flags, ...);
>   ssize_t qemu_write_full(int fd, const void *buf, size_t count)

Paolo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 0/4] Separate thread for VM migration
  2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
                   ` (3 preceding siblings ...)
  2011-08-11 15:32 ` [RFC PATCH v3 4/4] Separate migration bitmap Umesh Deshpande
@ 2011-08-11 16:23 ` Paolo Bonzini
  2011-08-11 18:25 ` Anthony Liguori
  5 siblings, 0 replies; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-11 16:23 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: kvm, quintela, mtosatti

On 08/11/2011 05:32 PM, Umesh Deshpande wrote:
> Following patch series deals with VCPU and iothread starvation during the
> migration of
> a guest. Currently the iothread is responsible for performing the guest
> migration. It holds qemu_mutex during the migration and doesn't allow VCPU to
> enter the qemu mode and delays its return to the guest. The guest migration,
> executed as an iohandler also delays the execution of other iohandlers.
> In the following patch series,
>
> The migration has been moved to a separate thread to
> reduce the qemu_mutex contention and iohandler starvation.
>
> Umesh Deshpande (4):
>    separate thread for VM migration
>    synchronous migrate_cancel
>    lock to protect memslots
>    separate migration bitmap
>
>   arch_init.c         |   26 ++++++++++----
>   buffered_file.c     |  100 +++++++++++++++++++++++++++++++++------------------
>   buffered_file.h     |    4 ++
>   cpu-all.h           |   39 ++++++++++++++++++++
>   cpus.c              |   12 ++++++
>   exec.c              |   74 +++++++++++++++++++++++++++++++++++++
>   hw/hw.h             |    5 ++-
>   migration.c         |   50 ++++++++++++--------------
>   migration.h         |    6 +++
>   qemu-common.h       |    2 +
>   qemu-thread-posix.c |   10 +++++
>   qemu-thread.h       |    1 +
>   savevm.c            |   31 +++++++++++-----
>   13 files changed, 280 insertions(+), 80 deletions(-)
>

Looks quite good.  Note that when submitting for inclusion, the last two 
patches should go first, so that you only add the migration thread after 
you're race free.  If there are parts that do not apply before patches 
1/2, that likely means they are in the wrong patch. :)

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 1/4] separate thread for VM migration
  2011-08-11 16:18   ` Paolo Bonzini
@ 2011-08-11 17:36     ` Umesh Deshpande
  2011-08-12  6:40       ` Paolo Bonzini
  0 siblings, 1 reply; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-11 17:36 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, quintela, mtosatti

On 08/11/2011 12:18 PM, Paolo Bonzini wrote:
>> @@ -175,20 +170,20 @@ static int buffered_close(void *opaque)
>>
>>       while (!s->has_error&&  s->buffer_size) {
>>           buffered_flush(s);
>> -        if (s->freeze_output)
>> +        if (s->freeze_output) {
>>               s->wait_for_unfreeze(s);
>> +        }
>>       }
>
> This is racy; you might end up calling buffered_put_buffer twice from 
> two different threads.
Now, migrate_fd_cleanup and buffered_close are executed only by the 
migration thread.
I am not letting the iothread call any migration-cancellation-related 
functions. Instead it just submits the request and waits for the 
migration thread to terminate itself in the next iteration.
The reason is to avoid the call to qemu_fflush,  
qemu_savevm_state_cancel (to carry out migrate_cancel) from iothread 
while migration thread is transferring data without holding the locks.

>
>> -    ret = s->close(s->opaque);
>> +    s->closed = 1;
>>
>> -    qemu_del_timer(s->timer);
>> -    qemu_free_timer(s->timer);
>> +    ret = s->close(s->opaque);
>>       qemu_free(s->buffer);
>> -    qemu_free(s);
>
> ... similarly, here the migration thread might end up using the 
> buffer.  Just set s->closed here and wait for thread completion; the 
> migration thread can handle the flushes free the buffer etc.  Let the 
> migration thread do as much as possible, it will simplify your life.
>
>>       return ret;
>>   }
>>
>> +
>>   static int buffered_rate_limit(void *opaque)
>>   {
>>       QEMUFileBuffered *s = opaque;
>> @@ -228,34 +223,55 @@ static int64_t buffered_get_rate_limit(void 
>> *opaque)
>>       return s->xfer_limit;
>>   }
>>
>> -static void buffered_rate_tick(void *opaque)
>> +static void *migrate_vm(void *opaque)
>>   {
>>       QEMUFileBuffered *s = opaque;
>> +    int64_t current_time, expire_time = qemu_get_clock_ms(rt_clock) 
>> + 100;
>> +    struct timeval tv = { .tv_sec = 0, .tv_usec = 100000};
>>
>> -    if (s->has_error) {
>> -        buffered_close(s);
>> -        return;
>> -    }
>> +    qemu_mutex_lock_iothread();
>>
>> -    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
>> +    while (!s->closed) {
>
> ... This can be in fact
>
>     while (!s->closed || s->buffered_size)
>
> and that alone will subsume the loop in buffered_close, no?
s->fd is closed in migrate_fd_cleanup (which calls buffered_close). So I 
flush the buffer in buffered_close before closing the descriptor, and 
then migration thread simply exits because s->closed is set.

- Umesh

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 0/4] Separate thread for VM migration
  2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
                   ` (4 preceding siblings ...)
  2011-08-11 16:23 ` [RFC PATCH v3 0/4] Separate thread for VM migration Paolo Bonzini
@ 2011-08-11 18:25 ` Anthony Liguori
  5 siblings, 0 replies; 19+ messages in thread
From: Anthony Liguori @ 2011-08-11 18:25 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: kvm, quintela, mtosatti, pbonzini

On 08/11/2011 10:32 AM, Umesh Deshpande wrote:
> Following patch series deals with VCPU and iothread starvation during the
> migration of
> a guest. Currently the iothread is responsible for performing the guest
> migration. It holds qemu_mutex during the migration and doesn't allow VCPU to
> enter the qemu mode and delays its return to the guest. The guest migration,
> executed as an iohandler also delays the execution of other iohandlers.
> In the following patch series,
>
> The migration has been moved to a separate thread to
> reduce the qemu_mutex contention and iohandler starvation.

This needs to go to qemu-devel.

Regards,

Anthony Liguori

>
> Umesh Deshpande (4):
>    separate thread for VM migration
>    synchronous migrate_cancel
>    lock to protect memslots
>    separate migration bitmap
>
>   arch_init.c         |   26 ++++++++++----
>   buffered_file.c     |  100 +++++++++++++++++++++++++++++++++------------------
>   buffered_file.h     |    4 ++
>   cpu-all.h           |   39 ++++++++++++++++++++
>   cpus.c              |   12 ++++++
>   exec.c              |   74 +++++++++++++++++++++++++++++++++++++
>   hw/hw.h             |    5 ++-
>   migration.c         |   50 ++++++++++++--------------
>   migration.h         |    6 +++
>   qemu-common.h       |    2 +
>   qemu-thread-posix.c |   10 +++++
>   qemu-thread.h       |    1 +
>   savevm.c            |   31 +++++++++++-----
>   13 files changed, 280 insertions(+), 80 deletions(-)
>


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 1/4] separate thread for VM migration
  2011-08-11 17:36     ` Umesh Deshpande
@ 2011-08-12  6:40       ` Paolo Bonzini
  0 siblings, 0 replies; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-12  6:40 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: kvm, quintela, mtosatti

On 08/11/2011 07:36 PM, Umesh Deshpande wrote:
>>
> Now, migrate_fd_cleanup, buffured_close is just executed by the
> migration thread.
> I am not letting iothread call any migration cancellation related
> functions. In stead it just submits the request and waits for the
> migration thread to terminate itself in the next iteration.
> The reason is to avoid the call to qemu_fflush,
> qemu_savevm_state_cancel (to carry out migrate_cancel) from iothread
> while migration thread is transferring data without holding the locks.

Got it now, thanks for explaining.

Paolo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-11 16:20   ` Paolo Bonzini
@ 2011-08-12  6:45     ` Paolo Bonzini
  2011-08-15  6:45       ` Umesh Deshpande
  2011-08-15  7:26       ` Marcelo Tosatti
  0 siblings, 2 replies; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-12  6:45 UTC (permalink / raw)
  Cc: Umesh Deshpande, kvm, quintela, mtosatti

On 08/11/2011 06:20 PM, Paolo Bonzini wrote:
>>
>> +                qemu_mutex_lock_ramlist();
>>                   QLIST_REMOVE(block, next);
>>                   QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
>> +                qemu_mutex_unlock_ramlist();
>
> Theoretically qemu_get_ram_ptr should be protected.  The problem is not
> just accessing the ramlist, it is accessing the data underneath it
> before anyone frees it.  Luckily we can set aside that problem for now,
> because qemu_ram_free_from_ptr is only used by device assignment and
> device assignment makes VMs unmigratable.

Hmm, rethinking about it, all the loops in exec.c should be protected 
from the mutex.  That's not too good because qemu_get_ram_ptr is a hot 
path for TCG.  Perhaps you can also avoid the mutex entirely, and just 
disable the above optimization for most-recently-used-block while 
migration is running.  It's not a complete solution, but it could be 
good enough until we have RAM hot-plug/hot-unplug.

Paolo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-12  6:45     ` Paolo Bonzini
@ 2011-08-15  6:45       ` Umesh Deshpande
  2011-08-15 14:10         ` Paolo Bonzini
  2011-08-15  7:26       ` Marcelo Tosatti
  1 sibling, 1 reply; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-15  6:45 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, quintela, mtosatti

On 08/12/2011 02:45 AM, Paolo Bonzini wrote:
> On 08/11/2011 06:20 PM, Paolo Bonzini wrote:
>>>
>>> +                qemu_mutex_lock_ramlist();
>>>                   QLIST_REMOVE(block, next);
>>>                   QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
>>> +                qemu_mutex_unlock_ramlist();
>>
>> Theoretically qemu_get_ram_ptr should be protected.  The problem is not
>> just accessing the ramlist, it is accessing the data underneath it
>> before anyone frees it.  Luckily we can set aside that problem for now,
>> because qemu_ram_free_from_ptr is only used by device assignment and
>> device assignment makes VMs unmigratable.
>
> Hmm, rethinking about it, all the loops in exec.c should be protected 
> from the mutex. 
Other loops in exec.c are just for reading the ram_list members, and the 
migration thread doesn't modify ram_list.
Also, protecting the loops in exec.c would make those functions 
un-callable from the functions that are already holding the ram_list 
mutex to protect themselves against memslot removal (migration thread in 
our case).
> That's not too good because qemu_get_ram_ptr is a hot path for TCG. 
Looks like qemu_get_ram_ptr isn't called from the source side code of 
guest migration.
> Perhaps you can also avoid the mutex entirely, and just disable the 
> above optimization for most-recently-used-block while migration is 
> running.  It's not a complete solution, but it could be good enough 
> until we have RAM hot-plug/hot-unplug.
>
> Paolo


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-12  6:45     ` Paolo Bonzini
  2011-08-15  6:45       ` Umesh Deshpande
@ 2011-08-15  7:26       ` Marcelo Tosatti
  2011-08-15 14:14         ` Paolo Bonzini
  1 sibling, 1 reply; 19+ messages in thread
From: Marcelo Tosatti @ 2011-08-15  7:26 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Umesh Deshpande, quintela

On Fri, Aug 12, 2011 at 08:45:50AM +0200, Paolo Bonzini wrote:
> On 08/11/2011 06:20 PM, Paolo Bonzini wrote:
> >>
> >>+                qemu_mutex_lock_ramlist();
> >>                  QLIST_REMOVE(block, next);
> >>                  QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
> >>+                qemu_mutex_unlock_ramlist();
> >
> >Theoretically qemu_get_ram_ptr should be protected.  The problem is not
> >just accessing the ramlist, it is accessing the data underneath it
> >before anyone frees it.  Luckily we can set aside that problem for now,
> >because qemu_ram_free_from_ptr is only used by device assignment and
> >device assignment makes VMs unmigratable.
> 
> Hmm, rethinking about it, all the loops in exec.c should be
> protected from the mutex.  That's not too good because
> qemu_get_ram_ptr is a hot path for TCG.

Right.

>  Perhaps you can also avoid
> the mutex entirely, and just disable the above optimization for
> most-recently-used-block while migration is running.  It's not a
> complete solution, but it could be good enough until we have RAM
> hot-plug/hot-unplug.

Actually the previous patchset does not traverse the ramlist without 
qemu_mutex locked, which is safe versus the most-recently-used-block
optimization.

So, agreed that the new ramlist lock is not necessary for now. 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-15  6:45       ` Umesh Deshpande
@ 2011-08-15 14:10         ` Paolo Bonzini
  0 siblings, 0 replies; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-15 14:10 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: kvm, quintela, mtosatti

On 08/14/2011 11:45 PM, Umesh Deshpande wrote:
>> That's not too good because qemu_get_ram_ptr is a hot path for TCG.
>
> Looks like qemu_get_ram_ptr isn't called from the source side code of
> guest migration.

Right, but since you are accessing the list from both migration and 
non-migration threads, qemu_get_ram_ptr needs to take the ram_list lock. 
  The ram_list and IO thread mutexes together protect the "next" member, 
and you need to hold them both when modifying the list.

See the reply to Marcelo for a proposed solution.

Paolo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-15  7:26       ` Marcelo Tosatti
@ 2011-08-15 14:14         ` Paolo Bonzini
  2011-08-15 20:27           ` Umesh Deshpande
  0 siblings, 1 reply; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-15 14:14 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Umesh Deshpande, quintela

[-- Attachment #1: Type: text/plain, Size: 2056 bytes --]

On 08/15/2011 12:26 AM, Marcelo Tosatti wrote:
> Actually the previous patchset does not traverse the ramlist without
> qemu_mutex locked, which is safe versus the most-recently-used-block
> optimization.

Actually it does:

     bytes_transferred_last = bytes_transferred;
     bwidth = qemu_get_clock_ns(rt_clock);
 
+    if (stage != 3) {
+        qemu_mutex_lock_ramlist();
+        qemu_mutex_unlock_iothread();
+    }
+
     while (!qemu_file_rate_limit(f)) {
         int bytes_sent;
 
         /* ram_save_block does traverse memory.  */
         bytes_sent = ram_save_block(f);
         bytes_transferred += bytes_sent;
         if (bytes_sent == 0) { /* no more blocks */
             break;
         }
     }
 
+    if (stage != 3) {
+        qemu_mutex_lock_iothread();
+        qemu_mutex_unlock_ramlist();
+    }
+
     bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
     bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
 

What Umesh is doing is using "either ramlist mutex or iothread mutex" when reading
the ramlist, and "both" when writing the ramlist; similar to rwlocks done with a
regular mutex per CPU---clever!  So this:

+                qemu_mutex_lock_ramlist();
                 QLIST_REMOVE(block, next);
                 QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
+                qemu_mutex_unlock_ramlist();

is effectively upgrading the lock from read-side to write-side, assuming that
qemu_get_ram_ptr is never called from the migration thread (which is true).

However, I propose that you put the MRU order in a separate list.  You would still
need two locks: the IO thread lock to protect the new list, a new lock to protect
the other fields in the ram_list.  For simplicity you may skip the new lock if you
assume that the migration and I/O threads never modify the list concurrently,
which is true.  And more importantly, the MRU and migration code absolutely do not
affect each other, because indeed the migration thread does not do MRU accesses.
See the attachment for an untested patch.

Paolo

[-- Attachment #2: 0001-split-MRU-ram-list.patch --]
[-- Type: text/x-patch, Size: 3713 bytes --]

>From 8579b821a2c7c4da55a4208c5df3c86e8ce2cc87 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 12 Aug 2011 13:08:04 +0200
Subject: [PATCH] split MRU ram list

Outside the execution threads the normal, non-MRU-ized order of
the RAM blocks should always be enough.  So manage two separate
lists, which will have separate locking rules.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpu-all.h |    4 +++-
 exec.c    |   16 +++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index f5c82cd..083d9e6 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -479,8 +479,9 @@ typedef struct RAMBlock {
     ram_addr_t offset;
     ram_addr_t length;
     uint32_t flags;
-    char idstr[256];
     QLIST_ENTRY(RAMBlock) next;
+    QLIST_ENTRY(RAMBlock) next_mru;
+    char idstr[256];
 #if defined(__linux__) && !defined(TARGET_S390X)
     int fd;
 #endif
@@ -489,6 +490,7 @@ typedef struct RAMBlock {
 typedef struct RAMList {
     uint8_t *phys_dirty;
     QLIST_HEAD(, RAMBlock) blocks;
+    QLIST_HEAD(, RAMBlock) blocks_mru;
 } RAMList;
 extern RAMList ram_list;
 
diff --git a/exec.c b/exec.c
index 253f42c..be0f37e 100644
--- a/exec.c
+++ b/exec.c
@@ -110,7 +110,10 @@ static uint8_t *code_gen_ptr;
 int phys_ram_fd;
 static int in_migration;
 
-RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
+RAMList ram_list = {
+    .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks),
+    .blocks_mru = QLIST_HEAD_INITIALIZER(ram_list.blocks_mru)
+};
 
 static MemoryRegion *system_memory;
 
@@ -2972,6 +2975,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
     new_block->length = size;
 
     QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
+    QLIST_INSERT_HEAD(&ram_list.blocks_mru, new_block, next_mru);
 
     ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
                                        last_ram_offset() >> TARGET_PAGE_BITS);
@@ -2996,6 +3000,7 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (addr == block->offset) {
             QLIST_REMOVE(block, next);
+            QLIST_REMOVE(block, next_mru);
             qemu_free(block);
             return;
         }
@@ -3009,6 +3014,7 @@ void qemu_ram_free(ram_addr_t addr)
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (addr == block->offset) {
             QLIST_REMOVE(block, next);
+            QLIST_REMOVE(block, next_mru);
             if (block->flags & RAM_PREALLOC_MASK) {
                 ;
             } else if (mem_path) {
@@ -3113,12 +3119,12 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
 {
     RAMBlock *block;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH(block, &ram_list.blocks_mru, next_mru) {
         if (addr - block->offset < block->length) {
             /* Move this entry to to start of the list.  */
             if (block != QLIST_FIRST(&ram_list.blocks)) {
-                QLIST_REMOVE(block, next);
-                QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
+                QLIST_REMOVE(block, next_mru);
+                QLIST_INSERT_HEAD(&ram_list.blocks_mru, block, next_mru);
             }
             if (xen_enabled()) {
                 /* We need to check if the requested address is in the RAM
@@ -3213,7 +3219,7 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
         return 0;
     }
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH(block, &ram_list.blocks_mru, next_mru) {
         /* This case append when the block is not mapped. */
         if (block->host == NULL) {
             continue;
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-15 14:14         ` Paolo Bonzini
@ 2011-08-15 20:27           ` Umesh Deshpande
  2011-08-16  6:15             ` Paolo Bonzini
  0 siblings, 1 reply; 19+ messages in thread
From: Umesh Deshpande @ 2011-08-15 20:27 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Marcelo Tosatti, kvm, quintela

On 08/15/2011 10:14 AM, Paolo Bonzini wrote:
> On 08/15/2011 12:26 AM, Marcelo Tosatti wrote:
>> Actually the previous patchset does not traverse the ramlist without
>> qemu_mutex locked, which is safe versus the most-recently-used-block
>> optimization.
> Actually it does:
>
>       bytes_transferred_last = bytes_transferred;
>       bwidth = qemu_get_clock_ns(rt_clock);
>
> +    if (stage != 3) {
> +        qemu_mutex_lock_ramlist();
> +        qemu_mutex_unlock_iothread();
> +    }
> +
>       while (!qemu_file_rate_limit(f)) {
>           int bytes_sent;
>
>           /* ram_save_block does traverse memory.  */
>           bytes_sent = ram_save_block(f);
>           bytes_transferred += bytes_sent;
>           if (bytes_sent == 0) { /* no more blocks */
>               break;
>           }
>       }
>
> +    if (stage != 3) {
> +        qemu_mutex_lock_iothread();
> +        qemu_mutex_unlock_ramlist();
> +    }
> +
>       bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
>       bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
>
>
> What Umesh is doing is using "either ramlist mutex or iothread mutex" when reading
> the ramlist, and "both" when writing the ramlist; similar to rwlocks done with a
> regular mutex per CPU---clever!  So this:
>
> +                qemu_mutex_lock_ramlist();
>                   QLIST_REMOVE(block, next);
>                   QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
> +                qemu_mutex_unlock_ramlist();
>
> is effectively upgrading the lock from read-side to write-side, assuming that
> qemu_get_ram_ptr is never called from the migration thread (which is true).
>
> However, I propose that you put the MRU order in a separate list.  You would still
> need two locks: the IO thread lock to protect the new list, a new lock to protect
> the other fields in the ram_list.  For simplicity you may skip the new lock if you
> assume that the migration and I/O threads never modify the list concurrently,
> which is true.
Yes, the mru list patch would obviate the need of holding the ram_list 
mutex in qemu_get_ram_ptr.
Also, I was planning to protect the whole migration thread with iothread 
mutex, and ram_list mutex. (i.e. holding ram_list mutex while sleeping 
between two iterations, when we release iothread mutex). This will 
prevent the memslot block removal altogether during the migration. Do 
you see any problem with this?

> And more importantly, the MRU and migration code absolutely do not
> affect each other, because indeed the migration thread does not do MRU accesses.
> See the attachment for an untested patch.
>
> Paolo
Thanks
Umesh

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-15 20:27           ` Umesh Deshpande
@ 2011-08-16  6:15             ` Paolo Bonzini
  2011-08-16  7:56               ` Paolo Bonzini
  0 siblings, 1 reply; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-16  6:15 UTC (permalink / raw)
  To: Umesh Deshpande; +Cc: Marcelo Tosatti, kvm, quintela

On 08/15/2011 01:27 PM, Umesh Deshpande wrote:
> Yes, the mru list patch would obviate the need of holding the ram_list
> mutex in qemu_get_ram_ptr.

Feel free to take it and complete it with locking then!

> Also, I was planning to protect the whole migration thread with iothread
> mutex, and ram_list mutex. (i.e. holding ram_list mutex while sleeping
> between two iterations, when we release iothread mutex). This will
> prevent the memslot block removal altogether during the migration. Do
> you see any problem with this?

No, indeed holding the ram_list mutex throughout the migration "fixes" 
the problems with ram_list block removal during migration.  I guess whoever 
will implement memory hotplug, will find a way around it. :)

Paolo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH v3 3/4] lock to protect memslots
  2011-08-16  6:15             ` Paolo Bonzini
@ 2011-08-16  7:56               ` Paolo Bonzini
  0 siblings, 0 replies; 19+ messages in thread
From: Paolo Bonzini @ 2011-08-16  7:56 UTC (permalink / raw)
  Cc: Umesh Deshpande, Marcelo Tosatti, kvm, quintela, Kevin Wolf

On 08/15/2011 11:15 PM, Paolo Bonzini wrote:
> On 08/15/2011 01:27 PM, Umesh Deshpande wrote:
>> Yes, the mru list patch would obviate the need of holding the ram_list
>> mutex in qemu_get_ram_ptr.
>
> Feel free to take it and complete it with locking then!
>
>> Also, I was planning to protect the whole migration thread with iothread
>> mutex, and ram_list mutex. (i.e. holding ram_list mutex while sleeping
>> between two iterations, when we release iothread mutex). This will
>> prevent the memslot block removal altogether during the migration. Do
>> you see any problem with this?
>
> No, indeed holding the ram_list mutex through all the migration "fixes"
> the problems with ram_lists removing during migration. I guess whoever
> will implement memory hotplug, will find a way around it. :)

BTW, now that I think of it, do you have ideas on how to make the 
migration thread do block migration?

Paolo


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2011-08-16  7:56 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-11 15:32 [RFC PATCH v3 0/4] Separate thread for VM migration Umesh Deshpande
2011-08-11 15:32 ` [RFC PATCH v3 1/4] separate " Umesh Deshpande
2011-08-11 16:18   ` Paolo Bonzini
2011-08-11 17:36     ` Umesh Deshpande
2011-08-12  6:40       ` Paolo Bonzini
2011-08-11 15:32 ` [RFC PATCH v3 2/4] Making iothread block for migrate_cancel Umesh Deshpande
2011-08-11 15:32 ` [RFC PATCH v3 3/4] lock to protect memslots Umesh Deshpande
2011-08-11 16:20   ` Paolo Bonzini
2011-08-12  6:45     ` Paolo Bonzini
2011-08-15  6:45       ` Umesh Deshpande
2011-08-15 14:10         ` Paolo Bonzini
2011-08-15  7:26       ` Marcelo Tosatti
2011-08-15 14:14         ` Paolo Bonzini
2011-08-15 20:27           ` Umesh Deshpande
2011-08-16  6:15             ` Paolo Bonzini
2011-08-16  7:56               ` Paolo Bonzini
2011-08-11 15:32 ` [RFC PATCH v3 4/4] Separate migration bitmap Umesh Deshpande
2011-08-11 16:23 ` [RFC PATCH v3 0/4] Separate thread for VM migration Paolo Bonzini
2011-08-11 18:25 ` Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox