* [Qemu-devel] [PATCH v3 1/5] migration/rdma: Fix race on source
2017-07-14 11:57 [Qemu-devel] [PATCH v3 0/5] A bunch of RDMA fixes Dr. David Alan Gilbert (git)
@ 2017-07-14 11:57 ` Dr. David Alan Gilbert (git)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 2/5] migration: Close file on failed migration load Dr. David Alan Gilbert (git)
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Dr. David Alan Gilbert (git) @ 2017-07-14 11:57 UTC (permalink / raw)
To: qemu-devel; +Cc: quintela, lvivier, peterx
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Fix a race where the destination might try and send the source a
WRID_READY before the source has done a post-recv for it.
rdma_post_recv has to happen after the qp exists, and we're
OK since we've already called qemu_rdma_source_init that calls
qemu_alloc_qp.
This corresponds to:
https://bugzilla.redhat.com/show_bug.cgi?id=1285044
The race can be triggered by adding a few ms wait before this
post_recv_control (which was originally due to me turning on loads of
debug).
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
migration/rdma.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index c6bc607a03..6111e10c70 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2365,6 +2365,12 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
caps_to_network(&cap);
+ ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
+ if (ret) {
+ ERROR(errp, "posting second control recv");
+ goto err_rdma_source_connect;
+ }
+
ret = rdma_connect(rdma->cm_id, &conn_param);
if (ret) {
perror("rdma_connect");
@@ -2405,12 +2411,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
rdma_ack_cm_event(cm_event);
- ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
- if (ret) {
- ERROR(errp, "posting second control recv!");
- goto err_rdma_source_connect;
- }
-
rdma->control_ready_expected = 1;
rdma->nb_sent = 0;
return 0;
--
2.13.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [Qemu-devel] [PATCH v3 3/5] migration/rdma: Allow cancelling while waiting for wrid
2017-07-14 11:57 [Qemu-devel] [PATCH v3 0/5] A bunch of RDMA fixes Dr. David Alan Gilbert (git)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 1/5] migration/rdma: Fix race on source Dr. David Alan Gilbert (git)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 2/5] migration: Close file on failed migration load Dr. David Alan Gilbert (git)
@ 2017-07-14 11:57 ` Dr. David Alan Gilbert (git)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 4/5] migration/rdma: Safely convert control types Dr. David Alan Gilbert (git)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 5/5] migration/rdma: Send error during cancelling Dr. David Alan Gilbert (git)
4 siblings, 0 replies; 7+ messages in thread
From: Dr. David Alan Gilbert (git) @ 2017-07-14 11:57 UTC (permalink / raw)
To: qemu-devel; +Cc: quintela, lvivier, peterx
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
When waiting for a WRID, if the other side dies we end up waiting
for ever with no way to cancel the migration.
Cure this by poll()ing the fd first with a timeout and checking
error flags and migration state.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
migration/rdma.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 53 insertions(+), 6 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index 6111e10c70..53076646ef 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1466,6 +1466,57 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
return 0;
}
+/* Wait for activity on the completion channel.
+ * Returns 0 on success, none-0 on error.
+ */
+static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
+{
+ /*
+ * Coroutine doesn't start until migration_fd_process_incoming()
+ * so don't yield unless we know we're running inside of a coroutine.
+ */
+ if (rdma->migration_started_on_destination) {
+ yield_until_fd_readable(rdma->comp_channel->fd);
+ } else {
+ /* This is the source side, we're in a separate thread
+ * or destination prior to migration_fd_process_incoming()
+ * we can't yield; so we have to poll the fd.
+ * But we need to be able to handle 'cancel' or an error
+ * without hanging forever.
+ */
+ while (!rdma->error_state && !rdma->received_error) {
+ GPollFD pfds[1];
+ pfds[0].fd = rdma->comp_channel->fd;
+ pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
+ /* 0.1s timeout, should be fine for a 'cancel' */
+ switch (qemu_poll_ns(pfds, 1, 100 * 1000 * 1000)) {
+ case 1: /* fd active */
+ return 0;
+
+ case 0: /* Timeout, go around again */
+ break;
+
+ default: /* Error of some type -
+ * I don't trust errno from qemu_poll_ns
+ */
+ error_report("%s: poll failed", __func__);
+ rdma->error_state = -EPIPE;
+ return -1;
+ }
+
+ if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
+ /* Bail out and let the cancellation happen */
+ return -EPIPE;
+ }
+ }
+ }
+
+ if (rdma->received_error) {
+ return -EPIPE;
+ }
+ return rdma->error_state;
+}
+
/*
* Block until the next work request has completed.
*
@@ -1513,12 +1564,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
}
while (1) {
- /*
- * Coroutine doesn't start until migration_fd_process_incoming()
- * so don't yield unless we know we're running inside of a coroutine.
- */
- if (rdma->migration_started_on_destination) {
- yield_until_fd_readable(rdma->comp_channel->fd);
+ if (qemu_rdma_wait_comp_channel(rdma)) {
+ goto err_block_for_wrid;
}
if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
--
2.13.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [Qemu-devel] [PATCH v3 4/5] migration/rdma: Safely convert control types
2017-07-14 11:57 [Qemu-devel] [PATCH v3 0/5] A bunch of RDMA fixes Dr. David Alan Gilbert (git)
` (2 preceding siblings ...)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 3/5] migration/rdma: Allow cancelling while waiting for wrid Dr. David Alan Gilbert (git)
@ 2017-07-14 11:57 ` Dr. David Alan Gilbert (git)
2017-07-14 11:57 ` [Qemu-devel] [PATCH v3 5/5] migration/rdma: Send error during cancelling Dr. David Alan Gilbert (git)
4 siblings, 0 replies; 7+ messages in thread
From: Dr. David Alan Gilbert (git) @ 2017-07-14 11:57 UTC (permalink / raw)
To: qemu-devel; +Cc: quintela, lvivier, peterx
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
control_desc[] is an array of strings that correspond to a
series of message types; they're used only for error messages, but if
the message type is seriously broken then we could go off the end of
the array.
Convert the array to a function control_desc() that bound checks.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
migration/rdma.c | 54 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index 53076646ef..13d6dd7709 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -165,20 +165,6 @@ enum {
RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};
-static const char *control_desc[] = {
- [RDMA_CONTROL_NONE] = "NONE",
- [RDMA_CONTROL_ERROR] = "ERROR",
- [RDMA_CONTROL_READY] = "READY",
- [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
- [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
- [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
- [RDMA_CONTROL_COMPRESS] = "COMPRESS",
- [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
- [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
- [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
- [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
- [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
-};
/*
* Memory and MR structures used to represent an IB Send/Recv work request.
@@ -251,6 +237,30 @@ typedef struct QEMU_PACKED RDMADestBlock {
uint32_t padding;
} RDMADestBlock;
+static const char *control_desc(unsigned int rdma_control)
+{
+ static const char *strs[] = {
+ [RDMA_CONTROL_NONE] = "NONE",
+ [RDMA_CONTROL_ERROR] = "ERROR",
+ [RDMA_CONTROL_READY] = "READY",
+ [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
+ [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
+ [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
+ [RDMA_CONTROL_COMPRESS] = "COMPRESS",
+ [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
+ [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
+ [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
+ [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
+ [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
+ };
+
+ if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
+ return "??BAD CONTROL VALUE??";
+ }
+
+ return strs[rdma_control];
+}
+
static uint64_t htonll(uint64_t v)
{
union { uint32_t lv[2]; uint64_t llv; } u;
@@ -1637,7 +1647,7 @@ static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
.num_sge = 1,
};
- trace_qemu_rdma_post_send_control(control_desc[head->type]);
+ trace_qemu_rdma_post_send_control(control_desc(head->type));
/*
* We don't actually need to do a memcpy() in here if we used
@@ -1716,16 +1726,16 @@ static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
network_to_control((void *) rdma->wr_data[idx].control);
memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
- trace_qemu_rdma_exchange_get_response_start(control_desc[expecting]);
+ trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
if (expecting == RDMA_CONTROL_NONE) {
- trace_qemu_rdma_exchange_get_response_none(control_desc[head->type],
+ trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
head->type);
} else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
error_report("Was expecting a %s (%d) control message"
", but got: %s (%d), length: %d",
- control_desc[expecting], expecting,
- control_desc[head->type], head->type, head->len);
+ control_desc(expecting), expecting,
+ control_desc(head->type), head->type, head->len);
if (head->type == RDMA_CONTROL_ERROR) {
rdma->received_error = true;
}
@@ -1835,7 +1845,7 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
}
}
- trace_qemu_rdma_exchange_send_waiting(control_desc[resp->type]);
+ trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
ret = qemu_rdma_exchange_get_response(rdma, resp,
resp->type, RDMA_WRID_DATA);
@@ -1847,7 +1857,7 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
if (resp_idx) {
*resp_idx = RDMA_WRID_DATA;
}
- trace_qemu_rdma_exchange_send_received(control_desc[resp->type]);
+ trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
}
rdma->control_ready_expected = 1;
@@ -3397,7 +3407,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
ret = -EIO;
goto out;
default:
- error_report("Unknown control message %s", control_desc[head.type]);
+ error_report("Unknown control message %s", control_desc(head.type));
ret = -EIO;
goto out;
}
--
2.13.0
^ permalink raw reply related [flat|nested] 7+ messages in thread