From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:51131) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Zwt1g-0006TY-20 for qemu-devel@nongnu.org; Thu, 12 Nov 2015 09:39:04 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Zwt1a-0000hm-Sm for qemu-devel@nongnu.org; Thu, 12 Nov 2015 09:38:59 -0500 Received: from mx1.redhat.com ([209.132.183.28]:55219) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Zwt1a-0000hf-JC for qemu-devel@nongnu.org; Thu, 12 Nov 2015 09:38:54 -0500 Received: from int-mx14.intmail.prod.int.phx2.redhat.com (int-mx14.intmail.prod.int.phx2.redhat.com [10.5.11.27]) by mx1.redhat.com (Postfix) with ESMTPS id 2A2FC8E74B for ; Thu, 12 Nov 2015 14:38:54 +0000 (UTC) Date: Thu, 12 Nov 2015 16:38:51 +0200 From: "Michael S. Tsirkin" Message-ID: <20151112162743-mutt-send-email-mst@redhat.com> References: <1447331679-6270-1-git-send-email-victork@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1447331679-6270-1-git-send-email-victork@redhat.com> Subject: Re: [Qemu-devel] [PATCH v2] tests/vhost-user-bridge: implement logging of dirty pages List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Victor Kaplansky Cc: Marc-Andre Lureau , qemu-devel@nongnu.org, peterx@redhat.com On Thu, Nov 12, 2015 at 02:34:56PM +0200, Victor Kaplansky wrote: > During migration devices continue writing to the guest's memory. > The writes has to be reported to QEMU. This change implements > minimal support in vhost-user-bridge required for successful > migration of a guest with virtio-net device. > > Signed-off-by: Victor Kaplansky > --- > v2: > - use log_guest_addr for used ring reported by qemu instead of > translating. > - use mmap_size and mmap_offset defined in new > VHOST_USER_SET_LOG_BASE interface. See the patch > "vhost-user: modify SET_LOG_BASE to pass mmap size and > offset". > - start logging dirty pages only after the appropriate feature > is set by a VHOST_USER_GET_PROTOCOL_FEATURES request. > - updated TODO list. > > tests/vhost-user-bridge.c | 169 ++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 155 insertions(+), 14 deletions(-) > > diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c > index fa18ad5..8c1c997 100644 > --- a/tests/vhost-user-bridge.c > +++ b/tests/vhost-user-bridge.c > @@ -13,16 +13,22 @@ > /* > * TODO: > * - main should get parameters from the command line. > - * - implement all request handlers. > + * - implement all request handlers. Still not implemented: > + * vubr_set_protocol_features_exec() > + * vubr_get_queue_num_exec() > + * vubr_set_vring_enable_exec() > + * vubr_send_rarp_exec() > + * vubr_set_log_fd_exec() > * - test for broken requests and virtqueue. > * - implement features defined by Virtio 1.0 spec. > * - support mergeable buffers and indirect descriptors. > - * - implement RESET_DEVICE request. > * - implement clean shutdown. > * - implement non-blocking writes to UDP backend. > * - implement polling strategy. > */ > > +#define _FILE_OFFSET_BITS 64 > + > #include > #include > #include > @@ -166,6 +172,7 @@ typedef struct VubrVirtq { > struct vring_desc *desc; > struct vring_avail *avail; > struct vring_used *used; > + uint64_t log_guest_addr; > } VubrVirtq; > > /* Based on qemu/hw/virtio/vhost-user.c */ > @@ -173,6 +180,9 @@ typedef struct VubrVirtq { > #define VHOST_MEMORY_MAX_NREGIONS 8 > #define VHOST_USER_F_PROTOCOL_FEATURES 30 > > +typedef uint8_t vhost_log_chunk_t; Most code just uses uint8_t directly - I think you should just drop this typedef. > +#define VHOST_LOG_PAGE 4096 > + > enum VhostUserProtocolFeature { > VHOST_USER_PROTOCOL_F_MQ = 0, > VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, > @@ -220,6 +230,11 @@ typedef struct VhostUserMemory { > VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; > } VhostUserMemory; > > +typedef struct VhostUserLog { > + uint64_t mmap_size; > + uint64_t mmap_offset; > +} VhostUserLog; > + > typedef struct VhostUserMsg { > VhostUserRequest request; > > @@ -234,6 +249,7 @@ typedef struct VhostUserMsg { > struct vhost_vring_state state; > struct vhost_vring_addr addr; > VhostUserMemory memory; > + VhostUserLog log; > } payload; > int fds[VHOST_MEMORY_MAX_NREGIONS]; > int fd_num; > @@ -265,8 +281,13 @@ typedef struct VubrDev { > uint32_t nregions; > VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; > VubrVirtq vq[MAX_NR_VIRTQUEUE]; > + int log_call_fd; This doesn't seem to be used. Pls add code to signal this after logging. > + uint64_t log_size; > + vhost_log_chunk_t *log_table; > int backend_udp_sock; > struct sockaddr_in backend_udp_dest; > + int ready; > + uint64_t features; > } VubrDev; > > static const char *vubr_request_str[] = { > @@ -368,7 +389,12 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg) > > rc = recvmsg(conn_fd, &msg, 0); > > - if (rc <= 0) { > + if (rc == 0) { > + vubr_die("recvmsg"); > + fprintf(stderr, "Peer disconnected.\n"); > + exit(1); > + } > + if (rc < 0) { > vubr_die("recvmsg"); > } > > @@ -395,7 +421,12 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg) > > if (vmsg->size) { > rc = read(conn_fd, &vmsg->payload, vmsg->size); > - if (rc <= 0) { > + if (rc == 0) { > + vubr_die("recvmsg"); > + fprintf(stderr, "Peer disconnected.\n"); > + exit(1); > + } > + if (rc < 0) { > vubr_die("recvmsg"); > } > > @@ -465,12 +496,39 @@ vubr_virtqueue_kick(VubrVirtq *vq) > } > } > > + > +static void > +vubr_log_page(uint8_t *log_table, uint64_t page) > +{ > + DPRINT("Logged dirty guest page: %"PRId64"\n", page); > + log_table[page / 8] |= 1 << (page % 8); > +} > + This is only safe if there's a single writer. Please add a comment that says as much. Or set this atomically, that's also not hard to do. > +static void > +vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length) > +{ > + uint64_t page; > + > + if (!(dev->features & VHOST_F_LOG_ALL) || !dev->log_table || !length) { > + return; > + } > + > + assert(dev->log_size >= ((address + length) / VHOST_LOG_PAGE / 8)); I think there's an off by 1 here. Imagine size == 0, test should always fail. But imagine address == 0 and length == 1. You get >= and test passes, seems wrong. > + > + page = address / VHOST_LOG_PAGE; > + while (page * VHOST_LOG_PAGE < address + length) { > + vubr_log_page(dev->log_table, page); > + page += VHOST_LOG_PAGE; > + } > +} > + > static void > vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) > { > struct vring_desc *desc = vq->desc; > struct vring_avail *avail = vq->avail; > struct vring_used *used = vq->used; > + uint64_t log_guest_addr = vq->log_guest_addr; > > unsigned int size = vq->size; > > @@ -510,6 +568,7 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) > > if (len <= chunk_len) { > memcpy(chunk_start, buf, len); > + vubr_log_write(dev, desc[i].addr, len); > } else { > fprintf(stderr, > "Received too long packet from the backend. Dropping...\n"); > @@ -519,11 +578,17 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) > /* Add descriptor to the used ring. */ > used->ring[u_index].id = d_index; > used->ring[u_index].len = len; > + vubr_log_write(dev, > + log_guest_addr + offsetof(struct vring_used, ring[u_index]), > + sizeof(used->ring[u_index])); > > vq->last_avail_index++; > vq->last_used_index++; > > atomic_mb_set(&used->idx, vq->last_used_index); > + vubr_log_write(dev, > + log_guest_addr + offsetof(struct vring_used, idx), > + sizeof(used->idx)); > > /* Kick the guest if necessary. */ > vubr_virtqueue_kick(vq); > @@ -535,6 +600,7 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq) > struct vring_desc *desc = vq->desc; > struct vring_avail *avail = vq->avail; > struct vring_used *used = vq->used; > + uint64_t log_guest_addr = vq->log_guest_addr; > > unsigned int size = vq->size; > > @@ -552,6 +618,8 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq) > void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr); > uint32_t chunk_len = desc[i].len; > > + assert(!(desc[i].flags & VRING_DESC_F_WRITE)); > + > if (len + chunk_len < buf_size) { > memcpy(buf + len, chunk_start, chunk_len); > DPRINT("%d ", chunk_len); > @@ -577,6 +645,9 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq) > /* Add descriptor to the used ring. */ > used->ring[u_index].id = d_index; > used->ring[u_index].len = len; > + vubr_log_write(dev, > + log_guest_addr + offsetof(struct vring_used, ring[u_index]), > + sizeof(used->ring[u_index])); > > vubr_consume_raw_packet(dev, buf, len); > > @@ -588,6 +659,7 @@ vubr_process_avail(VubrDev *dev, VubrVirtq *vq) > { > struct vring_avail *avail = vq->avail; > struct vring_used *used = vq->used; > + uint64_t log_guest_addr = vq->log_guest_addr; > > while (vq->last_avail_index != atomic_mb_read(&avail->idx)) { > vubr_process_desc(dev, vq); > @@ -596,6 +668,9 @@ vubr_process_avail(VubrDev *dev, VubrVirtq *vq) > } > > atomic_mb_set(&used->idx, vq->last_used_index); > + vubr_log_write(dev, > + log_guest_addr + offsetof(struct vring_used, idx), > + sizeof(used->idx)); > } > > static void > @@ -609,6 +684,10 @@ vubr_backend_recv_cb(int sock, void *ctx) > int buflen = sizeof(buf); > int len; > > + if (!dev->ready) { > + return; > + } > + > DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); > > uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx); > @@ -656,9 +735,9 @@ vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg) > { > vmsg->payload.u64 = > ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | > - (1ULL << VIRTIO_NET_F_CTRL_VQ) | > - (1ULL << VIRTIO_NET_F_CTRL_RX) | > - (1ULL << VHOST_F_LOG_ALL)); > + (1ULL << VHOST_F_LOG_ALL) | > + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)); > + > vmsg->size = sizeof(vmsg->payload.u64); > > DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); > @@ -671,6 +750,7 @@ static int > vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg) > { > DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); > + dev->features = vmsg->payload.u64; > return 0; > } > > @@ -680,10 +760,28 @@ vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg) > return 0; > } > > +static void > +vubr_close_log(VubrDev *dev) > +{ > + if (dev->log_table) { > + if (munmap(dev->log_table, dev->log_size) != 0) { > + vubr_die("munmap()"); > + } > + > + dev->log_table = 0; > + } > + if (dev->log_call_fd != -1) { > + close(dev->log_call_fd); > + dev->log_call_fd = -1; > + } > +} > + > static int > vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg) > { > - DPRINT("Function %s() not implemented yet.\n", __func__); > + vubr_close_log(dev); > + dev->ready = 0; > + dev->features = 0; > return 0; > } > > @@ -736,8 +834,30 @@ vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) > static int > vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg) > { > - DPRINT("Function %s() not implemented yet.\n", __func__); > - return 0; > + int fd; > + uint64_t log_mmap_size, log_mmap_offset; > + void *rc; > + > + assert(vmsg->fd_num == 1); > + fd = vmsg->fds[0]; > + > + assert(vmsg->size == sizeof(vmsg->payload.log)); > + log_mmap_offset = vmsg->payload.log.mmap_offset; > + log_mmap_size = vmsg->payload.log.mmap_size; > + DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); > + DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); > + > + rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, > + log_mmap_offset); > + if (rc == MAP_FAILED) { > + vubr_die("mmap"); > + } > + dev->log_table = (vhost_log_chunk_t *) rc; Cast is not needed here. > + dev->log_size = log_mmap_size; > + > + vmsg->size = sizeof(vmsg->payload.u64); > + /* Reply */ > + return 1; > } > > static int > @@ -777,6 +897,7 @@ vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg) > vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr); > vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr); > vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr); > + vq->log_guest_addr = vra->log_guest_addr; > > DPRINT("Setting virtq addresses:\n"); > DPRINT(" vring_desc at %p\n", vq->desc); > @@ -803,8 +924,14 @@ vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) > static int > vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) > { > - DPRINT("Function %s() not implemented yet.\n", __func__); > - return 0; > + unsigned int index = vmsg->payload.state.index; > + > + DPRINT("State.index: %d\n", index); > + vmsg->payload.state.num = dev->vq[index].last_avail_index; > + vmsg->size = sizeof(vmsg->payload.state); > + > + /* reply */ But Reply above. Pls keep it consistent. > + return 1; > } > > static int > @@ -829,6 +956,10 @@ vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) > DPRINT("Waiting for kicks on fd: %d for vq: %d\n", > dev->vq[index].kick_fd, index); > } > + if (dev->vq[0].kick_fd != -1 && > + dev->vq[1].kick_fd != -1) { > + dev->ready = 1; I'm not sure what this does. Related to logging somehow? Anyway, processing a VQ should happen after either - you received a kick or - you received VRING_ENABLE > + } > return 0; > } > > @@ -858,9 +989,12 @@ vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg) > static int > vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) > { > - /* FIXME: unimplented */ > + vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; > DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); > - return 0; > + vmsg->size = sizeof(vmsg->payload.u64); > + > + /* Reply */ > + return 1; > } > > static int > @@ -1012,6 +1146,13 @@ vubr_new(const char *path) > }; > } > > + /* Init log */ > + dev->log_call_fd = -1; > + dev->log_size = 0; > + dev->log_table = 0; Pls just put a single space there. Don't align things, it's hard to keep consistency when one does. > + dev->ready = 0; > + dev->features = 0; > + > /* Get a UNIX socket. */ > dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); > if (dev->sock == -1) { > -- > --Victor