From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from eggs.gnu.org ([208.118.235.92]:59612)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <mst@redhat.com>) id 1Te5z0-0004gb-DO
	for qemu-devel@nongnu.org; Thu, 29 Nov 2012 10:25:00 -0500
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <mst@redhat.com>) id 1Te5yt-0006JX-DI
	for qemu-devel@nongnu.org; Thu, 29 Nov 2012 10:24:58 -0500
Received: from mx1.redhat.com ([209.132.183.28]:62706)
	by eggs.gnu.org with esmtp (Exim 4.71)
	(envelope-from <mst@redhat.com>) id 1Te5yt-0006JS-2a
	for qemu-devel@nongnu.org; Thu, 29 Nov 2012 10:24:51 -0500
Date: Thu, 29 Nov 2012 17:27:39 +0200
From: "Michael S. Tsirkin" <mst@redhat.com>
Message-ID: <20121129152739.GE10896@redhat.com>
References: <20121129140237.GE9625@redhat.com>
	<897279607.16250581.1354202491938.JavaMail.root@redhat.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <897279607.16250581.1354202491938.JavaMail.root@redhat.com>
Subject: Re: [Qemu-devel] [PATCH v4 10/11] dataplane: add virtio-blk data
	plane code
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>, Anthony Liguori <aliguori@us.ibm.com>, qemu-devel@nongnu.org, Blue Swirl <blauwirbel@gmail.com>, khoa@us.ibm.com, Stefan Hajnoczi <stefanha@redhat.com>, Asias He <asias@redhat.com>

On Thu, Nov 29, 2012 at 10:21:31AM -0500, Paolo Bonzini wrote:
> 
> > > +    unsigned int num_reqs;
> > > +    QemuMutex num_reqs_lock;
> > 
> > OK the only reason this lock is needed is because
> > you want to drain outside the thread.
> > Won't it be better to queue process the drain request through
> > the thread?
> > You won't need any locks then.
> 
> Draining is processed in the thread.

Confused. You say same thread is always taking this lock?
This is what I suggest.

>  This lock is only needed
> to use it together with no_reqs_cond, because userspace threads
> do not have something like wait_event.
> 
> Direct usage of futexes would let you remove the lock, but it's
> not portable.
> 
> Paolo

It's easier: simply signal condition which is local on stack.

> > > +    QemuCond no_reqs_cond;
> > > +};
> > > +
> > > +/* Raise an interrupt to signal guest, if necessary */
> > > +static void notify_guest(VirtIOBlockDataPlane *s)
> > > +{
> > > +    if (!vring_should_notify(s->vdev, &s->vring)) {
> > > +        return;
> > > +    }
> > > +
> > > +    event_notifier_set(s->guest_notifier);
> > > +}
> > > +
> > > +static void complete_request(struct iocb *iocb, ssize_t ret, void
> > > *opaque)
> > > +{
> > > +    VirtIOBlockDataPlane *s = opaque;
> > > +    VirtIOBlockRequest *req = container_of(iocb,
> > > VirtIOBlockRequest, iocb);
> > > +    struct virtio_blk_inhdr hdr;
> > > +    int len;
> > > +
> > > +    if (likely(ret >= 0)) {
> > > +        hdr.status = VIRTIO_BLK_S_OK;
> > > +        len = ret;
> > > +    } else {
> > > +        hdr.status = VIRTIO_BLK_S_IOERR;
> > > +        len = 0;
> > > +    }
> > > +
> > > +    trace_virtio_blk_data_plane_complete_request(s, req->head,
> > > ret);
> > > +
> > > +    qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
> > > +    qemu_iovec_destroy(req->inhdr);
> > > +    g_slice_free(QEMUIOVector, req->inhdr);
> > > +
> > > +    /* According to the virtio specification len should be the
> > > number of bytes
> > > +     * written to, but for virtio-blk it seems to be the number of
> > > bytes
> > > +     * transferred plus the status bytes.
> > > +     */
> > > +    vring_push(&s->vring, req->head, len + sizeof(hdr));
> > > +
> > > +    qemu_mutex_lock(&s->num_reqs_lock);
> > > +    if (--s->num_reqs == 0) {
> > > +        qemu_cond_broadcast(&s->no_reqs_cond);
> > > +    }
> > > +    qemu_mutex_unlock(&s->num_reqs_lock);
> > > +}
> > > +
> > > +static void fail_request_early(VirtIOBlockDataPlane *s, unsigned
> > > int head,
> > > +                               QEMUIOVector *inhdr, unsigned char
> > > status)
> > > +{
> > > +    struct virtio_blk_inhdr hdr = {
> > > +        .status = status,
> > > +    };
> > > +
> > > +    qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
> > > +    qemu_iovec_destroy(inhdr);
> > > +    g_slice_free(QEMUIOVector, inhdr);
> > > +
> > > +    vring_push(&s->vring, head, sizeof(hdr));
> > > +    notify_guest(s);
> > > +}
> > > +
> > > +static int process_request(IOQueue *ioq, struct iovec iov[],
> > > +                           unsigned int out_num, unsigned int
> > > in_num,
> > > +                           unsigned int head)
> > > +{
> > > +    VirtIOBlockDataPlane *s = container_of(ioq,
> > > VirtIOBlockDataPlane, ioqueue);
> > > +    struct iovec *in_iov = &iov[out_num];
> > > +    struct virtio_blk_outhdr outhdr;
> > > +    QEMUIOVector *inhdr;
> > > +    size_t in_size;
> > > +
> > > +    /* Copy in outhdr */
> > > +    if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
> > > +                            sizeof(outhdr)) != sizeof(outhdr))) {
> > > +        error_report("virtio-blk request outhdr too short");
> > > +        return -EFAULT;
> > > +    }
> > > +    iov_discard(&iov, &out_num, sizeof(outhdr));
> > > +
> > > +    /* Grab inhdr for later */
> > > +    in_size = iov_size(in_iov, in_num);
> > > +    if (in_size < sizeof(struct virtio_blk_inhdr)) {
> > > +        error_report("virtio_blk request inhdr too short");
> > > +        return -EFAULT;
> > > +    }
> > > +    inhdr = g_slice_new(QEMUIOVector);
> > > +    qemu_iovec_init(inhdr, 1);
> > > +    qemu_iovec_concat_iov(inhdr, in_iov, in_num,
> > > +            in_size - sizeof(struct virtio_blk_inhdr),
> > > +            sizeof(struct virtio_blk_inhdr));
> > > +    iov_discard(&in_iov, &in_num, -sizeof(struct
> > > virtio_blk_inhdr));
> > > +
> > > +    /* TODO Linux sets the barrier bit even when not advertised!
> > > */
> > > +    outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
> > > +
> > > +    struct iocb *iocb;
> > > +    switch (outhdr.type & (VIRTIO_BLK_T_OUT |
> > > VIRTIO_BLK_T_SCSI_CMD |
> > > +                           VIRTIO_BLK_T_FLUSH)) {
> > > +    case VIRTIO_BLK_T_IN:
> > > +        iocb = ioq_rdwr(ioq, true, in_iov, in_num, outhdr.sector *
> > > 512);
> > > +        break;
> > > +
> > > +    case VIRTIO_BLK_T_OUT:
> > > +        iocb = ioq_rdwr(ioq, false, iov, out_num, outhdr.sector *
> > > 512);
> > > +        break;
> > > +
> > > +    case VIRTIO_BLK_T_SCSI_CMD:
> > > +        /* TODO support SCSI commands */
> > > +        fail_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
> > > +        return 0;
> > > +
> > > +    case VIRTIO_BLK_T_FLUSH:
> > > +        /* TODO fdsync not supported by Linux AIO, do it
> > > synchronously here! */
> > > +        fdatasync(s->fd);
> > > +        fail_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
> > > +        return 0;
> > > +
> > > +    default:
> > > +        error_report("virtio-blk unsupported request type %#x",
> > > outhdr.type);
> > > +        qemu_iovec_destroy(inhdr);
> > > +        g_slice_free(QEMUIOVector, inhdr);
> > > +        return -EFAULT;
> > > +    }
> > > +
> > > +    /* Fill in virtio block metadata needed for completion */
> > > +    VirtIOBlockRequest *req = container_of(iocb,
> > > VirtIOBlockRequest, iocb);
> > > +    req->head = head;
> > > +    req->inhdr = inhdr;
> > > +    return 0;
> > > +}
> > > +
> > > +static bool handle_notify(EventHandler *handler)
> > > +{
> > > +    VirtIOBlockDataPlane *s = container_of(handler,
> > > VirtIOBlockDataPlane,
> > > +                                           notify_handler);
> > > +
> > > +    /* There is one array of iovecs into which all new requests
> > > are extracted
> > > +     * from the vring.  Requests are read from the vring and the
> > > translated
> > > +     * descriptors are written to the iovecs array.  The iovecs do
> > > not have to
> > > +     * persist across handle_notify() calls because the kernel
> > > copies the
> > > +     * iovecs on io_submit().
> > > +     *
> > > +     * Handling io_submit() EAGAIN may require storing the
> > > requests across
> > > +     * handle_notify() calls until the kernel has sufficient
> > > resources to
> > > +     * accept more I/O.  This is not implemented yet.
> > > +     */
> > > +    struct iovec iovec[VRING_MAX];
> > > +    struct iovec *end = &iovec[VRING_MAX];
> > > +    struct iovec *iov = iovec;
> > > +
> > > +    /* When a request is read from the vring, the index of the
> > > first descriptor
> > > +     * (aka head) is returned so that the completed request can be
> > > pushed onto
> > > +     * the vring later.
> > > +     *
> > > +     * The number of hypervisor read-only iovecs is out_num.  The
> > > number of
> > > +     * hypervisor write-only iovecs is in_num.
> > > +     */
> > > +    int head;
> > > +    unsigned int out_num = 0, in_num = 0;
> > > +    unsigned int num_queued;
> > > +
> > > +    for (;;) {
> > > +        /* Disable guest->host notifies to avoid unnecessary
> > > vmexits */
> > > +        vring_set_notification(s->vdev, &s->vring, false);
> > > +
> > > +        for (;;) {
> > > +            head = vring_pop(s->vdev, &s->vring, iov, end,
> > > &out_num, &in_num);
> > > +            if (head < 0) {
> > > +                break; /* no more requests */
> > > +            }
> > > +
> > > +            trace_virtio_blk_data_plane_process_request(s,
> > > out_num, in_num,
> > > +                                                        head);
> > > +
> > > +            if (process_request(&s->ioqueue, iov, out_num, in_num,
> > > head) < 0) {
> > > +                vring_set_broken(&s->vring);
> > > +                break;
> > > +            }
> > > +            iov += out_num + in_num;
> > > +        }
> > > +
> > > +        if (likely(head == -EAGAIN)) { /* vring emptied */
> > > +            /* Re-enable guest->host notifies and stop processing
> > > the vring.
> > > +             * But if the guest has snuck in more descriptors,
> > > keep processing.
> > > +             */
> > > +            vring_set_notification(s->vdev, &s->vring, true);
> > > +            smp_mb();
> > > +            if (!vring_more_avail(&s->vring)) {
> > > +                break;
> > > +            }
> > > +        } else { /* head == -ENOBUFS or fatal error, iovecs[] is
> > > depleted */
> > > +            /* Since there are no iovecs[] left, stop processing
> > > for now.  Do
> > > +             * not re-enable guest->host notifies since the I/O
> > > completion
> > > +             * handler knows to check for more vring descriptors
> > > anyway.
> > > +             */
> > > +            break;
> > > +        }
> > > +    }
> > > +
> > > +    num_queued = ioq_num_queued(&s->ioqueue);
> > > +    if (num_queued > 0) {
> > > +        qemu_mutex_lock(&s->num_reqs_lock);
> > > +        s->num_reqs += num_queued;
> > > +        qemu_mutex_unlock(&s->num_reqs_lock);
> > > +
> > > +        int rc = ioq_submit(&s->ioqueue);
> > > +        if (unlikely(rc < 0)) {
> > > +            fprintf(stderr, "ioq_submit failed %d\n", rc);
> > > +            exit(1);
> > > +        }
> > > +    }
> > > +    return true;
> > > +}
> > > +
> > > +static bool handle_io(EventHandler *handler)
> > > +{
> > > +    VirtIOBlockDataPlane *s = container_of(handler,
> > > VirtIOBlockDataPlane,
> > > +                                           io_handler);
> > > +
> > > +    if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0)
> > > {
> > > +        notify_guest(s);
> > > +    }
> > > +
> > > +    /* If there were more requests than iovecs, the vring will not
> > > be empty yet
> > > +     * so check again.  There should now be enough resources to
> > > process more
> > > +     * requests.
> > > +     */
> > > +    if (unlikely(vring_more_avail(&s->vring))) {
> > > +        return handle_notify(&s->notify_handler);
> > > +    }
> > > +
> > > +    return true;
> > > +}
> > > +
> > > +static void *data_plane_thread(void *opaque)
> > > +{
> > > +    VirtIOBlockDataPlane *s = opaque;
> > > +    event_poll_run(&s->event_poll);
> > > +    return NULL;
> > > +}
> > > +
> > > +static void start_data_plane_bh(void *opaque)
> > > +{
> > > +    VirtIOBlockDataPlane *s = opaque;
> > > +
> > > +    qemu_bh_delete(s->start_bh);
> > > +    s->start_bh = NULL;
> > > +    qemu_thread_create(&s->thread, data_plane_thread,
> > > +                       s, QEMU_THREAD_JOINABLE);
> > > +}
> > > +
> > > +VirtIOBlockDataPlane *virtio_blk_data_plane_create(VirtIODevice
> > > *vdev, int fd)
> > > +{
> > > +    VirtIOBlockDataPlane *s;
> > > +
> > > +    s = g_new0(VirtIOBlockDataPlane, 1);
> > > +    s->vdev = vdev;
> > > +    s->fd = fd;
> > > +    return s;
> > > +}
> > > +
> > > +void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
> > > +{
> > > +    if (!s) {
> > > +        return;
> > > +    }
> > > +    virtio_blk_data_plane_stop(s);
> > > +    g_free(s);
> > > +}
> > > +
> > > +/* Block until pending requests have completed
> > > + *
> > > + * The vring continues to be serviced so ensure no new requests
> > > will be added
> > > + * to avoid races.
> > 
> > This comment confuses me. "avoid races" is a kind of vague
> > comment that does not really help.
> > 
> > This function does not seem to ensure
> > no new requests - it simply waits until num requests
> > gets to 0. But requests could get added right afterwards
> > and it won't help.
> > 
> > Could be comment be made more clear please?
> > 
> > > + */
> > > +void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s)
> > > +{
> > > +    qemu_mutex_lock(&s->num_reqs_lock);
> > > +    while (s->num_reqs > 0) {
> > > +        qemu_cond_wait(&s->no_reqs_cond, &s->num_reqs_lock);
> > > +    }
> > > +    qemu_mutex_unlock(&s->num_reqs_lock);
> > > +}
> > > +
> > > +void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
> > > +{
> > > +    VirtQueue *vq;
> > > +    int i;
> > > +
> > > +    if (s->started) {
> > > +        return;
> > > +    }
> > > +
> > > +    vq = virtio_get_queue(s->vdev, 0);
> > > +    if (!vring_setup(&s->vring, s->vdev, 0)) {
> > > +        return;
> > > +    }
> > > +
> > > +    event_poll_init(&s->event_poll);
> > > +
> > > +    /* Set up guest notifier (irq) */
> > > +    if
> > > (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
> > > +                                              true) != 0) {
> > > +        fprintf(stderr, "virtio-blk failed to set guest notifier,
> > > "
> > > +                "ensure -enable-kvm is set\n");
> > > +        exit(1);
> > > +    }
> > > +    s->guest_notifier = virtio_queue_get_guest_notifier(vq);
> > > +
> > > +    /* Set up virtqueue notify */
> > > +    if
> > > (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
> > > +                                            0, true) != 0) {
> > > +        fprintf(stderr, "virtio-blk failed to set host
> > > notifier\n");
> > > +        exit(1);
> > > +    }
> > > +    event_poll_add(&s->event_poll, &s->notify_handler,
> > > +                   virtio_queue_get_host_notifier(vq),
> > > +                   handle_notify);
> > > +
> > > +    /* Set up ioqueue */
> > > +    ioq_init(&s->ioqueue, s->fd, REQ_MAX);
> > > +    for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
> > > +        ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
> > > +    }
> > > +    event_poll_add(&s->event_poll, &s->io_handler,
> > > +                   ioq_get_notifier(&s->ioqueue), handle_io);
> > > +
> > > +    s->started = true;
> > > +    trace_virtio_blk_data_plane_start(s);
> > > +
> > > +    /* Kick right away to begin processing requests already in
> > > vring */
> > > +    event_notifier_set(virtio_queue_get_host_notifier(vq));
> > > +
> > > +    /* Spawn thread in BH so it inherits iothread cpusets */
> > > +    s->start_bh = qemu_bh_new(start_data_plane_bh, s);
> > > +    qemu_bh_schedule(s->start_bh);
> > > +}
> > > +
> > > +void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
> > > +{
> > > +    if (!s->started) {
> > > +        return;
> > > +    }
> > > +    s->started = false;
> > > +    trace_virtio_blk_data_plane_stop(s);
> > > +
> > > +    /* Stop thread or cancel pending thread creation BH */
> > > +    if (s->start_bh) {
> > > +        qemu_bh_delete(s->start_bh);
> > > +        s->start_bh = NULL;
> > > +    } else {
> > > +        virtio_blk_data_plane_drain(s);
> > > +        event_poll_stop(&s->event_poll);
> > > +        qemu_thread_join(&s->thread);
> > > +    }
> > > +
> > > +    ioq_cleanup(&s->ioqueue);
> > > +
> > > +    s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
> > > 0, false);
> > > +
> > > +    event_poll_cleanup(&s->event_poll);
> > > +
> > > +    /* Clean up guest notifier (irq) */
> > > +    s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
> > > false);
> > > +
> > > +    vring_teardown(&s->vring);
> > > +}
> > > diff --git a/hw/dataplane/virtio-blk.h b/hw/dataplane/virtio-blk.h
> > > new file mode 100644
> > > index 0000000..ddf1115
> > > --- /dev/null
> > > +++ b/hw/dataplane/virtio-blk.h
> > > @@ -0,0 +1,41 @@
> > > +/*
> > > + * Dedicated thread for virtio-blk I/O processing
> > > + *
> > > + * Copyright 2012 IBM, Corp.
> > > + * Copyright 2012 Red Hat, Inc. and/or its affiliates
> > > + *
> > > + * Authors:
> > > + *   Stefan Hajnoczi <stefanha@redhat.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2
> > > or later.
> > > + * See the COPYING file in the top-level directory.
> > > + *
> > > + */
> > > +
> > > +#ifndef HW_DATAPLANE_VIRTIO_BLK_H
> > > +#define HW_DATAPLANE_VIRTIO_BLK_H
> > > +
> > > +#include "hw/virtio.h"
> > > +
> > > +typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
> > > +
> > > +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
> > > +VirtIOBlockDataPlane *virtio_blk_data_plane_create(VirtIODevice
> > > *vdev, int fd);
> > > +void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
> > > +void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s);
> > > +void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s);
> > > +void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s);
> > > +#else
> > > +static inline VirtIOBlockDataPlane *virtio_blk_data_plane_create(
> > > +        VirtIODevice *vdev, int fd)
> > > +{
> > > +    return NULL;
> > > +}
> > > +
> > > +static inline void
> > > virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) {}
> > > +static inline void
> > > virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) {}
> > > +static inline void virtio_blk_data_plane_stop(VirtIOBlockDataPlane
> > > *s) {}
> > > +static inline void
> > > virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s) {}
> > > +#endif
> > > +
> > > +#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
> > > diff --git a/trace-events b/trace-events
> > > index a9a791b..1edc2ae 100644
> > > --- a/trace-events
> > > +++ b/trace-events
> > > @@ -98,6 +98,12 @@ virtio_blk_rw_complete(void *req, int ret) "req
> > > %p ret %d"
> > >  virtio_blk_handle_write(void *req, uint64_t sector, size_t
> > >  nsectors) "req %p sector %"PRIu64" nsectors %zu"
> > >  virtio_blk_handle_read(void *req, uint64_t sector, size_t
> > >  nsectors) "req %p sector %"PRIu64" nsectors %zu"
> > >  
> > > +# hw/dataplane/virtio-blk.c
> > > +virtio_blk_data_plane_start(void *s) "dataplane %p"
> > > +virtio_blk_data_plane_stop(void *s) "dataplane %p"
> > > +virtio_blk_data_plane_process_request(void *s, unsigned int
> > > out_num, unsigned int in_num, unsigned int head) "dataplane %p
> > > out_num %u in_num %u head %u"
> > > +virtio_blk_data_plane_complete_request(void *s, unsigned int head,
> > > int ret) "dataplane %p head %u ret %d"
> > > +
> > >  # hw/dataplane/vring.c
> > >  vring_setup(uint64_t physical, void *desc, void *avail, void
> > >  *used) "vring physical %#"PRIx64" desc %p avail %p used %p"
> > >  
> > > --
> > > 1.8.0
> >