From: "Michael S. Tsirkin" <mst@redhat.com>
To: Vitaly Mayatskikh <v.mayatskih@gmail.com>
Cc: Jason Wang <jasowang@redhat.com>,
Paolo Bonzini <pbonzini@redhat.com>,
kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/1] Add vhost_blk driver
Date: Fri, 2 Nov 2018 14:36:46 -0400 [thread overview]
Message-ID: <20181102142645-mutt-send-email-mst@kernel.org> (raw)
In-Reply-To: <20181102182123.29420-2-v.mayatskih@gmail.com>
On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh wrote:
> This driver accelerates host side of virtio-blk.
>
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
> ---
> drivers/vhost/Kconfig | 13 ++
> drivers/vhost/Makefile | 3 +
> drivers/vhost/blk.c | 510 +++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 526 insertions(+)
> create mode 100644 drivers/vhost/blk.c
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index b580885243f7..c4980d6af0ea 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
> adds some overhead, it is disabled by default.
>
> If unsure, say "N".
> +
> +config VHOST_BLK
> + tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> + depends on BLOCK && EVENTFD
> + select VHOST
> + default n
> + help
> + This kernel module can be loaded in host kernel to accelerate
> + guest block with virtio_blk. Not to be confused with virtio_blk
> + module itself which needs to be loaded in guest kernel.
> +
> + To compile this driver as a module, choose M here: the module will
> + be called vhost_blk.
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..c8be36cd9214 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
> obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
> vhost_vsock-y := vsock.o
>
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> +
> obj-$(CONFIG_VHOST_RING) += vringh.o
>
> obj-$(CONFIG_VHOST) += vhost.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 000000000000..aefb9a61fa0f
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,510 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 IBM Corporation
> + * Author: Vitaly Mayatskikh <v.mayatskih@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/vhost.h>
> +#include <linux/fs.h>
> +#include "vhost.h"
> +
> +enum {
> + VHOST_BLK_FEATURES =
> + VHOST_FEATURES |
> + (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> + (1ULL << VIRTIO_RING_F_EVENT_IDX) |
> + (1ULL << VIRTIO_BLK_F_MQ)
> +};
> +
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
> +
> +enum {
> + VHOST_BLK_VQ_MAX = 16,
> + VHOST_BLK_VQ_MAX_REQS = 128,
> +};
> +
> +struct vhost_blk_req {
> + struct llist_node list;
> + int index;
> + struct vhost_blk_queue *q;
> + struct virtio_blk_outhdr hdr;
> + struct iovec *out_iov;
> + struct iovec *in_iov;
> + u8 out_num;
> + u8 in_num;
> + long len;
> + struct kiocb iocb;
> + struct iov_iter i;
> + int res;
> + void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> + int index;
> + struct vhost_blk *blk;
> + struct vhost_virtqueue vq;
> + struct vhost_work w;
> + struct llist_head wl;
> + struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
> +};
> +
> +struct vhost_blk {
> + struct vhost_dev dev;
> + struct file *backend;
> + int num_queues;
> + struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
> + struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
> +};
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> + int i;
> +
> + for (i = 0; i < blk->num_queues; i++)
> + vhost_poll_flush(&blk->queue[i].vq.poll);
> +}
> +
> +
> +static void vhost_blk_stop(struct vhost_blk *blk)
> +{
> + struct vhost_virtqueue *vq;
> + int i;
> +
> + for (i = 0; i < blk->num_queues; i++) {
> + vq = &blk->queue[i].vq;
> + mutex_lock(&vq->mutex);
> + rcu_assign_pointer(vq->private_data, NULL);
> + mutex_unlock(&vq->mutex);
> + }
> +}
> +
> +static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
> +{
> + int ret;
> + int len = req->len;
> +
> + pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
> + req->q->index, req->index, status, len);
> + ret = put_user(status, (unsigned char __user *)req->status);
I'd make it u8 and not unsigned char. Also why don't you change
req->status type so you don't need a cast?
> +
> + WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
> + req->q->index, req->index);
kernel warnings and debug messages that are guest-triggerable lead to
disk full errors on the host. applies elsewhere too. you want traces
instead.
> +
> + vhost_add_used(&req->q->vq, req->index, len);
this can fail too.
> +
> + return ret;
> +}
> +
> +static void vhost_blk_io_done_work(struct vhost_work *w)
> +{
> + struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
> + struct llist_node *node;
> + struct vhost_blk_req *req, *tmp;
> +
> + node = llist_del_all(&q->wl);
> + llist_for_each_entry_safe(req, tmp, node, list) {
> + vhost_blk_req_done(req, req->res);
> + }
> + vhost_signal(&q->blk->dev, &q->vq);
> +}
> +
> +static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> + struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
> + iocb);
> +
> + pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
> + req->q->index, req->index, ret, ret2);
> +
> + req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
> + llist_add(&req->list, &req->q->wl);
> + vhost_vq_work_queue(&req->q->vq, &req->q->w);
> +}
> +
> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> + struct vhost_blk *blk = req->q->blk;
> + struct vhost_virtqueue *vq = &req->q->vq;
> + int type = le32_to_cpu(req->hdr.type);
> + int ret;
> + u8 status;
> +
> + if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> + bool write = (type == VIRTIO_BLK_T_OUT);
> + int nr_seg = (write ? req->out_num : req->in_num) - 1;
> + unsigned long sector = le64_to_cpu(req->hdr.sector);
> + ssize_t len, rem_len;
> +
> + if (!req->q->blk->backend) {
> + vq_err(vq, "blk %p no backend!\n", req->q->blk);
> + ret = -EINVAL;
> + goto out_err;
> + }
> +
> + len = iov_length(&vq->iov[1], nr_seg);
> + pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
> + __func__, current->pid, current->comm,
> + write ? "WRITE" : "READ", req->hdr.sector, len);
> +
> + req->len = len;
> + rem_len = len;
> + iov_iter_init(&req->i, (write ? WRITE : READ),
> + write ? &req->out_iov[0] : &req->in_iov[0],
> + nr_seg, len);
> +
> + req->iocb.ki_pos = sector << 9;
> + req->iocb.ki_filp = blk->backend;
> + req->iocb.ki_complete = vhost_blk_iocb_complete;
> + req->iocb.ki_flags = IOCB_DIRECT;
> +
> + if (write)
> + ret = call_write_iter(blk->backend, &req->iocb,
> + &req->i);
> + else
> + ret = call_read_iter(blk->backend, &req->iocb,
> + &req->i);
> +
> + if (ret != -EIOCBQUEUED)
> + vhost_blk_iocb_complete(&req->iocb, ret, 0);
> +
> + ret = 0;
> + goto out;
> + }
> +
> + if (type == VIRTIO_BLK_T_GET_ID) {
> + char s[] = "vhost_blk";
Isn't this supposed to return the serial #?
> + size_t len = min_t(size_t, req->in_iov[0].iov_len,
> + strlen(s));
> +
> + ret = copy_to_user(req->in_iov[0].iov_base, s, len);
I don't think we should assume there's no scatter list here.
> + status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> + if (put_user(status, (unsigned char __user *)req->status)) {
> + ret = -EFAULT;
> + goto out_err;
> + }
> + vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
> + ret = 0;
> + goto out;
> + } else {
> + pr_warn("Unsupported request type %d\n", type);
> + vhost_discard_vq_desc(vq, 1);
> + ret = -EINVAL;
> + return ret;
> + }
> +out_err:
> + vhost_discard_vq_desc(vq, 1);
> +out:
> + return ret;
> +}
> +
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> + struct vhost_virtqueue *vq;
> + struct vhost_blk_queue *q;
> + struct vhost_blk *blk;
> + struct vhost_blk_req *req;
> + int in, out;
> + int head;
> +
> + vq = container_of(work, struct vhost_virtqueue, poll.work);
> + q = container_of(vq, struct vhost_blk_queue, vq);
> + blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> + vhost_disable_notify(&blk->dev, vq);
> + for (;;) {
> + in = out = -1;
> +
> + head = vhost_get_vq_desc(vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in, NULL, NULL);
> +
> + if (head < 0)
> + break;
> +
> + if (head == vq->num) {
> + if (vhost_enable_notify(&blk->dev, vq)) {
> + vhost_disable_notify(&blk->dev, vq);
> + continue;
> + }
> + break;
> + }
> +
> + req = &q->req[head];
> + req->index = head;
> + req->out_num = out;
> + req->in_num = in;
> + req->out_iov = &vq->iov[1];
> + req->in_iov = &vq->iov[out];
> + req->status = vq->iov[out + in - 1].iov_base;
Shouldn't we validate that there's actually an in?
> +
> + if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
> + sizeof(req->hdr))) {
> + vq_err(vq, "Failed to get block header!\n");
> + vhost_discard_vq_desc(vq, 1);
> + continue;
> + }
It's better to avoid assuming that header is in a single iov entry,
use an iterator.
> + if (vhost_blk_req_handle(req) < 0)
> + break;
> + }
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> + struct vhost_blk *blk;
> + struct vhost_blk_queue *q;
> + int i, j;
> +
> + blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
> + if (!blk)
> + return -ENOMEM;
> +
> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
> + q = &blk->queue[i];
> + q->index = i;
> + q->blk = blk;
> + q->vq.handle_kick = vhost_blk_handle_guest_kick;
> + vhost_work_init(&q->w, vhost_blk_io_done_work);
> + blk->vqs[i] = &q->vq;
> + for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
> + q->req[j].index = j;
> + q->req[j].q = q;
> + }
> + }
> + vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
> + VHOST_BLK_VQ_MAX);
> + file->private_data = blk;
> +
> + return 0;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> + struct vhost_blk *blk = f->private_data;
> +
> + vhost_blk_stop(blk);
> + mutex_lock(&blk->dev.mutex);
> + vhost_blk_flush(blk);
> + vhost_dev_stop(&blk->dev);
> + vhost_dev_cleanup(&blk->dev);
> + vhost_blk_flush(blk);
> +
> + if (blk->backend) {
> + fput(blk->backend);
> + blk->backend = NULL;
> + }
> +
> + mutex_unlock(&blk->dev.mutex);
> + kvfree(blk);
> +
> + return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> + int i;
> + int ret = -EFAULT;
> +
> + mutex_lock(&blk->dev.mutex);
> + if ((features & (1 << VHOST_F_LOG_ALL)) &&
> + !vhost_log_access_ok(&blk->dev))
> + goto out_unlock;
> +
> + if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> + if (vhost_init_device_iotlb(&blk->dev, true))
> + goto out_unlock;
> + }
> +
> + for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
> + struct vhost_virtqueue *vq = blk->vqs[i];
> +
> + mutex_lock(&vq->mutex);
> + vq->acked_features = features & VHOST_BLK_FEATURES;
> + mutex_unlock(&vq->mutex);
> + }
> + ret = 0;
> +out_unlock:
> + mutex_unlock(&blk->dev.mutex);
> +
> + return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> + long err;
> + struct vhost_umem *umem;
> +
> + mutex_lock(&blk->dev.mutex);
> + err = vhost_dev_check_owner(&blk->dev);
> + if (err)
> + goto done;
> + umem = vhost_dev_reset_owner_prepare();
> + if (!umem) {
> + err = -ENOMEM;
> + goto done;
> + }
> + vhost_blk_stop(blk);
> + vhost_blk_flush(blk);
> + vhost_dev_reset_owner(&blk->dev, umem);
> +done:
> + mutex_unlock(&blk->dev.mutex);
> + return err;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
> +{
> + struct file *backend;
> + int ret, i;
> + struct vhost_virtqueue *vq;
> +
> + mutex_lock(&blk->dev.mutex);
> + ret = vhost_dev_check_owner(&blk->dev);
> + if (ret)
> + goto out_dev;
> +
> + backend = fget(fd);
> + if (IS_ERR(backend)) {
> + ret = PTR_ERR(backend);
> + goto out_dev;
> + }
> +
> + if (backend == blk->backend) {
> + ret = 0;
> + goto out_file;
> + }
> +
> + if (blk->backend)
> + fput(blk->backend);
> + blk->backend = backend;
> + for (i = 0; i < blk->num_queues; i++) {
> + vq = &blk->queue[i].vq;
> + if (!vhost_vq_access_ok(vq)) {
> + ret = -EFAULT;
> + goto out_file;
> + }
> + mutex_lock(&vq->mutex);
> + rcu_assign_pointer(vq->private_data, backend);
> + ret = vhost_vq_init_access(vq);
> + mutex_unlock(&vq->mutex);
> + if (ret) {
> + pr_err("vhost_vq_init_access failed: %d\n", ret);
> + goto out_file;
> + }
> +
> + }
> + ret = 0;
> + goto out_dev;
> +out_file:
> + fput(backend);
> + blk->backend = NULL;
> +out_dev:
> + mutex_unlock(&blk->dev.mutex);
> + vhost_blk_flush(blk);
> + return ret;
> +}
> +
> +static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
> + void __user *argp)
> +{
> + long ret;
> +
> + mutex_lock(&blk->dev.mutex);
> + ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
> + if (ret == -ENOIOCTLCMD)
> + ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
> + else
> + vhost_blk_flush(blk);
> + mutex_unlock(&blk->dev.mutex);
> + return ret;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> + unsigned long arg)
> +{
> + struct vhost_blk *blk = f->private_data;
> + void __user *argp = (void __user *)arg;
> + int fd;
> + u64 __user *featurep = argp;
> + u64 features;
> + long ret;
> + struct vhost_vring_state s;
> +
> + switch (ioctl) {
> + case VHOST_SET_MEM_TABLE:
> + vhost_blk_stop(blk);
> + ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> + break;
> + case VHOST_SET_VRING_NUM:
> + if (copy_from_user(&s, argp, sizeof(s)))
> + return -EFAULT;
> + ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> + if (!ret)
> + blk->num_queues = s.index + 1;
> + break;
> + case VHOST_BLK_SET_BACKEND:
> + if (copy_from_user(&fd, argp, sizeof(fd)))
> + return -EFAULT;
> + ret = vhost_blk_set_backend(blk, fd);
> + break;
> + case VHOST_GET_FEATURES:
> + features = VHOST_BLK_FEATURES;
> + if (copy_to_user(featurep, &features, sizeof(features)))
> + return -EFAULT;
> + ret = 0;
> + break;
> + case VHOST_SET_FEATURES:
> + if (copy_from_user(&features, featurep, sizeof(features)))
> + return -EFAULT;
> + if (features & ~VHOST_BLK_FEATURES)
> + return -EOPNOTSUPP;
> + ret = vhost_blk_set_features(blk, features);
> + break;
> + case VHOST_RESET_OWNER:
> + ret = vhost_blk_reset_owner(blk);
> + break;
> + default:
> + ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> + break;
> + }
> + return ret;
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> + .owner = THIS_MODULE,
> + .open = vhost_blk_open,
> + .release = vhost_blk_release,
> + .llseek = noop_llseek,
> + .unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> + MISC_DYNAMIC_MINOR,
> + "vhost-blk",
> + &vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> + return misc_register(&vhost_blk_misc);
> +}
> +module_init(vhost_blk_init);
> +
> +static void vhost_blk_exit(void)
> +{
> + misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Vitaly Mayatskikh");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
> +MODULE_ALIAS("devname:vhost-blk");
> --
> 2.17.1
next prev parent reply other threads:[~2018-11-02 18:36 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-11-02 18:21 [PATCH 0/1] vhost: add vhost_blk driver Vitaly Mayatskikh
2018-11-02 18:21 ` [PATCH 1/1] Add " Vitaly Mayatskikh
2018-11-02 18:36 ` Michael S. Tsirkin
2018-11-02 18:36 ` Michael S. Tsirkin [this message]
2018-11-02 19:24 ` Vitaly Mayatskih
2018-11-02 20:38 ` Michael S. Tsirkin
2018-11-02 20:38 ` Michael S. Tsirkin
2018-11-03 2:50 ` kbuild test robot
2018-11-03 2:50 ` kbuild test robot
2018-11-03 2:50 ` kbuild test robot
2018-11-06 16:03 ` Stefan Hajnoczi
2018-11-06 19:47 ` Vitaly Mayatskih
2018-11-06 16:03 ` Stefan Hajnoczi
2018-11-02 18:26 ` [PATCH 0/1] vhost: add " Michael S. Tsirkin
2018-11-02 18:31 ` Vitaly Mayatskih
2018-11-05 14:02 ` Christian Borntraeger
2018-11-05 14:02 ` Christian Borntraeger
2018-11-05 14:21 ` Vitaly Mayatskih
2018-11-06 15:40 ` Stefan Hajnoczi
2018-11-06 15:40 ` Stefan Hajnoczi
2018-11-06 18:46 ` Denis Lunev
2018-11-06 20:08 ` Vitaly Mayatskih
2018-11-02 18:26 ` Michael S. Tsirkin
2018-11-04 11:57 ` Maxim Levitsky
2018-11-04 16:40 ` Vitaly Mayatskih
2018-11-05 11:55 ` Maxim Levitsky
2018-11-05 3:00 ` Jason Wang
2018-11-05 3:23 ` Vitaly Mayatskih
2018-11-06 2:45 ` Jason Wang
2018-11-06 2:56 ` Vitaly Mayatskih
2018-11-06 7:13 ` Christoph Hellwig
2018-11-06 7:13 ` Christoph Hellwig
2018-11-06 21:41 ` Paolo Bonzini
2018-11-06 21:41 ` Paolo Bonzini
2018-11-06 2:45 ` Jason Wang
2018-11-05 15:48 ` Christian Borntraeger
2018-11-05 15:48 ` Christian Borntraeger
2018-11-05 16:15 ` Vitaly Mayatskih
2018-11-06 15:21 ` Stefan Hajnoczi
2018-11-06 15:21 ` Stefan Hajnoczi
2018-11-05 3:00 ` Jason Wang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20181102142645-mutt-send-email-mst@kernel.org \
--to=mst@redhat.com \
--cc=jasowang@redhat.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=v.mayatskih@gmail.com \
--cc=virtualization@lists.linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.