* [PATCH 1/1] vhost-blk: Add vhost-blk support v4
@ 2012-10-15 9:57 Asias He
2012-11-07 12:30 ` Michael S. Tsirkin
2012-11-07 23:47 ` Rusty Russell
0 siblings, 2 replies; 4+ messages in thread
From: Asias He @ 2012-10-15 9:57 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, virtualization
vhost-blk is an in-kernel virito-blk device accelerator.
Due to lack of proper in-kernel AIO interface, this version converts
guest's I/O request to bio and use submit_bio() to submit I/O directly.
So this version any supports raw block device as guest's disk image,
e.g. /dev/sda, /dev/ram0. We can add file based image support to
vhost-blk once we have in-kernel AIO interface. There are some work in
progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:
http://marc.info/?l=linux-fsdevel&m=133312234313122
Performance evaluation:
-----------------------------
1) LKVM
Fio with libaio ioengine on Fusion IO device using kvm tool
IOPS Before After Improvement
seq-read 107 121 +13.0%
seq-write 130 179 +37.6%
rnd-read 102 122 +19.6%
rnd-write 125 159 +27.0%
2) QEMU
Fio with libaio ioengine on Fusion IO device using QEMU
IOPS Before After Improvement
seq-read 76 123 +61.8%
seq-write 139 173 +24.4%
rnd-read 73 120 +64.3%
rnd-write 75 156 +108.0%
Userspace bits:
-----------------------------
1) LKVM
The latest vhost-blk userspace bits for kvm tool can be found here:
git@github.com:asias/linux-kvm.git blk.vhost-blk
2) QEMU
The latest vhost-blk userspace prototype for QEMU can be found here:
git@github.com:asias/qemu.git blk.vhost-blk
Changes in v4:
- Mark req->status as userspace pointer
- Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
- Add if (need_resched()) schedule() in blk thread
- Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
- Use vq_err() instead of pr_warn()
- Fail un Unsupported request
- Add flush in vhost_blk_set_features()
Changes in v3:
- Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
- Check file passed by user is a raw block device file
Signed-off-by: Asias He <asias@redhat.com>
---
drivers/vhost/Kconfig | 1 +
drivers/vhost/Kconfig.blk | 10 +
drivers/vhost/Makefile | 2 +
drivers/vhost/blk.c | 677 ++++++++++++++++++++++++++++++++++++++++++++++
drivers/vhost/blk.h | 8 +
5 files changed, 698 insertions(+)
create mode 100644 drivers/vhost/Kconfig.blk
create mode 100644 drivers/vhost/blk.c
create mode 100644 drivers/vhost/blk.h
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 202bba6..acd8038 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -11,4 +11,5 @@ config VHOST_NET
if STAGING
source "drivers/vhost/Kconfig.tcm"
+source "drivers/vhost/Kconfig.blk"
endif
diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
new file mode 100644
index 0000000..ff8ab76
--- /dev/null
+++ b/drivers/vhost/Kconfig.blk
@@ -0,0 +1,10 @@
+config VHOST_BLK
+ tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
+ depends on BLOCK && EXPERIMENTAL && m
+ ---help---
+ This kernel module can be loaded in host kernel to accelerate
+ guest block with virtio_blk. Not to be confused with virtio_blk
+ module itself which needs to be loaded in guest kernel.
+
+ To compile this driver as a module, choose M here: the module will
+ be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053..1a8a4a5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
vhost_net-y := vhost.o net.o
obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 0000000..5c2b790
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (C) 2011 Taobao, Inc.
+ * Author: Liu Yuan <tailai.ly@taobao.com>
+ *
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/vhost.h>
+#include <linux/virtio_blk.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+
+#include "vhost.c"
+#include "vhost.h"
+#include "blk.h"
+
+/* The block header is in the first and separate buffer. */
+#define BLK_HDR 0
+
+static DEFINE_IDA(vhost_blk_index_ida);
+
+enum {
+ VHOST_BLK_VQ_REQ = 0,
+ VHOST_BLK_VQ_MAX = 1,
+};
+
+struct req_page_list {
+ struct page **pages;
+ int pages_nr;
+};
+
+struct vhost_blk_req {
+ struct llist_node llnode;
+ struct req_page_list *pl;
+ struct vhost_blk *blk;
+
+ struct iovec *iov;
+ int iov_nr;
+
+ struct bio **bio;
+ atomic_t bio_nr;
+
+ sector_t sector;
+ int write;
+ u16 head;
+ long len;
+
+ u8 __user *status;
+};
+
+struct vhost_blk {
+ struct task_struct *host_kick;
+ struct vhost_blk_req *reqs;
+ struct vhost_virtqueue vq;
+ struct llist_head llhead;
+ struct vhost_dev dev;
+ u16 reqs_nr;
+ int index;
+};
+
+static inline int iov_num_pages(struct iovec *iov)
+{
+ return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+ ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
+}
+
+static int vhost_blk_setup(struct vhost_blk *blk)
+{
+ blk->reqs_nr = blk->vq.num;
+
+ blk->reqs = kmalloc(sizeof(struct vhost_blk_req) * blk->reqs_nr,
+ GFP_KERNEL);
+ if (!blk->reqs)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status)
+{
+ struct vhost_blk *blk = req->blk;
+
+ if (__copy_to_user(req->status, &status, sizeof(status))) {
+ vq_err(&blk->vq, "Failed to write status\n");
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static void vhost_blk_enable_vq(struct vhost_blk *blk,
+ struct vhost_virtqueue *vq)
+{
+ wake_up_process(blk->host_kick);
+}
+
+static void vhost_blk_req_done(struct bio *bio, int err)
+{
+ struct vhost_blk_req *req = bio->bi_private;
+ struct vhost_blk *blk = req->blk;
+
+ if (err)
+ req->len = err;
+
+ if (atomic_dec_and_test(&req->bio_nr)) {
+ llist_add(&req->llnode, &blk->llhead);
+ wake_up_process(blk->host_kick);
+ }
+
+ bio_put(bio);
+}
+
+static void vhost_blk_req_umap(struct vhost_blk_req *req)
+{
+ struct req_page_list *pl;
+ int i, j;
+
+
+ if (!req->pl) {
+ kfree(req->bio);
+ return;
+ }
+
+ for (i = 0; i < req->iov_nr; i++) {
+ pl = &req->pl[i];
+ for (j = 0; j < pl->pages_nr; j++) {
+ if (!req->write)
+ set_page_dirty_lock(pl->pages[j]);
+ page_cache_release(pl->pages[j]);
+ }
+ }
+
+ kfree(req->pl);
+}
+
+static int vhost_blk_bio_make(struct vhost_blk_req *req,
+ struct block_device *bdev)
+{
+ int pages_nr_total, i, j, ret;
+ struct iovec *iov = req->iov;
+ int iov_nr = req->iov_nr;
+ struct page **pages, *page;
+ struct bio *bio = NULL;
+ int bio_nr = 0;
+
+ req->len = 0;
+ pages_nr_total = 0;
+ for (i = 0; i < iov_nr; i++) {
+ req->len += iov[i].iov_len;
+ pages_nr_total += iov_num_pages(&iov[i]);
+ }
+
+ if (unlikely(req->write == WRITE_FLUSH)) {
+ req->pl = NULL;
+ req->bio = kmalloc(sizeof(struct bio *), GFP_KERNEL);
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (!bio) {
+ kfree(req->bio);
+ return -ENOMEM;
+ }
+ bio->bi_sector = req->sector;
+ bio->bi_bdev = bdev;
+ bio->bi_private = req;
+ bio->bi_end_io = vhost_blk_req_done;
+ req->bio[bio_nr++] = bio;
+
+ goto out;
+ }
+
+ req->pl = kmalloc((iov_nr * sizeof(struct req_page_list)) +
+ (pages_nr_total * sizeof(struct page *)) +
+ (pages_nr_total * sizeof(struct bio *)),
+ GFP_KERNEL);
+ if (!req->pl)
+ return -ENOMEM;
+ pages = (struct page **)&req->pl[iov_nr];
+ req->bio = (struct bio **)&pages[pages_nr_total];
+
+ req->iov_nr = 0;
+ for (i = 0; i < iov_nr; i++) {
+ int pages_nr = iov_num_pages(&iov[i]);
+ unsigned long iov_base, iov_len;
+ struct req_page_list *pl;
+
+ iov_base = (unsigned long)iov[i].iov_base;
+ iov_len = (unsigned long)iov[i].iov_len;
+
+ ret = get_user_pages_fast(iov_base, pages_nr,
+ !req->write, pages);
+ if (ret != pages_nr)
+ goto fail;
+
+ req->iov_nr++;
+ pl = &req->pl[i];
+ pl->pages_nr = pages_nr;
+ pl->pages = pages;
+
+ for (j = 0; j < pages_nr; j++) {
+ unsigned int off, len;
+ page = pages[j];
+ off = iov_base & ~PAGE_MASK;
+ len = PAGE_SIZE - off;
+ if (len > iov_len)
+ len = iov_len;
+
+ while (!bio || bio_add_page(bio, page, len, off) <= 0) {
+ bio = bio_alloc(GFP_KERNEL, pages_nr);
+ if (!bio)
+ goto fail;
+ bio->bi_sector = req->sector;
+ bio->bi_bdev = bdev;
+ bio->bi_private = req;
+ bio->bi_end_io = vhost_blk_req_done;
+ req->bio[bio_nr++] = bio;
+ }
+ req->sector += len >> 9;
+ iov_base += len;
+ iov_len -= len;
+ }
+
+ pages += pages_nr;
+ }
+out:
+ atomic_set(&req->bio_nr, bio_nr);
+ return 0;
+
+fail:
+ for (i = 0; i < bio_nr; i++)
+ bio_put(req->bio[i]);
+ vhost_blk_req_umap(req);
+ return -ENOMEM;
+}
+
+static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
+{
+ struct blk_plug plug;
+ int i, bio_nr;
+
+ bio_nr = atomic_read(&req->bio_nr);
+ blk_start_plug(&plug);
+ for (i = 0; i < bio_nr; i++)
+ submit_bio(req->write, req->bio[i]);
+ blk_finish_plug(&plug);
+}
+
+static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file)
+{
+
+ struct inode *inode = file->f_mapping->host;
+ struct block_device *bdev = inode->i_bdev;
+ int ret;
+
+ ret = vhost_blk_bio_make(req, bdev);
+ if (ret < 0)
+ return ret;
+
+ vhost_blk_bio_send(req);
+
+ return ret;
+}
+
+static int vhost_blk_req_done_thread(void *data)
+{
+ mm_segment_t oldfs = get_fs();
+ struct vhost_blk *blk = data;
+ struct vhost_virtqueue *vq;
+ struct llist_node *llnode;
+ struct vhost_blk_req *req;
+ bool added;
+ u8 status;
+ int ret;
+
+ vq = &blk->vq;
+ set_fs(USER_DS);
+ use_mm(blk->dev.mm);
+ for (;;) {
+ llnode = llist_del_all(&blk->llhead);
+ if (!llnode) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ if (unlikely(kthread_should_stop()))
+ break;
+ continue;
+ }
+
+ if (need_resched())
+ schedule();
+
+ added = false;
+ while (llnode) {
+ req = llist_entry(llnode, struct vhost_blk_req, llnode);
+ llnode = llist_next(llnode);
+
+ vhost_blk_req_umap(req);
+
+ status = req->len > 0 ?
+ VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+ ret = vhost_blk_set_status(req, status);
+ if (unlikely(ret))
+ continue;
+ vhost_add_used(&blk->vq, req->head, req->len);
+ added = true;
+ }
+ if (likely(added))
+ vhost_signal(&blk->dev, &blk->vq);
+ }
+ unuse_mm(blk->dev.mm);
+ set_fs(oldfs);
+ return 0;
+}
+
+static void vhost_blk_flush(struct vhost_blk *blk)
+{
+ vhost_poll_flush(&blk->vq.poll);
+}
+
+static inline void vhost_blk_stop(struct vhost_blk *blk, struct file **file)
+{
+ struct vhost_virtqueue *vq = &blk->vq;
+ struct file *f;
+
+ mutex_lock(&vq->mutex);
+ f = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
+ rcu_assign_pointer(vq->private_data, NULL);
+ mutex_unlock(&vq->mutex);
+
+ *file = f;
+}
+
+/* Handle guest request */
+static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
+ struct virtio_blk_outhdr *hdr,
+ u16 head, u16 out, u16 in,
+ struct file *file)
+{
+ struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
+ struct vhost_blk_req *req;
+ int iov_nr, ret;
+ u8 status;
+
+ if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID)
+ iov_nr = in - 1;
+ else
+ iov_nr = out - 1;
+
+ req = &blk->reqs[head];
+ req->head = head;
+ req->blk = blk;
+ /* The block data buffer follows block header buffer */
+ req->iov = &vq->iov[BLK_HDR + 1];
+ req->iov_nr = iov_nr;
+ /* The block status buffer follows block data buffer */
+ req->status = vq->iov[iov_nr + 1].iov_base;
+ req->sector = hdr->sector;
+
+ switch (hdr->type) {
+ case VIRTIO_BLK_T_OUT:
+ req->write = WRITE;
+ ret = vhost_blk_req_submit(req, file);
+ break;
+ case VIRTIO_BLK_T_IN:
+ req->write = READ;
+ ret = vhost_blk_req_submit(req, file);
+ break;
+ case VIRTIO_BLK_T_FLUSH:
+ req->write = WRITE_FLUSH;
+ ret = vhost_blk_req_submit(req, file);
+ break;
+ case VIRTIO_BLK_T_GET_ID: {
+ char id[VIRTIO_BLK_ID_BYTES];
+ int len;
+
+ ret = snprintf(id, VIRTIO_BLK_ID_BYTES,
+ "vhost-blk%d", blk->index);
+ if (ret < 0)
+ break;
+ len = ret;
+ ret = __copy_to_user(req->iov[0].iov_base, id, len);
+ status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+ ret = vhost_blk_set_status(req, status);
+ if (ret)
+ break;
+ vhost_add_used_and_signal(&blk->dev, vq, head, len);
+ break;
+ }
+ default:
+ vq_err(vq, "Unsupported request type %d\n", hdr->type);
+ status = VIRTIO_BLK_S_UNSUPP;
+ ret = vhost_blk_set_status(req, status);
+ if (ret)
+ break;
+ vhost_add_used_and_signal(&blk->dev, vq, head, 0);
+ }
+
+ return ret;
+}
+
+/* Guest kick us for IO request */
+static void vhost_blk_handle_guest_kick(struct vhost_work *work)
+{
+ struct virtio_blk_outhdr hdr;
+ struct vhost_virtqueue *vq;
+ struct vhost_blk *blk;
+ struct blk_plug plug;
+ struct file *f;
+ int in, out;
+ u16 head;
+
+ vq = container_of(work, struct vhost_virtqueue, poll.work);
+ blk = container_of(vq->dev, struct vhost_blk, dev);
+
+ /* TODO: check that we are running from vhost_worker? */
+ f = rcu_dereference_check(vq->private_data, 1);
+ if (!f)
+ return;
+
+ vhost_disable_notify(&blk->dev, vq);
+ blk_start_plug(&plug);
+ for (;;) {
+ head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (unlikely(head < 0))
+ break;
+
+ if (unlikely(head == vq->num)) {
+ if (unlikely(vhost_enable_notify(&blk->dev, vq))) {
+ vhost_disable_notify(&blk->dev, vq);
+ continue;
+ }
+ break;
+ }
+
+ if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base,
+ sizeof(hdr)))) {
+ vq_err(vq, "Failed to get block header!\n");
+ vhost_discard_vq_desc(vq, 1);
+ break;
+ }
+
+ if (vhost_blk_req_handle(vq, &hdr, head, out, in, f) < 0)
+ break;
+ }
+ blk_finish_plug(&plug);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *file)
+{
+ struct vhost_blk *blk;
+ int ret;
+
+ blk = kzalloc(sizeof(*blk), GFP_KERNEL);
+ if (!blk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = ida_simple_get(&vhost_blk_index_ida, 0, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto out_dev;
+ blk->index = ret;
+
+ blk->vq.handle_kick = vhost_blk_handle_guest_kick;
+
+ ret = vhost_dev_init(&blk->dev, &blk->vq, VHOST_BLK_VQ_MAX);
+ if (ret < 0)
+ goto out_dev;
+ file->private_data = blk;
+
+ blk->host_kick = kthread_create(vhost_blk_req_done_thread,
+ blk, "vhost-blk-%d", current->pid);
+ if (IS_ERR(blk->host_kick)) {
+ ret = PTR_ERR(blk->host_kick);
+ goto out_dev;
+ }
+
+ return ret;
+out_dev:
+ kfree(blk);
+out:
+ return ret;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+ struct vhost_blk *blk = f->private_data;
+ struct file *file;
+
+ ida_simple_remove(&vhost_blk_index_ida, blk->index);
+ vhost_blk_stop(blk, &file);
+ vhost_blk_flush(blk);
+ vhost_dev_cleanup(&blk->dev, false);
+ if (file)
+ fput(file);
+ kthread_stop(blk->host_kick);
+ kfree(blk->reqs);
+ kfree(blk);
+
+ return 0;
+}
+
+static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
+{
+ mutex_lock(&blk->dev.mutex);
+ blk->dev.acked_features = features;
+ vhost_blk_flush(blk);
+ mutex_unlock(&blk->dev.mutex);
+
+ return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *blk, unsigned index, int fd)
+{
+ struct vhost_virtqueue *vq = &blk->vq;
+ struct file *file, *oldfile;
+ struct inode *inode;
+ int ret;
+
+ mutex_lock(&blk->dev.mutex);
+ ret = vhost_dev_check_owner(&blk->dev);
+ if (ret)
+ goto out_dev;
+
+ if (index >= VHOST_BLK_VQ_MAX) {
+ ret = -ENOBUFS;
+ goto out_dev;
+ }
+
+ mutex_lock(&vq->mutex);
+
+ if (!vhost_vq_access_ok(vq)) {
+ ret = -EFAULT;
+ goto out_vq;
+ }
+
+ file = fget(fd);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto out_vq;
+ }
+
+ /* Only raw block device is supported for now */
+ inode = file->f_mapping->host;
+ if (!S_ISBLK(inode->i_mode)) {
+ ret = -EFAULT;
+ goto out_file;
+ }
+
+ oldfile = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
+ if (file != oldfile) {
+ rcu_assign_pointer(vq->private_data, file);
+ vhost_blk_enable_vq(blk, vq);
+
+ ret = vhost_init_used(vq);
+ if (ret)
+ goto out_file;
+ }
+
+ mutex_unlock(&vq->mutex);
+
+ if (oldfile) {
+ vhost_blk_flush(blk);
+ fput(oldfile);
+ }
+
+ mutex_unlock(&blk->dev.mutex);
+ return 0;
+
+out_file:
+ fput(file);
+out_vq:
+ mutex_unlock(&vq->mutex);
+out_dev:
+ mutex_unlock(&blk->dev.mutex);
+ return ret;
+}
+
+static long vhost_blk_reset_owner(struct vhost_blk *blk)
+{
+ struct file *file = NULL;
+ int err;
+
+ mutex_lock(&blk->dev.mutex);
+ err = vhost_dev_check_owner(&blk->dev);
+ if (err)
+ goto done;
+ vhost_blk_stop(blk, &file);
+ vhost_blk_flush(blk);
+ err = vhost_dev_reset_owner(&blk->dev);
+done:
+ mutex_unlock(&blk->dev.mutex);
+ if (file)
+ fput(file);
+ return err;
+}
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct vhost_blk *blk = f->private_data;
+ void __user *argp = (void __user *)arg;
+ struct vhost_vring_file backend;
+ u64 __user *featurep = argp;
+ u64 features;
+ int ret;
+
+ switch (ioctl) {
+ case VHOST_BLK_SET_BACKEND:
+ if (copy_from_user(&backend, argp, sizeof(backend)))
+ return -EFAULT;
+ return vhost_blk_set_backend(blk, backend.index, backend.fd);
+ case VHOST_GET_FEATURES:
+ features = VHOST_BLK_FEATURES;
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+ return 0;
+ case VHOST_SET_FEATURES:
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+ if (features & ~VHOST_BLK_FEATURES)
+ return -EOPNOTSUPP;
+ return vhost_blk_set_features(blk, features);
+ case VHOST_RESET_OWNER:
+ return vhost_blk_reset_owner(blk);
+ default:
+ mutex_lock(&blk->dev.mutex);
+ ret = vhost_dev_ioctl(&blk->dev, ioctl, arg);
+ if (!ret && ioctl == VHOST_SET_VRING_NUM)
+ ret = vhost_blk_setup(blk);
+ vhost_blk_flush(blk);
+ mutex_unlock(&blk->dev.mutex);
+ return ret;
+ }
+}
+
+static const struct file_operations vhost_blk_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_blk_open,
+ .release = vhost_blk_release,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+ MISC_DYNAMIC_MINOR,
+ "vhost-blk",
+ &vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+ return misc_register(&vhost_blk_misc);
+}
+
+static void vhost_blk_exit(void)
+{
+ misc_deregister(&vhost_blk_misc);
+}
+
+module_init(vhost_blk_init);
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.3");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk");
diff --git a/drivers/vhost/blk.h b/drivers/vhost/blk.h
new file mode 100644
index 0000000..2f674f0
--- /dev/null
+++ b/drivers/vhost/blk.h
@@ -0,0 +1,8 @@
+#include <linux/vhost.h>
+
+enum {
+ VHOST_BLK_FEATURES = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+ (1ULL << VIRTIO_RING_F_EVENT_IDX),
+};
+/* VHOST_BLK specific defines */
+#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, struct vhost_vring_file)
--
1.7.11.7
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH 1/1] vhost-blk: Add vhost-blk support v4
2012-10-15 9:57 [PATCH 1/1] vhost-blk: Add vhost-blk support v4 Asias He
@ 2012-11-07 12:30 ` Michael S. Tsirkin
2012-11-07 23:47 ` Rusty Russell
1 sibling, 0 replies; 4+ messages in thread
From: Michael S. Tsirkin @ 2012-11-07 12:30 UTC (permalink / raw)
To: Asias He; +Cc: axboe, kvm, linux-kernel, virtualization, hch, davem
On Mon, Oct 15, 2012 at 05:57:03PM +0800, Asias He wrote:
> vhost-blk is an in-kernel virito-blk device accelerator.
>
> Due to lack of proper in-kernel AIO interface, this version converts
> guest's I/O request to bio and use submit_bio() to submit I/O directly.
> So this version any supports raw block device as guest's disk image,
> e.g. /dev/sda, /dev/ram0. We can add file based image support to
> vhost-blk once we have in-kernel AIO interface. There are some work in
> progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:
>
> http://marc.info/?l=linux-fsdevel&m=133312234313122
>
> Performance evaluation:
> -----------------------------
> 1) LKVM
> Fio with libaio ioengine on Fusion IO device using kvm tool
> IOPS Before After Improvement
> seq-read 107 121 +13.0%
> seq-write 130 179 +37.6%
> rnd-read 102 122 +19.6%
> rnd-write 125 159 +27.0%
>
> 2) QEMU
> Fio with libaio ioengine on Fusion IO device using QEMU
> IOPS Before After Improvement
> seq-read 76 123 +61.8%
> seq-write 139 173 +24.4%
> rnd-read 73 120 +64.3%
> rnd-write 75 156 +108.0%
>
> Userspace bits:
> -----------------------------
> 1) LKVM
> The latest vhost-blk userspace bits for kvm tool can be found here:
> git@github.com:asias/linux-kvm.git blk.vhost-blk
>
> 2) QEMU
> The latest vhost-blk userspace prototype for QEMU can be found here:
> git@github.com:asias/qemu.git blk.vhost-blk
>
> Changes in v4:
> - Mark req->status as userspace pointer
> - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
> - Add if (need_resched()) schedule() in blk thread
> - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
> - Use vq_err() instead of pr_warn()
> - Fail un Unsupported request
> - Add flush in vhost_blk_set_features()
>
> Changes in v3:
> - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
> - Check file passed by user is a raw block device file
>
> Signed-off-by: Asias He <asias@redhat.com>
Cc linux-kernel@vger.kernel.org, Jens, hch, davem.
I wonder what do others think. Christoph - any comments?
Also, what's the best way to merge this?
To avoid conflicts when vhost APIs change
(as that will be for 3.9) I can take this on the vhost tree
which is normally merged by Dave Miller.
Is this OK with everyone? Any acks?
> ---
> drivers/vhost/Kconfig | 1 +
> drivers/vhost/Kconfig.blk | 10 +
> drivers/vhost/Makefile | 2 +
> drivers/vhost/blk.c | 677 ++++++++++++++++++++++++++++++++++++++++++++++
> drivers/vhost/blk.h | 8 +
> 5 files changed, 698 insertions(+)
> create mode 100644 drivers/vhost/Kconfig.blk
> create mode 100644 drivers/vhost/blk.c
> create mode 100644 drivers/vhost/blk.h
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index 202bba6..acd8038 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -11,4 +11,5 @@ config VHOST_NET
>
> if STAGING
> source "drivers/vhost/Kconfig.tcm"
> +source "drivers/vhost/Kconfig.blk"
> endif
> diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
> new file mode 100644
> index 0000000..ff8ab76
> --- /dev/null
> +++ b/drivers/vhost/Kconfig.blk
> @@ -0,0 +1,10 @@
> +config VHOST_BLK
> + tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> + depends on BLOCK && EXPERIMENTAL && m
> + ---help---
> + This kernel module can be loaded in host kernel to accelerate
> + guest block with virtio_blk. Not to be confused with virtio_blk
> + module itself which needs to be loaded in guest kernel.
> +
> + To compile this driver as a module, choose M here: the module will
> + be called vhost_blk.
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index a27b053..1a8a4a5 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
> vhost_net-y := vhost.o net.o
>
> obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 0000000..5c2b790
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,677 @@
> +/*
> + * Copyright (C) 2011 Taobao, Inc.
> + * Author: Liu Yuan <tailai.ly@taobao.com>
> + *
> + * Copyright (C) 2012 Red Hat, Inc.
> + * Author: Asias He <asias@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/miscdevice.h>
> +#include <linux/module.h>
> +#include <linux/vhost.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/mutex.h>
> +#include <linux/file.h>
> +#include <linux/kthread.h>
> +#include <linux/blkdev.h>
> +
> +#include "vhost.c"
> +#include "vhost.h"
> +#include "blk.h"
> +
> +/* The block header is in the first and separate buffer. */
> +#define BLK_HDR 0
> +
> +static DEFINE_IDA(vhost_blk_index_ida);
> +
> +enum {
> + VHOST_BLK_VQ_REQ = 0,
> + VHOST_BLK_VQ_MAX = 1,
> +};
> +
> +struct req_page_list {
> + struct page **pages;
> + int pages_nr;
> +};
> +
> +struct vhost_blk_req {
> + struct llist_node llnode;
> + struct req_page_list *pl;
> + struct vhost_blk *blk;
> +
> + struct iovec *iov;
> + int iov_nr;
> +
> + struct bio **bio;
> + atomic_t bio_nr;
> +
> + sector_t sector;
> + int write;
> + u16 head;
> + long len;
> +
> + u8 __user *status;
> +};
> +
> +struct vhost_blk {
> + struct task_struct *host_kick;
> + struct vhost_blk_req *reqs;
> + struct vhost_virtqueue vq;
> + struct llist_head llhead;
> + struct vhost_dev dev;
> + u16 reqs_nr;
> + int index;
> +};
> +
> +static inline int iov_num_pages(struct iovec *iov)
> +{
> + return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
> + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> +}
> +
> +static int vhost_blk_setup(struct vhost_blk *blk)
> +{
> + blk->reqs_nr = blk->vq.num;
> +
> + blk->reqs = kmalloc(sizeof(struct vhost_blk_req) * blk->reqs_nr,
> + GFP_KERNEL);
> + if (!blk->reqs)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status)
> +{
> + struct vhost_blk *blk = req->blk;
> +
> + if (__copy_to_user(req->status, &status, sizeof(status))) {
> + vq_err(&blk->vq, "Failed to write status\n");
> + return -EFAULT;
> + }
> +
> + return 0;
> +}
> +
> +static void vhost_blk_enable_vq(struct vhost_blk *blk,
> + struct vhost_virtqueue *vq)
> +{
> + wake_up_process(blk->host_kick);
> +}
> +
> +static void vhost_blk_req_done(struct bio *bio, int err)
> +{
> + struct vhost_blk_req *req = bio->bi_private;
> + struct vhost_blk *blk = req->blk;
> +
> + if (err)
> + req->len = err;
> +
> + if (atomic_dec_and_test(&req->bio_nr)) {
> + llist_add(&req->llnode, &blk->llhead);
> + wake_up_process(blk->host_kick);
> + }
> +
> + bio_put(bio);
> +}
> +
> +static void vhost_blk_req_umap(struct vhost_blk_req *req)
> +{
> + struct req_page_list *pl;
> + int i, j;
> +
> +
> + if (!req->pl) {
> + kfree(req->bio);
> + return;
> + }
> +
> + for (i = 0; i < req->iov_nr; i++) {
> + pl = &req->pl[i];
> + for (j = 0; j < pl->pages_nr; j++) {
> + if (!req->write)
> + set_page_dirty_lock(pl->pages[j]);
> + page_cache_release(pl->pages[j]);
> + }
> + }
> +
> + kfree(req->pl);
> +}
> +
> +static int vhost_blk_bio_make(struct vhost_blk_req *req,
> + struct block_device *bdev)
> +{
> + int pages_nr_total, i, j, ret;
> + struct iovec *iov = req->iov;
> + int iov_nr = req->iov_nr;
> + struct page **pages, *page;
> + struct bio *bio = NULL;
> + int bio_nr = 0;
> +
> + req->len = 0;
> + pages_nr_total = 0;
> + for (i = 0; i < iov_nr; i++) {
> + req->len += iov[i].iov_len;
> + pages_nr_total += iov_num_pages(&iov[i]);
> + }
> +
> + if (unlikely(req->write == WRITE_FLUSH)) {
> + req->pl = NULL;
> + req->bio = kmalloc(sizeof(struct bio *), GFP_KERNEL);
> + bio = bio_alloc(GFP_KERNEL, 1);
> + if (!bio) {
> + kfree(req->bio);
> + return -ENOMEM;
> + }
> + bio->bi_sector = req->sector;
> + bio->bi_bdev = bdev;
> + bio->bi_private = req;
> + bio->bi_end_io = vhost_blk_req_done;
> + req->bio[bio_nr++] = bio;
> +
> + goto out;
> + }
> +
> + req->pl = kmalloc((iov_nr * sizeof(struct req_page_list)) +
> + (pages_nr_total * sizeof(struct page *)) +
> + (pages_nr_total * sizeof(struct bio *)),
> + GFP_KERNEL);
> + if (!req->pl)
> + return -ENOMEM;
> + pages = (struct page **)&req->pl[iov_nr];
> + req->bio = (struct bio **)&pages[pages_nr_total];
> +
> + req->iov_nr = 0;
> + for (i = 0; i < iov_nr; i++) {
> + int pages_nr = iov_num_pages(&iov[i]);
> + unsigned long iov_base, iov_len;
> + struct req_page_list *pl;
> +
> + iov_base = (unsigned long)iov[i].iov_base;
> + iov_len = (unsigned long)iov[i].iov_len;
> +
> + ret = get_user_pages_fast(iov_base, pages_nr,
> + !req->write, pages);
> + if (ret != pages_nr)
> + goto fail;
> +
> + req->iov_nr++;
> + pl = &req->pl[i];
> + pl->pages_nr = pages_nr;
> + pl->pages = pages;
> +
> + for (j = 0; j < pages_nr; j++) {
> + unsigned int off, len;
> + page = pages[j];
> + off = iov_base & ~PAGE_MASK;
> + len = PAGE_SIZE - off;
> + if (len > iov_len)
> + len = iov_len;
> +
> + while (!bio || bio_add_page(bio, page, len, off) <= 0) {
> + bio = bio_alloc(GFP_KERNEL, pages_nr);
> + if (!bio)
> + goto fail;
> + bio->bi_sector = req->sector;
> + bio->bi_bdev = bdev;
> + bio->bi_private = req;
> + bio->bi_end_io = vhost_blk_req_done;
> + req->bio[bio_nr++] = bio;
> + }
> + req->sector += len >> 9;
> + iov_base += len;
> + iov_len -= len;
> + }
> +
> + pages += pages_nr;
> + }
> +out:
> + atomic_set(&req->bio_nr, bio_nr);
> + return 0;
> +
> +fail:
> + for (i = 0; i < bio_nr; i++)
> + bio_put(req->bio[i]);
> + vhost_blk_req_umap(req);
> + return -ENOMEM;
> +}
> +
> +static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
> +{
> + struct blk_plug plug;
> + int i, bio_nr;
> +
> + bio_nr = atomic_read(&req->bio_nr);
> + blk_start_plug(&plug);
> + for (i = 0; i < bio_nr; i++)
> + submit_bio(req->write, req->bio[i]);
> + blk_finish_plug(&plug);
> +}
> +
> +static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file)
> +{
> +
> + struct inode *inode = file->f_mapping->host;
> + struct block_device *bdev = inode->i_bdev;
> + int ret;
> +
> + ret = vhost_blk_bio_make(req, bdev);
> + if (ret < 0)
> + return ret;
> +
> + vhost_blk_bio_send(req);
> +
> + return ret;
> +}
> +
> +static int vhost_blk_req_done_thread(void *data)
> +{
> + mm_segment_t oldfs = get_fs();
> + struct vhost_blk *blk = data;
> + struct vhost_virtqueue *vq;
> + struct llist_node *llnode;
> + struct vhost_blk_req *req;
> + bool added;
> + u8 status;
> + int ret;
> +
> + vq = &blk->vq;
> + set_fs(USER_DS);
> + use_mm(blk->dev.mm);
> + for (;;) {
> + llnode = llist_del_all(&blk->llhead);
> + if (!llnode) {
> + set_current_state(TASK_INTERRUPTIBLE);
> + schedule();
> + if (unlikely(kthread_should_stop()))
> + break;
> + continue;
> + }
> +
> + if (need_resched())
> + schedule();
> +
> + added = false;
> + while (llnode) {
> + req = llist_entry(llnode, struct vhost_blk_req, llnode);
> + llnode = llist_next(llnode);
> +
> + vhost_blk_req_umap(req);
> +
> + status = req->len > 0 ?
> + VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
> + ret = vhost_blk_set_status(req, status);
> + if (unlikely(ret))
> + continue;
> + vhost_add_used(&blk->vq, req->head, req->len);
> + added = true;
> + }
> + if (likely(added))
> + vhost_signal(&blk->dev, &blk->vq);
> + }
> + unuse_mm(blk->dev.mm);
> + set_fs(oldfs);
> + return 0;
> +}
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> + vhost_poll_flush(&blk->vq.poll);
> +}
> +
> +static inline void vhost_blk_stop(struct vhost_blk *blk, struct file **file)
> +{
> + struct vhost_virtqueue *vq = &blk->vq;
> + struct file *f;
> +
> + mutex_lock(&vq->mutex);
> + f = rcu_dereference_protected(vq->private_data,
> + lockdep_is_held(&vq->mutex));
> + rcu_assign_pointer(vq->private_data, NULL);
> + mutex_unlock(&vq->mutex);
> +
> + *file = f;
> +}
> +
> +/* Handle guest request */
> +static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
> + struct virtio_blk_outhdr *hdr,
> + u16 head, u16 out, u16 in,
> + struct file *file)
> +{
> + struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
> + struct vhost_blk_req *req;
> + int iov_nr, ret;
> + u8 status;
> +
> + if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID)
> + iov_nr = in - 1;
> + else
> + iov_nr = out - 1;
> +
> + req = &blk->reqs[head];
> + req->head = head;
> + req->blk = blk;
> + /* The block data buffer follows block header buffer */
> + req->iov = &vq->iov[BLK_HDR + 1];
> + req->iov_nr = iov_nr;
> + /* The block status buffer follows block data buffer */
> + req->status = vq->iov[iov_nr + 1].iov_base;
> + req->sector = hdr->sector;
> +
> + switch (hdr->type) {
> + case VIRTIO_BLK_T_OUT:
> + req->write = WRITE;
> + ret = vhost_blk_req_submit(req, file);
> + break;
> + case VIRTIO_BLK_T_IN:
> + req->write = READ;
> + ret = vhost_blk_req_submit(req, file);
> + break;
> + case VIRTIO_BLK_T_FLUSH:
> + req->write = WRITE_FLUSH;
> + ret = vhost_blk_req_submit(req, file);
> + break;
> + case VIRTIO_BLK_T_GET_ID: {
> + char id[VIRTIO_BLK_ID_BYTES];
> + int len;
> +
> + ret = snprintf(id, VIRTIO_BLK_ID_BYTES,
> + "vhost-blk%d", blk->index);
> + if (ret < 0)
> + break;
> + len = ret;
> + ret = __copy_to_user(req->iov[0].iov_base, id, len);
> + status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> + ret = vhost_blk_set_status(req, status);
> + if (ret)
> + break;
> + vhost_add_used_and_signal(&blk->dev, vq, head, len);
> + break;
> + }
> + default:
> + vq_err(vq, "Unsupported request type %d\n", hdr->type);
> + status = VIRTIO_BLK_S_UNSUPP;
> + ret = vhost_blk_set_status(req, status);
> + if (ret)
> + break;
> + vhost_add_used_and_signal(&blk->dev, vq, head, 0);
> + }
> +
> + return ret;
> +}
> +
> +/* Guest kick us for IO request */
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> + struct virtio_blk_outhdr hdr;
> + struct vhost_virtqueue *vq;
> + struct vhost_blk *blk;
> + struct blk_plug plug;
> + struct file *f;
> + int in, out;
> + u16 head;
> +
> + vq = container_of(work, struct vhost_virtqueue, poll.work);
> + blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> + /* TODO: check that we are running from vhost_worker? */
> + f = rcu_dereference_check(vq->private_data, 1);
> + if (!f)
> + return;
> +
> + vhost_disable_notify(&blk->dev, vq);
> + blk_start_plug(&plug);
> + for (;;) {
> + head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in, NULL, NULL);
> + if (unlikely(head < 0))
> + break;
> +
> + if (unlikely(head == vq->num)) {
> + if (unlikely(vhost_enable_notify(&blk->dev, vq))) {
> + vhost_disable_notify(&blk->dev, vq);
> + continue;
> + }
> + break;
> + }
> +
> + if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base,
> + sizeof(hdr)))) {
> + vq_err(vq, "Failed to get block header!\n");
> + vhost_discard_vq_desc(vq, 1);
> + break;
> + }
> +
> + if (vhost_blk_req_handle(vq, &hdr, head, out, in, f) < 0)
> + break;
> + }
> + blk_finish_plug(&plug);
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> + struct vhost_blk *blk;
> + int ret;
> +
> + blk = kzalloc(sizeof(*blk), GFP_KERNEL);
> + if (!blk) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + ret = ida_simple_get(&vhost_blk_index_ida, 0, 0, GFP_KERNEL);
> + if (ret < 0)
> + goto out_dev;
> + blk->index = ret;
> +
> + blk->vq.handle_kick = vhost_blk_handle_guest_kick;
> +
> + ret = vhost_dev_init(&blk->dev, &blk->vq, VHOST_BLK_VQ_MAX);
> + if (ret < 0)
> + goto out_dev;
> + file->private_data = blk;
> +
> + blk->host_kick = kthread_create(vhost_blk_req_done_thread,
> + blk, "vhost-blk-%d", current->pid);
> + if (IS_ERR(blk->host_kick)) {
> + ret = PTR_ERR(blk->host_kick);
> + goto out_dev;
> + }
> +
> + return ret;
> +out_dev:
> + kfree(blk);
> +out:
> + return ret;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> + struct vhost_blk *blk = f->private_data;
> + struct file *file;
> +
> + ida_simple_remove(&vhost_blk_index_ida, blk->index);
> + vhost_blk_stop(blk, &file);
> + vhost_blk_flush(blk);
> + vhost_dev_cleanup(&blk->dev, false);
> + if (file)
> + fput(file);
> + kthread_stop(blk->host_kick);
> + kfree(blk->reqs);
> + kfree(blk);
> +
> + return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> + mutex_lock(&blk->dev.mutex);
> + blk->dev.acked_features = features;
> + vhost_blk_flush(blk);
> + mutex_unlock(&blk->dev.mutex);
> +
> + return 0;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, unsigned index, int fd)
> +{
> + struct vhost_virtqueue *vq = &blk->vq;
> + struct file *file, *oldfile;
> + struct inode *inode;
> + int ret;
> +
> + mutex_lock(&blk->dev.mutex);
> + ret = vhost_dev_check_owner(&blk->dev);
> + if (ret)
> + goto out_dev;
> +
> + if (index >= VHOST_BLK_VQ_MAX) {
> + ret = -ENOBUFS;
> + goto out_dev;
> + }
> +
> + mutex_lock(&vq->mutex);
> +
> + if (!vhost_vq_access_ok(vq)) {
> + ret = -EFAULT;
> + goto out_vq;
> + }
> +
> + file = fget(fd);
> + if (IS_ERR(file)) {
> + ret = PTR_ERR(file);
> + goto out_vq;
> + }
> +
> + /* Only raw block device is supported for now */
> + inode = file->f_mapping->host;
> + if (!S_ISBLK(inode->i_mode)) {
> + ret = -EFAULT;
> + goto out_file;
> + }
> +
> + oldfile = rcu_dereference_protected(vq->private_data,
> + lockdep_is_held(&vq->mutex));
> + if (file != oldfile) {
> + rcu_assign_pointer(vq->private_data, file);
> + vhost_blk_enable_vq(blk, vq);
> +
> + ret = vhost_init_used(vq);
> + if (ret)
> + goto out_file;
> + }
> +
> + mutex_unlock(&vq->mutex);
> +
> + if (oldfile) {
> + vhost_blk_flush(blk);
> + fput(oldfile);
> + }
> +
> + mutex_unlock(&blk->dev.mutex);
> + return 0;
> +
> +out_file:
> + fput(file);
> +out_vq:
> + mutex_unlock(&vq->mutex);
> +out_dev:
> + mutex_unlock(&blk->dev.mutex);
> + return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> + struct file *file = NULL;
> + int err;
> +
> + mutex_lock(&blk->dev.mutex);
> + err = vhost_dev_check_owner(&blk->dev);
> + if (err)
> + goto done;
> + vhost_blk_stop(blk, &file);
> + vhost_blk_flush(blk);
> + err = vhost_dev_reset_owner(&blk->dev);
> +done:
> + mutex_unlock(&blk->dev.mutex);
> + if (file)
> + fput(file);
> + return err;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> + unsigned long arg)
> +{
> + struct vhost_blk *blk = f->private_data;
> + void __user *argp = (void __user *)arg;
> + struct vhost_vring_file backend;
> + u64 __user *featurep = argp;
> + u64 features;
> + int ret;
> +
> + switch (ioctl) {
> + case VHOST_BLK_SET_BACKEND:
> + if (copy_from_user(&backend, argp, sizeof(backend)))
> + return -EFAULT;
> + return vhost_blk_set_backend(blk, backend.index, backend.fd);
> + case VHOST_GET_FEATURES:
> + features = VHOST_BLK_FEATURES;
> + if (copy_to_user(featurep, &features, sizeof(features)))
> + return -EFAULT;
> + return 0;
> + case VHOST_SET_FEATURES:
> + if (copy_from_user(&features, featurep, sizeof(features)))
> + return -EFAULT;
> + if (features & ~VHOST_BLK_FEATURES)
> + return -EOPNOTSUPP;
> + return vhost_blk_set_features(blk, features);
> + case VHOST_RESET_OWNER:
> + return vhost_blk_reset_owner(blk);
> + default:
> + mutex_lock(&blk->dev.mutex);
> + ret = vhost_dev_ioctl(&blk->dev, ioctl, arg);
> + if (!ret && ioctl == VHOST_SET_VRING_NUM)
> + ret = vhost_blk_setup(blk);
> + vhost_blk_flush(blk);
> + mutex_unlock(&blk->dev.mutex);
> + return ret;
> + }
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> + .owner = THIS_MODULE,
> + .open = vhost_blk_open,
> + .release = vhost_blk_release,
> + .llseek = noop_llseek,
> + .unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> + MISC_DYNAMIC_MINOR,
> + "vhost-blk",
> + &vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> + return misc_register(&vhost_blk_misc);
> +}
> +
> +static void vhost_blk_exit(void)
> +{
> + misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_init(vhost_blk_init);
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("0.0.3");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Asias He");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk");
> diff --git a/drivers/vhost/blk.h b/drivers/vhost/blk.h
> new file mode 100644
> index 0000000..2f674f0
> --- /dev/null
> +++ b/drivers/vhost/blk.h
> @@ -0,0 +1,8 @@
> +#include <linux/vhost.h>
> +
> +enum {
> + VHOST_BLK_FEATURES = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> + (1ULL << VIRTIO_RING_F_EVENT_IDX),
> +};
> +/* VHOST_BLK specific defines */
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, struct vhost_vring_file)
> --
> 1.7.11.7
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 1/1] vhost-blk: Add vhost-blk support v4
2012-10-15 9:57 [PATCH 1/1] vhost-blk: Add vhost-blk support v4 Asias He
2012-11-07 12:30 ` Michael S. Tsirkin
@ 2012-11-07 23:47 ` Rusty Russell
2012-11-08 7:16 ` Asias He
1 sibling, 1 reply; 4+ messages in thread
From: Rusty Russell @ 2012-11-07 23:47 UTC (permalink / raw)
To: Asias He, Michael S. Tsirkin; +Cc: kvm, virtualization
Asias He <asias@redhat.com> writes:
> vhost-blk is an in-kernel virito-blk device accelerator.
>
> Due to lack of proper in-kernel AIO interface, this version converts
> guest's I/O request to bio and use submit_bio() to submit I/O directly.
> So this version any supports raw block device as guest's disk image,
> e.g. /dev/sda, /dev/ram0. We can add file based image support to
> vhost-blk once we have in-kernel AIO interface. There are some work in
> progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:
>
> http://marc.info/?l=linux-fsdevel&m=133312234313122
OK, this generally looks quite neat. There is one significant bug,
however:
> +/* The block header is in the first and separate buffer. */
> +#define BLK_HDR 0
You need to do a proper pull off the iovec; you can't simply assume
this. I'm working on fixing qemu, too.
linux/drivers/vhost/net.c simply skips the header, you want something
which actually copies it from userspace:
/* Returns 0, -EFAULT or -EINVAL (too short) */
int copy_from_iovec_user(void *dst, size_t len, struct iovec *iov, int iov_nr);
int copy_to_iovec_user(struct iovec *iov, int iov_nr, const void *src, size_t len);
These consume the iov in place. You could pass struct iovec **iov and
int * if you wanted to be really efficient (otherwise you have
zero-length iov entries at the front after you've pulled things off).
This goes away:
> + if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID)
> + iov_nr = in - 1;
> + else
> + iov_nr = out - 1;
This becomes a simple assignment:
> + /* The block data buffer follows block header buffer */
> + req->iov = &vq->iov[BLK_HDR + 1];
This one actually requires iteration, since you should handle the case
where the last iov is zero length:
> + /* The block status buffer follows block data buffer */
> + req->status = vq->iov[iov_nr + 1].iov_base;
This becomes copy_to_iovec_user:
> + case VIRTIO_BLK_T_GET_ID: {
> + char id[VIRTIO_BLK_ID_BYTES];
> + int len;
> +
> + ret = snprintf(id, VIRTIO_BLK_ID_BYTES,
> + "vhost-blk%d", blk->index);
> + if (ret < 0)
> + break;
> + len = ret;
> + ret = __copy_to_user(req->iov[0].iov_base, id, len);
This becomes copy_from_iovec_user:
> + if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base,
> + sizeof(hdr)))) {
> + vq_err(vq, "Failed to get block header!\n");
> + vhost_discard_vq_desc(vq, 1);
> + break;
> + }
The rest looks OK, at a glance.
Thanks,
Rusty.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 1/1] vhost-blk: Add vhost-blk support v4
2012-11-07 23:47 ` Rusty Russell
@ 2012-11-08 7:16 ` Asias He
0 siblings, 0 replies; 4+ messages in thread
From: Asias He @ 2012-11-08 7:16 UTC (permalink / raw)
To: Rusty Russell; +Cc: virtualization, kvm, Michael S. Tsirkin
Hello Rusty,
On Thu, Nov 8, 2012 at 7:47 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> Asias He <asias@redhat.com> writes:
>> vhost-blk is an in-kernel virito-blk device accelerator.
>>
>> Due to lack of proper in-kernel AIO interface, this version converts
>> guest's I/O request to bio and use submit_bio() to submit I/O directly.
>> So this version any supports raw block device as guest's disk image,
>> e.g. /dev/sda, /dev/ram0. We can add file based image support to
>> vhost-blk once we have in-kernel AIO interface. There are some work in
>> progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:
>>
>> http://marc.info/?l=linux-fsdevel&m=133312234313122
>
> OK, this generally looks quite neat. There is one significant bug,
> however:
>
>> +/* The block header is in the first and separate buffer. */
>> +#define BLK_HDR 0
>
> You need to do a proper pull off the iovec; you can't simply assume
> this. I'm working on fixing qemu, too.
Yes. I have changed the code to handle the buffer without assumption
about the layout already.
Just haven't sent out the new version. I will send it out after the kvm forum.
Cheers.
> linux/drivers/vhost/net.c simply skips the header, you want something
> which actually copies it from userspace:
>
> /* Returns 0, -EFAULT or -EINVAL (too short) */
> int copy_from_iovec_user(void *dst, size_t len, struct iovec *iov, int iov_nr);
> int copy_to_iovec_user(struct iovec *iov, int iov_nr, const void *src, size_t len);
>
> These consume the iov in place. You could pass struct iovec **iov and
> int * if you wanted to be really efficient (otherwise you have
> zero-length iov entries at the front after you've pulled things off).
>
> This goes away:
>> + if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID)
>> + iov_nr = in - 1;
>> + else
>> + iov_nr = out - 1;
>
> This becomes a simple assignment:
>
>> + /* The block data buffer follows block header buffer */
>> + req->iov = &vq->iov[BLK_HDR + 1];
>
> This one actually requires iteration, since you should handle the case
> where the last iov is zero length:
>
>> + /* The block status buffer follows block data buffer */
>> + req->status = vq->iov[iov_nr + 1].iov_base;
>
> This becomes copy_to_iovec_user:
>
>> + case VIRTIO_BLK_T_GET_ID: {
>> + char id[VIRTIO_BLK_ID_BYTES];
>> + int len;
>> +
>> + ret = snprintf(id, VIRTIO_BLK_ID_BYTES,
>> + "vhost-blk%d", blk->index);
>> + if (ret < 0)
>> + break;
>> + len = ret;
>> + ret = __copy_to_user(req->iov[0].iov_base, id, len);
>
> This becomes copy_from_iovec_user:
>
>> + if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base,
>> + sizeof(hdr)))) {
>> + vq_err(vq, "Failed to get block header!\n");
>> + vhost_discard_vq_desc(vq, 1);
>> + break;
>> + }
>
> The rest looks OK, at a glance.
>
> Thanks,
> Rusty.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Asias He
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2012-11-08 7:16 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-10-15 9:57 [PATCH 1/1] vhost-blk: Add vhost-blk support v4 Asias He
2012-11-07 12:30 ` Michael S. Tsirkin
2012-11-07 23:47 ` Rusty Russell
2012-11-08 7:16 ` Asias He
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).