public inbox for kvm@vger.kernel.org
From: Badari Pulavarty <pbadari@us.ibm.com>
To: Christoph Hellwig <hch@infradead.org>
Cc: kvm@vger.kernel.org
Subject: Re: [RFC] vhost-blk implementation
Date: Mon, 29 Mar 2010 08:41:52 -0700
Message-ID: <1269877312.7931.93.camel@badari-desktop>
In-Reply-To: <20100324200402.GA22272@infradead.org>

Hi Christoph,

I am wondering if you can provide your thoughts here.

I modified my vhost-blk implementation to offload work to
workqueues instead of doing it synchronously. In fact, I tried
to spread the work across all the CPUs. But to my surprise,
this did not improve the performance compared to virtio-blk.

I see vhost-blk taking more interrupts and context switches
compared to virtio-blk (in ~17k vs ~11k, cs ~14k vs ~9k in the
vmstat output below). What is virtio-blk doing that I am not
able to do from vhost-blk?

Thanks,
Badari


vhost-blk:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 3  1   8920  56076  20760 5603556    0  104   196 79826 17164 13912  0  5 65 30  0
 2  4   9488  57216  20744 5605616    0  114   195 81120 17397 13824  0  5 65 30  0
 2  2  10028  68476  20728 5594764    0  108   206 80318 17162 13845  0  5 65 30  0
 0  4  10560  70856  20708 5593088    0  106   205 82363 17402 13904  0  5 65 30  0
 1  3  10948  80380  20672 5584452    0   78   178 79714 17113 13875  0  5 66 29  0

qemu virtio-blk:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  1  14124  57456   5144 4924060    0    0   139 142546 11287 9312  1  4 80 15  0
 0  2  14124  56736   5148 4927396    0    0   146 142968 11283 9248  1  4 80 15  0
 0  1  14124  56712   5384 4927020    0    0    74 150738 11182 9327  1  4 80 16  0
 1  1  14124  55496   5392 4927904    0    0     2 159902 11172 9401  1  3 79 17  0
 0  1  14124  55968   5408 4927232    0    0     0 159202 11212 9325  1  3 80 16  0

---
 drivers/vhost/blk.c |  310 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-03-25 20:06:57.484054770 -0400
@@ -0,0 +1,310 @@
+/*
+ * virtio-block server in host kernel.
+ * Inspired by vhost-net and shamelessly ripped code from it :)
+ */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+#if 0
+#define myprintk(fmt, ...) printk(pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define myprintk(fmt, ...)
+#endif
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
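+/*
+ * Per-request context carried from the vq handler to the workqueue;
+ * freed by handle_io_work() once the request has been completed.
+ */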
+struct vhost_blk_io {
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint64_t sector;
+	struct iovec *iov;
+	int nvecs;
+};
+
+static struct workqueue_struct *vblk_workqueue;
+
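+/*
+ * Runs in workqueue context.  Attach to the guest address space with
+ * use_mm() so vfs_readv/vfs_writev can copy to/from the guest buffers
+ * directly, then write the status byte and signal completion.
+ */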
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	int i, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 9;	/* virtio sector numbers are in 512-byte units */
+
+	use_mm(blk->dev.mm);
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	/* copy_to_user() returns the number of bytes it could not copy */
+	if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status)) {
+		printk(KERN_ERR "copy to user failed\n");
+		vhost_discard_vq_desc(vq);
+		unuse_mm(blk->dev.mm);
+		kfree(vbio);
+		return;
+	}
+	mutex_lock(&vq->mutex);
+	vhost_add_used_and_signal(&blk->dev, vq, vbio->head, ret);
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	kfree(vbio);
+}
+
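+/*
+ * Round-robin cursor for spreading requests across online CPUs.
+ * Updated without locking; a racy update only skews the distribution.
+ */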
+static int cpu;
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = kmalloc(sizeof(struct vhost_blk_io), GFP_KERNEL);
+	if (!vbio)
+		return -ENOMEM;
+
+	INIT_WORK(&vbio->work, handle_io_work);
+	vbio->blk = blk;
+	vbio->file = vq->private_data;
+	vbio->head = head;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->iov = iov;
+	vbio->nvecs = nvecs;
+
+	cpu = cpumask_next(cpu, cpu_online_mask);
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(cpu_online_mask);
+	queue_work_on(cpu, vblk_workqueue, &vbio->work);
+
+	return 0;
+}
+
+
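+/*
+ * Each virtio-blk request is: a virtio_blk_outhdr, the data iovecs,
+ * and a trailing status byte.  Parse the header inline and hand the
+ * data portion off to the workqueue.
+ */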
+static void handle_blk(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	unsigned head, out, in;
+	struct virtio_blk_outhdr hdr;
+	int r, nvecs;
+
+	use_mm(blk->dev.mm);
+	mutex_lock(&vq->mutex);
+
+	vhost_disable_notify(vq);
+
+	for (;;) {
+		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (head == vq->num) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
+				continue;
+			}
+			break;
+		}
+
+		BUG_ON(vq->iov[0].iov_len != sizeof(hdr));	/* virtio_blk_outhdr */
+
+		r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+		if (r) {
+			printk(KERN_ERR "copy from user failed\n");
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+
+		nvecs = out - 1;
+		if (hdr.type == VIRTIO_BLK_T_IN)
+			nvecs = in - 1;
+
+		BUG_ON(vq->iov[nvecs+1].iov_len != 1);	/* trailing status byte */
+		r = handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs);
+		if (r < 0) {
+			vhost_discard_vq_desc(vq);
+			break;
+		}
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+}
+
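+/*
+ * Flush pending vhost poll work.  Note: this does not wait for
+ * requests already handed off to vblk_workqueue.
+ */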
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+	vhost_poll_flush(n->poll);
+	vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+	struct vhost_blk *blk;
+	blk = container_of(work, struct vhost_blk, poll[0].work);
+	handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+	int r;
+	if (!n)
+		return -ENOMEM;
+	n->vqs[0].handle_kick = handle_blk_kick;
+	r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+	if (r < 0) {
+		kfree(n);
+		return r;
+	}
+
+	vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+	f->private_data = n;
+	return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *n = f->private_data;
+
+	vhost_blk_flush(n);
+	vhost_dev_cleanup(&n->dev);
+	if (n->vqs->private_data)
+		fput(n->vqs->private_data);
+	kfree(n);
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+	struct file *file, *oldfile;
+	struct vhost_virtqueue *vq;
+
+	if (index >= VHOST_BLK_VQ_MAX)
+		return -ENOBUFS;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	vq = n->vqs + index;
+	mutex_lock(&vq->mutex);
+	oldfile = vq->private_data;
+	rcu_assign_pointer(vq->private_data, file);
+	mutex_unlock(&vq->mutex);
+	/* drop the reference to any previously attached backend */
+	if (oldfile)
+		fput(oldfile);
+	return 0;
+}
+
+
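+/*
+ * This RFC reuses VHOST_NET_SET_BACKEND to attach the backing
+ * file/device; all other ioctls are forwarded to the vhost core.
+ */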
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+	struct vhost_blk *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	int r;
+
+	switch (ioctl) {
+	case VHOST_NET_SET_BACKEND:
+		r = copy_from_user(&backend, argp, sizeof backend);
+		if (r)
+			return -EFAULT;
+		return vhost_blk_set_backend(n, backend.index, backend.fd);
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		vhost_blk_flush(n);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+static const struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_blk_release,
+	.open           = vhost_blk_open,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	.minor = 234,
+	.name  = "vhost-blk",
+	.fops  = &vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+	int r = vhost_init();
+	if (r)
+		goto err_init;
+
+	vblk_workqueue = create_workqueue("vblk");
+	if (!vblk_workqueue) {
+		r = -ENOMEM;
+		goto err_vblk;
+	}
+
+	r = misc_register(&vhost_blk_misc);
+	if (r)
+		goto err_reg;
+	return 0;
+err_reg:
+	destroy_workqueue(vblk_workqueue);
+err_vblk:
+	vhost_cleanup();
+err_init:
+	return r;
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+	misc_deregister(&vhost_blk_misc);
+	destroy_workqueue(vblk_workqueue);
+	vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
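
For reference, here is a rough sketch of how a userspace launcher
might attach a backing file to this driver. It is illustrative
only: it assumes the standard vhost ioctls from <linux/vhost.h>
and the VHOST_NET_SET_BACKEND reuse above, and it omits the
VHOST_SET_MEM_TABLE / VHOST_SET_VRING_* setup that a real
launcher (e.g. qemu) must also do before the queue is usable:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(int argc, char **argv)
{
	struct vhost_vring_file backend;
	int vhost, disk;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <backing-file>\n", argv[0]);
		return 1;
	}
	vhost = open("/dev/vhost-blk", O_RDWR);	/* misc minor 234 above */
	disk = open(argv[1], O_RDWR);
	if (vhost < 0 || disk < 0) {
		perror("open");
		return 1;
	}
	/* bind the vhost device to this process's mm */
	if (ioctl(vhost, VHOST_SET_OWNER) < 0) {
		perror("VHOST_SET_OWNER");
		return 1;
	}
	backend.index = 0;	/* single vq: VHOST_BLK_VQ_MAX == 1 */
	backend.fd = disk;
	/* the RFC reuses the vhost-net ioctl to attach the backend */
	if (ioctl(vhost, VHOST_NET_SET_BACKEND, &backend) < 0) {
		perror("VHOST_NET_SET_BACKEND");
		return 1;
	}
	return 0;
}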



Thread overview: 36+ messages
2010-03-23  1:00 [RFC] vhost-blk implementation Badari Pulavarty
2010-03-23  1:16 ` Anthony Liguori
2010-03-23  1:45   ` Badari Pulavarty
2010-03-23  2:00     ` Anthony Liguori
2010-03-23  2:50       ` Badari Pulavarty
2010-03-23 10:05         ` Avi Kivity
2010-03-23 14:48           ` Badari Pulavarty
2010-03-23 10:03 ` Avi Kivity
2010-03-23 14:55   ` Badari Pulavarty
2010-03-23 16:53     ` Avi Kivity
2010-03-24 20:05   ` Christoph Hellwig
2010-03-25  6:29     ` Avi Kivity
2010-03-25 15:48       ` Christoph Hellwig
2010-03-25 15:51         ` Avi Kivity
2010-03-25 15:00     ` Asdo
2010-04-05 19:59       ` Christoph Hellwig
2010-04-07  0:36         ` [RFC] vhost-blk implementation (v2) Badari Pulavarty
2010-03-23 10:09 ` [RFC] vhost-blk implementation Eran Rom
2010-03-24 20:04 ` Christoph Hellwig
2010-03-24 20:22   ` Badari Pulavarty
2010-03-25  7:57     ` Avi Kivity
2010-03-25 14:36       ` Badari Pulavarty
2010-03-25 15:57     ` Christoph Hellwig
2010-03-26 18:53       ` Eran Rom
2010-04-08 16:17         ` Stefan Hajnoczi
2010-04-05 19:23     ` Christoph Hellwig
2010-04-05 23:17       ` Badari Pulavarty
2010-03-24 20:27   ` Badari Pulavarty
2010-03-29 15:41   ` Badari Pulavarty [this message]
2010-03-29 18:20     ` Chris Wright
2010-03-29 20:37       ` Avi Kivity
2010-03-29 22:51         ` Badari Pulavarty
2010-03-29 23:56           ` Chris Wright
2010-03-30 12:43           ` Avi Kivity
2010-04-05 14:22     ` Stefan Hajnoczi
2010-04-06  2:27       ` Badari Pulavarty
