[PATCH 1/2] [kvm/vhost]: make vhost support NUMA model.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Liu Ping Fan <kernelfans@gmail.com>
To: kvm@vger.kernel.org, netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, qemu-devel@nongnu.org,
	Avi Kivity <avi@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	Rusty Russell <rusty@rustcorp.com.au>,
	Anthony Liguori <anthony@codemonkey.ws>,
	Ryan Harper <ryanh@us.ibm.com>, Shirley Ma <xma@us.ibm.com>,
	Krishna Kumar <krkumar2@in.ibm.com>,
	Tom Lendacky <toml@us.ibm.com>
Subject: [PATCH 1/2] [kvm/vhost]: make vhost support NUMA model.
Date: Thu, 17 May 2012 17:20:53 +0800	[thread overview]
Message-ID: <1337246456-30909-2-git-send-email-kernelfans@gmail.com> (raw)
In-Reply-To: <1337246456-30909-1-git-send-email-kernelfans@gmail.com>

From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

Make vhost allocate vhost_virtqueue on different host nodes as required.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 drivers/vhost/vhost.c |  380 +++++++++++++++++++++++++++++++++++--------------
 drivers/vhost/vhost.h |   41 ++++--
 include/linux/vhost.h |    2 +-
 3 files changed, 304 insertions(+), 119 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 51e4c1e..b0d2855 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 #include <linux/kthread.h>
 #include <linux/cgroup.h>
 
@@ -37,12 +38,11 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
-static unsigned vhost_zcopy_mask __read_mostly;
 
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
-static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
 	struct vhost_poll *poll;
@@ -75,12 +75,12 @@ static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev)
+		     unsigned long mask, struct vhost_sub_dev *dev)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
-	poll->dev = dev;
+	poll->subdev = dev;
 
 	vhost_work_init(&poll->work, fn);
 }
@@ -103,7 +103,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
 	remove_wait_queue(poll->wqh, &poll->wait);
 }
 
-static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
+static bool vhost_work_seq_done(struct vhost_sub_dev *dev, struct vhost_work *work,
 				unsigned seq)
 {
 	int left;
@@ -114,19 +114,19 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
 	return left <= 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+static void vhost_work_flush(struct vhost_sub_dev *sub, struct vhost_work *work)
 {
 	unsigned seq;
 	int flushing;
 
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(&sub->work_lock);
 	seq = work->queue_seq;
 	work->flushing++;
-	spin_unlock_irq(&dev->work_lock);
-	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
-	spin_lock_irq(&dev->work_lock);
+	spin_unlock_irq(&sub->work_lock);
+	wait_event(work->done, vhost_work_seq_done(sub, work, seq));
+	spin_lock_irq(&sub->work_lock);
 	flushing = --work->flushing;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(&sub->work_lock);
 	BUG_ON(flushing < 0);
 }
 
@@ -134,26 +134,26 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	vhost_work_flush(poll->dev, &poll->work);
+	vhost_work_flush(poll->subdev, &poll->work);
 }
 
-static inline void vhost_work_queue(struct vhost_dev *dev,
+static inline void vhost_work_queue(struct vhost_sub_dev *sub,
 				    struct vhost_work *work)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->work_lock, flags);
+	spin_lock_irqsave(&sub->work_lock, flags);
 	if (list_empty(&work->node)) {
-		list_add_tail(&work->node, &dev->work_list);
+		list_add_tail(&work->node, &sub->work_list);
 		work->queue_seq++;
-		wake_up_process(dev->worker);
+		wake_up_process(sub->worker);
 	}
-	spin_unlock_irqrestore(&dev->work_lock, flags);
+	spin_unlock_irqrestore(&sub->work_lock, flags);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	vhost_work_queue(poll->dev, &poll->work);
+	vhost_work_queue(poll->subdev, &poll->work);
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -188,7 +188,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_sub_dev *sub = data;
+	struct vhost_dev *dev = sub->owner;
 	struct vhost_work *work = NULL;
 	unsigned uninitialized_var(seq);
 
@@ -198,7 +199,7 @@ static int vhost_worker(void *data)
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		spin_lock_irq(&dev->work_lock);
+		spin_lock_irq(&sub->work_lock);
 		if (work) {
 			work->done_seq = seq;
 			if (work->flushing)
@@ -206,18 +207,18 @@ static int vhost_worker(void *data)
 		}
 
 		if (kthread_should_stop()) {
-			spin_unlock_irq(&dev->work_lock);
+			spin_unlock_irq(&sub->work_lock);
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
-		if (!list_empty(&dev->work_list)) {
-			work = list_first_entry(&dev->work_list,
+		if (!list_empty(&sub->work_list)) {
+			work = list_first_entry(&sub->work_list,
 						struct vhost_work, node);
 			list_del_init(&work->node);
 			seq = work->queue_seq;
 		} else
 			work = NULL;
-		spin_unlock_irq(&dev->work_lock);
+		spin_unlock_irq(&sub->work_lock);
 
 		if (work) {
 			__set_current_state(TASK_RUNNING);
@@ -244,54 +245,189 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
 	vq->ubuf_info = NULL;
 }
 
-void vhost_enable_zcopy(int vq)
+void vhost_enable_zcopy(struct vhost_dev *dev, int rx)
 {
-	vhost_zcopy_mask |= 0x1 << vq;
+	int i;
+	if (rx == 0)
+		for (i = 0; i < dev->node_cnt; i++)
+			dev->zcopy_mask |= 0x1<<(2*i+1);
 }
 
-/* Helper to allocate iovec buffers for all vqs. */
-static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
+/* Need for vq dynamicly allocator, which is important to migrate among NUMA */
+static int vhost_vq_alloc_iovecs(struct vhost_virtqueue *vq)
 {
-	int i;
 	bool zcopy;
+	int i;
+	struct vhost_dev *dev = vq->dev;
+	int node = vq->node_id;
+	vq->indirect = kmalloc_node(sizeof *vq->indirect  *
+					   UIO_MAXIOV, GFP_KERNEL, node);
+	vq->log = kmalloc_node(sizeof *vq->log * UIO_MAXIOV,
+				  GFP_KERNEL, node);
+	vq->heads = kmalloc_node(sizeof *vq->heads *
+					UIO_MAXIOV, GFP_KERNEL, node);
+	for (i = 0; i < dev->node_cnt*2; i++) {
+		if (dev->vqs[i] == vq) {
+			zcopy = dev->zcopy_mask & (0x1 << i);
+			break;
+		}
+	}
+	if (zcopy)
+		vq->ubuf_info =
+			kmalloc_node(sizeof *vq->ubuf_info *
+				UIO_MAXIOV, GFP_KERNEL, node);
+	if (!vq->indirect || !vq->log || !vq->heads ||
+		(zcopy && !vq->ubuf_info)) {
+		kfree(vq->indirect);
+		kfree(vq->log);
+		kfree(vq->heads);
+		kfree(vq->ubuf_info);
 
-	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
-					       UIO_MAXIOV, GFP_KERNEL);
-		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
-					  GFP_KERNEL);
-		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
-					    UIO_MAXIOV, GFP_KERNEL);
-		zcopy = vhost_zcopy_mask & (0x1 << i);
-		if (zcopy)
-			dev->vqs[i].ubuf_info =
-				kmalloc(sizeof *dev->vqs[i].ubuf_info *
-					UIO_MAXIOV, GFP_KERNEL);
-		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
-			!dev->vqs[i].heads ||
-			(zcopy && !dev->vqs[i].ubuf_info))
+		return -ENOMEM;
+	} else
+		return 0;
+}
+
+/* Helper to allocate iovec buffers for all vqs. */
+static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
+{
+	int i, ret;
+	for (i = 0; i < dev->nvqs; i++) {
+		ret = vhost_vq_alloc_iovecs(dev->vqs[i]);
+		if (ret < 0) {
+			i -= 1;
 			goto err_nomem;
+		}
 	}
 	return 0;
-
 err_nomem:
 	for (; i >= 0; --i)
-		vhost_vq_free_iovecs(&dev->vqs[i]);
+		vhost_vq_free_iovecs(dev->vqs[i]);
 	return -ENOMEM;
 }
 
 static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
 	int i;
-
 	for (i = 0; i < dev->nvqs; ++i)
-		vhost_vq_free_iovecs(&dev->vqs[i]);
+		vhost_vq_free_iovecs(dev->vqs[i]);
 }
 
-long vhost_dev_init(struct vhost_dev *dev,
-		    struct vhost_virtqueue *vqs, int nvqs)
+int vhost_dev_alloc_subdevs(struct vhost_dev *dev, unsigned long *numa_map,
+	int sz)
+{
+	int i, j = 0;
+	int cur, prev = 0;
+	struct vhost_sub_dev *sub;
+	/* Todo,replace allow_map with dynamic allocated */
+	dev->allow_map = *numa_map;
+	dev->sub_devs = kmalloc(dev->node_cnt*sizeof(void *), GFP_KERNEL);
+
+	while (1) {
+		cur = find_next_bit(numa_map, sz, prev);
+		if (cur >= sz)
+			break;
+		prev = cur;
+		sub =  kmalloc_node(sizeof(struct vhost_sub_dev), GFP_KERNEL, cur);
+		if (sub == NULL)
+			goto err;
+		j++;
+		sub->node_id = cur;
+		sub->owner = dev;
+		spin_lock_init(&sub->work_lock);
+		INIT_LIST_HEAD(&sub->work_list);
+		dev->sub_devs[i] = sub;
+	}
+
+	dev->node_cnt = j;
+	return 0;
+err:
+	for (i = 0; i < j; i++) {
+		kfree(dev->sub_devs[i]);
+		dev->sub_devs[i] = NULL;
+	}
+	return -ENOMEM;
+
+}
+
+void vhost_dev_free_subdevs(struct vhost_dev *dev)
 {
 	int i;
+	for (i = 0; i < dev->node_cnt; i++)
+		kfree(dev->sub_devs[i]);
+	return;
+}
+
+static int check_numa(int *vqs_map, int sz)
+{
+	int i, node;
+
+	for (i = 0; i < sz; i++) {
+		for_each_online_node(node)
+			if (vqs_map[i] == node)
+				break;
+		if (vqs_map[i] != node)
+			return -1;
+	}
+	return 0;
+}
+
+int check_numa_bmp(unsigned long *numa_bmp, int sz)
+{
+	int i, node, cur, prev = 0;
+
+	for (i = 0; i < sz; i++) {
+		cur = find_next_bit(numa_bmp, sz, prev);
+		prev = cur;
+		if (cur >= sz)
+			return 0;
+		for_each_online_node(node)
+			if (cur == node)
+				break;
+		if (cur != node)
+			return -1;
+	}
+	return 0;
+}
+
+/* allocate vqs in node according to request map */
+int vhost_dev_alloc_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int cnt,
+	int *vqs_map, int sz, vhost_work_fn_t *handle_kick)
+{
+	int r, i, j = 0;
+	r = check_numa(vqs_map, sz);
+	if (r < 0)
+		return -EINVAL;
+	for (i = 0; i < cnt ; i++) {
+		vqs[i] = kmalloc_node(sizeof(struct vhost_virtqueue),
+			GFP_KERNEL, vqs_map[i]);
+		if (vqs[i] == NULL)
+			goto err;
+		vqs[i]->handle_kick = handle_kick[i];
+		j = i;
+	}
+	return 0;
+err:
+	for (i = 0; i < j; i++)
+		kfree(vqs[i]);
+	return -ENOMEM;
+
+}
+
+void vhost_dev_free_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
+	int cnt)
+{
+	int i;
+	for (i = 0; i < cnt ; i++)
+		kfree(vqs[i]);
+	return;
+}
+
+long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs)
+{
+	int i, j, ret = 0;
+	struct vhost_sub_dev *subdev;
+	struct vhost_virtqueue *vq;
 
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -300,24 +436,32 @@ long vhost_dev_init(struct vhost_dev *dev,
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
-	spin_lock_init(&dev->work_lock);
-	INIT_LIST_HEAD(&dev->work_list);
-	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].log = NULL;
-		dev->vqs[i].indirect = NULL;
-		dev->vqs[i].heads = NULL;
-		dev->vqs[i].ubuf_info = NULL;
-		dev->vqs[i].dev = dev;
-		mutex_init(&dev->vqs[i].mutex);
-		vhost_vq_reset(dev, dev->vqs + i);
-		if (dev->vqs[i].handle_kick)
-			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick, POLLIN, dev);
-	}
+		vq = dev->vqs[i];
+		/* for each numa node, in-vq/out-vq */
+		vq->log = NULL;
+		vq->indirect = NULL;
+		vq->heads = NULL;
+		vq->ubuf_info = NULL;
+		vq->dev = dev;
+		mutex_init(&vq->mutex);
+		vhost_vq_reset(dev, vq);
+
+		if (vq->handle_kick) {
+			for (j = 0; j < i; j++) {
+				subdev =  dev->sub_devs[j];
+				if (vq->node_id == subdev->node_id)
+					vhost_poll_init(&vq->poll, vq->handle_kick, POLLIN, subdev);
+				else {
+					vhost_poll_init(&vq->poll, vq->handle_kick, POLLIN, dev->sub_devs[0]);
+					ret = 1;
+				}
+			}
+		}
 
-	return 0;
+	}
+	return ret;
 }
 
 /* Caller should have device mutex */
@@ -344,19 +488,26 @@ static void vhost_attach_cgroups_work(struct vhost_work *work)
 static int vhost_attach_cgroups(struct vhost_dev *dev)
 {
 	struct vhost_attach_cgroups_struct attach;
-
+	int i, ret = 0;
+	struct vhost_sub_dev *sub;
 	attach.owner = current;
-	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_flush(dev, &attach.work);
-	return attach.ret;
+	for (i = 0; i < dev->node_cnt; i++) {
+		sub = dev->sub_devs[i];
+		vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+		vhost_work_queue(sub, &attach.work);
+		vhost_work_flush(sub, &attach.work);
+		ret |= attach.ret;
+	}
+	return ret;
 }
 
 /* Caller should have device mutex */
 static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
 	struct task_struct *worker;
-	int err;
+	int err, i, j, cur, prev = 0;
+	int sz = sizeof(unsigned long);
+	const struct cpumask *mask;
 
 	/* Is there an owner already? */
 	if (dev->mm) {
@@ -366,14 +517,19 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
-	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
-	if (IS_ERR(worker)) {
-		err = PTR_ERR(worker);
-		goto err_worker;
+
+	for (i = 0, j = 0; i < dev->node_cnt; i++, j++) {
+		cur = find_next_bit(&dev->allow_map, sz, prev);
+		dev->sub_devs[i]->worker = kthread_create_on_node(vhost_worker,
+			dev->sub_devs[i], cur, "vhost-%d-node-%d", current->pid, cur);
+		if (dev->sub_devs[i]->worker == NULL)
+			goto err_cgroup;
+		mask = cpumask_of_node(cur);
+		do_set_cpus_allowed(worker, mask);
 	}
 
-	dev->worker = worker;
-	wake_up_process(worker);	/* avoid contributing to loadavg */
+	for (i = 0; i < dev->node_cnt; i++)
+		wake_up_process(dev->sub_devs[i]->worker);
 
 	err = vhost_attach_cgroups(dev);
 	if (err)
@@ -385,9 +541,12 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	return 0;
 err_cgroup:
-	kthread_stop(worker);
-	dev->worker = NULL;
-err_worker:
+
+	for (i = 0; i < j; i++) {
+		kthread_stop(dev->sub_devs[i]->worker);
+		dev->sub_devs[i]->worker = NULL;
+	}
+
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -442,28 +601,28 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 	int i;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
-			vhost_poll_stop(&dev->vqs[i].poll);
-			vhost_poll_flush(&dev->vqs[i].poll);
+		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
+			vhost_poll_stop(&dev->vqs[i]->poll);
+			vhost_poll_flush(&dev->vqs[i]->poll);
 		}
 		/* Wait for all lower device DMAs done. */
-		if (dev->vqs[i].ubufs)
-			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+		if (dev->vqs[i]->ubufs)
+			vhost_ubuf_put_and_wait(dev->vqs[i]->ubufs);
 
 		/* Signal guest as appropriate. */
-		vhost_zerocopy_signal_used(&dev->vqs[i]);
-
-		if (dev->vqs[i].error_ctx)
-			eventfd_ctx_put(dev->vqs[i].error_ctx);
-		if (dev->vqs[i].error)
-			fput(dev->vqs[i].error);
-		if (dev->vqs[i].kick)
-			fput(dev->vqs[i].kick);
-		if (dev->vqs[i].call_ctx)
-			eventfd_ctx_put(dev->vqs[i].call_ctx);
-		if (dev->vqs[i].call)
-			fput(dev->vqs[i].call);
-		vhost_vq_reset(dev, dev->vqs + i);
+		vhost_zerocopy_signal_used(dev->vqs[i]);
+
+		if (dev->vqs[i]->error_ctx)
+			eventfd_ctx_put(dev->vqs[i]->error_ctx);
+		if (dev->vqs[i]->error)
+			fput(dev->vqs[i]->error);
+		if (dev->vqs[i]->kick)
+			fput(dev->vqs[i]->kick);
+		if (dev->vqs[i]->call_ctx)
+			eventfd_ctx_put(dev->vqs[i]->call_ctx);
+		if (dev->vqs[i]->call)
+			fput(dev->vqs[i]->call);
+		vhost_vq_reset(dev, dev->vqs[i]);
 	}
 	vhost_dev_free_iovecs(dev);
 	if (dev->log_ctx)
@@ -477,11 +636,15 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 					locked ==
 						lockdep_is_held(&dev->mutex)));
 	RCU_INIT_POINTER(dev->memory, NULL);
+
+	/* fixme,It will be considered and fixed in next verion */
 	WARN_ON(!list_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
+	/* end*/
+
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -534,14 +697,14 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 
 	for (i = 0; i < d->nvqs; ++i) {
 		int ok;
-		mutex_lock(&d->vqs[i].mutex);
+		mutex_lock(&d->vqs[i]->mutex);
 		/* If ring is inactive, will check when it's enabled. */
-		if (d->vqs[i].private_data)
-			ok = vq_memory_access_ok(d->vqs[i].log_base, mem,
+		if (d->vqs[i]->private_data)
+			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem,
 						 log_all);
 		else
 			ok = 1;
-		mutex_unlock(&d->vqs[i].mutex);
+		mutex_unlock(&d->vqs[i]->mutex);
 		if (!ok)
 			return 0;
 	}
@@ -650,8 +813,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		return r;
 	if (idx >= d->nvqs)
 		return -ENOBUFS;
-
-	vq = d->vqs + idx;
+	vq = d->vqs[idx];
 
 	mutex_lock(&vq->mutex);
 
@@ -750,6 +912,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		vq->log_addr = a.log_guest_addr;
 		vq->used = (void __user *)(unsigned long)a.used_user_addr;
 		break;
+
 	case VHOST_SET_VRING_KICK:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -766,6 +929,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		} else
 			filep = eventfp;
 		break;
+
 	case VHOST_SET_VRING_CALL:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -863,7 +1027,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 		for (i = 0; i < d->nvqs; ++i) {
 			struct vhost_virtqueue *vq;
 			void __user *base = (void __user *)(unsigned long)p;
-			vq = d->vqs + i;
+			vq = d->vqs[i];
 			mutex_lock(&vq->mutex);
 			/* If ring is inactive, will check when it's enabled. */
 			if (vq->private_data && !vq_log_access_ok(d, vq, base))
@@ -890,9 +1054,9 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 		} else
 			filep = eventfp;
 		for (i = 0; i < d->nvqs; ++i) {
-			mutex_lock(&d->vqs[i].mutex);
-			d->vqs[i].log_ctx = d->log_ctx;
-			mutex_unlock(&d->vqs[i].mutex);
+			mutex_lock(&d->vqs[i]->mutex);
+			d->vqs[i]->log_ctx = d->log_ctx;
+			mutex_unlock(&d->vqs[i]->mutex);
 		}
 		if (ctx)
 			eventfd_ctx_put(ctx);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8de1fd5..12d4237 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,12 +13,13 @@
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
 
+#define VHOST_NUMA
 /* This is for zerocopy, used buffer len is set to 1 when lower device DMA
  * done */
 #define VHOST_DMA_DONE_LEN	1
 #define VHOST_DMA_CLEAR_LEN	0
 
-struct vhost_device;
+struct vhost_dev;
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -32,6 +33,8 @@ struct vhost_work {
 	unsigned		  done_seq;
 };
 
+struct vhost_sub_dev;
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -40,11 +43,13 @@ struct vhost_poll {
 	wait_queue_t              wait;
 	struct vhost_work	  work;
 	unsigned long		  mask;
-	struct vhost_dev	 *dev;
+	struct vhost_sub_dev *subdev;
 };
 
+void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+			    poll_table *pt);
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev);
+		     unsigned long mask, struct vhost_sub_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -70,7 +75,7 @@ void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
-
+	int node_id;
 	/* The actual ring of buffers. */
 	struct mutex mutex;
 	unsigned int num;
@@ -143,6 +148,14 @@ struct vhost_virtqueue {
 	struct vhost_ubuf_ref *ubufs;
 };
 
+struct vhost_sub_dev {
+	struct vhost_dev *owner;
+	int node_id;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
+};
+
 struct vhost_dev {
 	/* Readers use RCU to access memory table pointer
 	 * log base pointer and features.
@@ -151,16 +164,24 @@ struct vhost_dev {
 	struct mm_struct *mm;
 	struct mutex mutex;
 	unsigned acked_features;
-	struct vhost_virtqueue *vqs;
+	struct vhost_virtqueue **vqs;
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
-	spinlock_t work_lock;
-	struct list_head work_list;
-	struct task_struct *worker;
+	/* todo, change it to bitmap */
+	unsigned long allow_map;
+	unsigned long node_cnt;
+	unsigned long zcopy_mask;
+	struct vhost_sub_dev **sub_devs;
 };
 
-long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+int check_numa_bmp(unsigned long *numa_bmp, int sz);
+int vhost_dev_alloc_subdevs(struct vhost_dev *dev, unsigned long *numa_map,
+	int sz);
+void vhost_dev_free_subdevs(struct vhost_dev *dev);
+int vhost_dev_alloc_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
+	int cnt, int *vqs_map, int sz, vhost_work_fn_t *handle_kick);
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);
 void vhost_dev_cleanup(struct vhost_dev *, bool locked);
@@ -216,6 +237,6 @@ static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
 	return acked_features & (1 << bit);
 }
 
-void vhost_enable_zcopy(int vq);
+void vhost_enable_zcopy(struct vhost_dev *dev, int rx);
 
 #endif
diff --git a/include/linux/vhost.h b/include/linux/vhost.h
index e847f1e..d8c76f1 100644
--- a/include/linux/vhost.h
+++ b/include/linux/vhost.h
@@ -120,7 +120,7 @@ struct vhost_memory {
  * used for transmit.  Pass fd -1 to unbind from the socket and the transmit
  * device.  This can be used to stop the ring (e.g. for migration). */
 #define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
-
+#define VHOST_NET_SET_NUMA  _IOW(VHOST_VIRTIO, 0x31, unsigned long)
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
-- 
1.7.4.4

WARNING: multiple messages have this Message-ID (diff)

From: Liu Ping Fan <kernelfans@gmail.com>
To: kvm@vger.kernel.org, netdev@vger.kernel.org
Cc: Krishna Kumar <krkumar2@in.ibm.com>, Shirley Ma <xma@us.ibm.com>,
	Tom Lendacky <toml@us.ibm.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	qemu-devel@nongnu.org, Rusty Russell <rusty@rustcorp.com.au>,
	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	linux-kernel@vger.kernel.org, Ryan Harper <ryanh@us.ibm.com>,
	Avi Kivity <avi@redhat.com>,
	Anthony Liguori <anthony@codemonkey.ws>
Subject: [Qemu-devel] [PATCH 1/2] [kvm/vhost]: make vhost support NUMA model.
Date: Thu, 17 May 2012 17:20:53 +0800	[thread overview]
Message-ID: <1337246456-30909-2-git-send-email-kernelfans@gmail.com> (raw)
In-Reply-To: <1337246456-30909-1-git-send-email-kernelfans@gmail.com>

From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

Make vhost allocate vhost_virtqueue on different host nodes as required.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 drivers/vhost/vhost.c |  380 +++++++++++++++++++++++++++++++++++--------------
 drivers/vhost/vhost.h |   41 ++++--
 include/linux/vhost.h |    2 +-
 3 files changed, 304 insertions(+), 119 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 51e4c1e..b0d2855 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 #include <linux/kthread.h>
 #include <linux/cgroup.h>
 
@@ -37,12 +38,11 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
-static unsigned vhost_zcopy_mask __read_mostly;
 
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
-static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
 	struct vhost_poll *poll;
@@ -75,12 +75,12 @@ static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev)
+		     unsigned long mask, struct vhost_sub_dev *dev)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
-	poll->dev = dev;
+	poll->subdev = dev;
 
 	vhost_work_init(&poll->work, fn);
 }
@@ -103,7 +103,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
 	remove_wait_queue(poll->wqh, &poll->wait);
 }
 
-static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
+static bool vhost_work_seq_done(struct vhost_sub_dev *dev, struct vhost_work *work,
 				unsigned seq)
 {
 	int left;
@@ -114,19 +114,19 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
 	return left <= 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+static void vhost_work_flush(struct vhost_sub_dev *sub, struct vhost_work *work)
 {
 	unsigned seq;
 	int flushing;
 
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(&sub->work_lock);
 	seq = work->queue_seq;
 	work->flushing++;
-	spin_unlock_irq(&dev->work_lock);
-	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
-	spin_lock_irq(&dev->work_lock);
+	spin_unlock_irq(&sub->work_lock);
+	wait_event(work->done, vhost_work_seq_done(sub, work, seq));
+	spin_lock_irq(&sub->work_lock);
 	flushing = --work->flushing;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(&sub->work_lock);
 	BUG_ON(flushing < 0);
 }
 
@@ -134,26 +134,26 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	vhost_work_flush(poll->dev, &poll->work);
+	vhost_work_flush(poll->subdev, &poll->work);
 }
 
-static inline void vhost_work_queue(struct vhost_dev *dev,
+static inline void vhost_work_queue(struct vhost_sub_dev *sub,
 				    struct vhost_work *work)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->work_lock, flags);
+	spin_lock_irqsave(&sub->work_lock, flags);
 	if (list_empty(&work->node)) {
-		list_add_tail(&work->node, &dev->work_list);
+		list_add_tail(&work->node, &sub->work_list);
 		work->queue_seq++;
-		wake_up_process(dev->worker);
+		wake_up_process(sub->worker);
 	}
-	spin_unlock_irqrestore(&dev->work_lock, flags);
+	spin_unlock_irqrestore(&sub->work_lock, flags);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	vhost_work_queue(poll->dev, &poll->work);
+	vhost_work_queue(poll->subdev, &poll->work);
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -188,7 +188,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_sub_dev *sub = data;
+	struct vhost_dev *dev = sub->owner;
 	struct vhost_work *work = NULL;
 	unsigned uninitialized_var(seq);
 
@@ -198,7 +199,7 @@ static int vhost_worker(void *data)
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		spin_lock_irq(&dev->work_lock);
+		spin_lock_irq(&sub->work_lock);
 		if (work) {
 			work->done_seq = seq;
 			if (work->flushing)
@@ -206,18 +207,18 @@ static int vhost_worker(void *data)
 		}
 
 		if (kthread_should_stop()) {
-			spin_unlock_irq(&dev->work_lock);
+			spin_unlock_irq(&sub->work_lock);
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
-		if (!list_empty(&dev->work_list)) {
-			work = list_first_entry(&dev->work_list,
+		if (!list_empty(&sub->work_list)) {
+			work = list_first_entry(&sub->work_list,
 						struct vhost_work, node);
 			list_del_init(&work->node);
 			seq = work->queue_seq;
 		} else
 			work = NULL;
-		spin_unlock_irq(&dev->work_lock);
+		spin_unlock_irq(&sub->work_lock);
 
 		if (work) {
 			__set_current_state(TASK_RUNNING);
@@ -244,54 +245,189 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
 	vq->ubuf_info = NULL;
 }
 
-void vhost_enable_zcopy(int vq)
+void vhost_enable_zcopy(struct vhost_dev *dev, int rx)
 {
-	vhost_zcopy_mask |= 0x1 << vq;
+	int i;
+	if (rx == 0)
+		for (i = 0; i < dev->node_cnt; i++)
+			dev->zcopy_mask |= 0x1<<(2*i+1);
 }
 
-/* Helper to allocate iovec buffers for all vqs. */
-static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
+/* Need for vq dynamicly allocator, which is important to migrate among NUMA */
+static int vhost_vq_alloc_iovecs(struct vhost_virtqueue *vq)
 {
-	int i;
 	bool zcopy;
+	int i;
+	struct vhost_dev *dev = vq->dev;
+	int node = vq->node_id;
+	vq->indirect = kmalloc_node(sizeof *vq->indirect  *
+					   UIO_MAXIOV, GFP_KERNEL, node);
+	vq->log = kmalloc_node(sizeof *vq->log * UIO_MAXIOV,
+				  GFP_KERNEL, node);
+	vq->heads = kmalloc_node(sizeof *vq->heads *
+					UIO_MAXIOV, GFP_KERNEL, node);
+	for (i = 0; i < dev->node_cnt*2; i++) {
+		if (dev->vqs[i] == vq) {
+			zcopy = dev->zcopy_mask & (0x1 << i);
+			break;
+		}
+	}
+	if (zcopy)
+		vq->ubuf_info =
+			kmalloc_node(sizeof *vq->ubuf_info *
+				UIO_MAXIOV, GFP_KERNEL, node);
+	if (!vq->indirect || !vq->log || !vq->heads ||
+		(zcopy && !vq->ubuf_info)) {
+		kfree(vq->indirect);
+		kfree(vq->log);
+		kfree(vq->heads);
+		kfree(vq->ubuf_info);
 
-	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
-					       UIO_MAXIOV, GFP_KERNEL);
-		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
-					  GFP_KERNEL);
-		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
-					    UIO_MAXIOV, GFP_KERNEL);
-		zcopy = vhost_zcopy_mask & (0x1 << i);
-		if (zcopy)
-			dev->vqs[i].ubuf_info =
-				kmalloc(sizeof *dev->vqs[i].ubuf_info *
-					UIO_MAXIOV, GFP_KERNEL);
-		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
-			!dev->vqs[i].heads ||
-			(zcopy && !dev->vqs[i].ubuf_info))
+		return -ENOMEM;
+	} else
+		return 0;
+}
+
+/* Helper to allocate iovec buffers for all vqs. */
+static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
+{
+	int i, ret;
+	for (i = 0; i < dev->nvqs; i++) {
+		ret = vhost_vq_alloc_iovecs(dev->vqs[i]);
+		if (ret < 0) {
+			i -= 1;
 			goto err_nomem;
+		}
 	}
 	return 0;
-
 err_nomem:
 	for (; i >= 0; --i)
-		vhost_vq_free_iovecs(&dev->vqs[i]);
+		vhost_vq_free_iovecs(dev->vqs[i]);
 	return -ENOMEM;
 }
 
 static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
 	int i;
-
 	for (i = 0; i < dev->nvqs; ++i)
-		vhost_vq_free_iovecs(&dev->vqs[i]);
+		vhost_vq_free_iovecs(dev->vqs[i]);
 }
 
-long vhost_dev_init(struct vhost_dev *dev,
-		    struct vhost_virtqueue *vqs, int nvqs)
+int vhost_dev_alloc_subdevs(struct vhost_dev *dev, unsigned long *numa_map,
+	int sz)
+{
+	int i, j = 0;
+	int cur, prev = 0;
+	struct vhost_sub_dev *sub;
+	/* Todo,replace allow_map with dynamic allocated */
+	dev->allow_map = *numa_map;
+	dev->sub_devs = kmalloc(dev->node_cnt*sizeof(void *), GFP_KERNEL);
+
+	while (1) {
+		cur = find_next_bit(numa_map, sz, prev);
+		if (cur >= sz)
+			break;
+		prev = cur;
+		sub =  kmalloc_node(sizeof(struct vhost_sub_dev), GFP_KERNEL, cur);
+		if (sub == NULL)
+			goto err;
+		j++;
+		sub->node_id = cur;
+		sub->owner = dev;
+		spin_lock_init(&sub->work_lock);
+		INIT_LIST_HEAD(&sub->work_list);
+		dev->sub_devs[i] = sub;
+	}
+
+	dev->node_cnt = j;
+	return 0;
+err:
+	for (i = 0; i < j; i++) {
+		kfree(dev->sub_devs[i]);
+		dev->sub_devs[i] = NULL;
+	}
+	return -ENOMEM;
+
+}
+
+void vhost_dev_free_subdevs(struct vhost_dev *dev)
 {
 	int i;
+	for (i = 0; i < dev->node_cnt; i++)
+		kfree(dev->sub_devs[i]);
+	return;
+}
+
+static int check_numa(int *vqs_map, int sz)
+{
+	int i, node;
+
+	for (i = 0; i < sz; i++) {
+		for_each_online_node(node)
+			if (vqs_map[i] == node)
+				break;
+		if (vqs_map[i] != node)
+			return -1;
+	}
+	return 0;
+}
+
+int check_numa_bmp(unsigned long *numa_bmp, int sz)
+{
+	int i, node, cur, prev = 0;
+
+	for (i = 0; i < sz; i++) {
+		cur = find_next_bit(numa_bmp, sz, prev);
+		prev = cur;
+		if (cur >= sz)
+			return 0;
+		for_each_online_node(node)
+			if (cur == node)
+				break;
+		if (cur != node)
+			return -1;
+	}
+	return 0;
+}
+
+/* allocate vqs in node according to request map */
+int vhost_dev_alloc_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int cnt,
+	int *vqs_map, int sz, vhost_work_fn_t *handle_kick)
+{
+	int r, i, j = 0;
+	r = check_numa(vqs_map, sz);
+	if (r < 0)
+		return -EINVAL;
+	for (i = 0; i < cnt ; i++) {
+		vqs[i] = kmalloc_node(sizeof(struct vhost_virtqueue),
+			GFP_KERNEL, vqs_map[i]);
+		if (vqs[i] == NULL)
+			goto err;
+		vqs[i]->handle_kick = handle_kick[i];
+		j = i;
+	}
+	return 0;
+err:
+	for (i = 0; i < j; i++)
+		kfree(vqs[i]);
+	return -ENOMEM;
+
+}
+
+void vhost_dev_free_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
+	int cnt)
+{
+	int i;
+	for (i = 0; i < cnt ; i++)
+		kfree(vqs[i]);
+	return;
+}
+
+long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs)
+{
+	int i, j, ret = 0;
+	struct vhost_sub_dev *subdev;
+	struct vhost_virtqueue *vq;
 
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -300,24 +436,32 @@ long vhost_dev_init(struct vhost_dev *dev,
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
-	spin_lock_init(&dev->work_lock);
-	INIT_LIST_HEAD(&dev->work_list);
-	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].log = NULL;
-		dev->vqs[i].indirect = NULL;
-		dev->vqs[i].heads = NULL;
-		dev->vqs[i].ubuf_info = NULL;
-		dev->vqs[i].dev = dev;
-		mutex_init(&dev->vqs[i].mutex);
-		vhost_vq_reset(dev, dev->vqs + i);
-		if (dev->vqs[i].handle_kick)
-			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick, POLLIN, dev);
-	}
+		vq = dev->vqs[i];
+		/* for each numa node, in-vq/out-vq */
+		vq->log = NULL;
+		vq->indirect = NULL;
+		vq->heads = NULL;
+		vq->ubuf_info = NULL;
+		vq->dev = dev;
+		mutex_init(&vq->mutex);
+		vhost_vq_reset(dev, vq);
+
+		if (vq->handle_kick) {
+			for (j = 0; j < i; j++) {
+				subdev =  dev->sub_devs[j];
+				if (vq->node_id == subdev->node_id)
+					vhost_poll_init(&vq->poll, vq->handle_kick, POLLIN, subdev);
+				else {
+					vhost_poll_init(&vq->poll, vq->handle_kick, POLLIN, dev->sub_devs[0]);
+					ret = 1;
+				}
+			}
+		}
 
-	return 0;
+	}
+	return ret;
 }
 
 /* Caller should have device mutex */
@@ -344,19 +488,26 @@ static void vhost_attach_cgroups_work(struct vhost_work *work)
 static int vhost_attach_cgroups(struct vhost_dev *dev)
 {
 	struct vhost_attach_cgroups_struct attach;
-
+	int i, ret = 0;
+	struct vhost_sub_dev *sub;
 	attach.owner = current;
-	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_flush(dev, &attach.work);
-	return attach.ret;
+	for (i = 0; i < dev->node_cnt; i++) {
+		sub = dev->sub_devs[i];
+		vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+		vhost_work_queue(sub, &attach.work);
+		vhost_work_flush(sub, &attach.work);
+		ret |= attach.ret;
+	}
+	return ret;
 }
 
 /* Caller should have device mutex */
 static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
 	struct task_struct *worker;
-	int err;
+	int err, i, j, cur, prev = 0;
+	int sz = sizeof(unsigned long);
+	const struct cpumask *mask;
 
 	/* Is there an owner already? */
 	if (dev->mm) {
@@ -366,14 +517,19 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
-	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
-	if (IS_ERR(worker)) {
-		err = PTR_ERR(worker);
-		goto err_worker;
+
+	for (i = 0, j = 0; i < dev->node_cnt; i++, j++) {
+		cur = find_next_bit(&dev->allow_map, sz, prev);
+		dev->sub_devs[i]->worker = kthread_create_on_node(vhost_worker,
+			dev->sub_devs[i], cur, "vhost-%d-node-%d", current->pid, cur);
+		if (dev->sub_devs[i]->worker == NULL)
+			goto err_cgroup;
+		mask = cpumask_of_node(cur);
+		do_set_cpus_allowed(worker, mask);
 	}
 
-	dev->worker = worker;
-	wake_up_process(worker);	/* avoid contributing to loadavg */
+	for (i = 0; i < dev->node_cnt; i++)
+		wake_up_process(dev->sub_devs[i]->worker);
 
 	err = vhost_attach_cgroups(dev);
 	if (err)
@@ -385,9 +541,12 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	return 0;
 err_cgroup:
-	kthread_stop(worker);
-	dev->worker = NULL;
-err_worker:
+
+	for (i = 0; i < j; i++) {
+		kthread_stop(dev->sub_devs[i]->worker);
+		dev->sub_devs[i]->worker = NULL;
+	}
+
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -442,28 +601,28 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 	int i;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
-			vhost_poll_stop(&dev->vqs[i].poll);
-			vhost_poll_flush(&dev->vqs[i].poll);
+		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
+			vhost_poll_stop(&dev->vqs[i]->poll);
+			vhost_poll_flush(&dev->vqs[i]->poll);
 		}
 		/* Wait for all lower device DMAs done. */
-		if (dev->vqs[i].ubufs)
-			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+		if (dev->vqs[i]->ubufs)
+			vhost_ubuf_put_and_wait(dev->vqs[i]->ubufs);
 
 		/* Signal guest as appropriate. */
-		vhost_zerocopy_signal_used(&dev->vqs[i]);
-
-		if (dev->vqs[i].error_ctx)
-			eventfd_ctx_put(dev->vqs[i].error_ctx);
-		if (dev->vqs[i].error)
-			fput(dev->vqs[i].error);
-		if (dev->vqs[i].kick)
-			fput(dev->vqs[i].kick);
-		if (dev->vqs[i].call_ctx)
-			eventfd_ctx_put(dev->vqs[i].call_ctx);
-		if (dev->vqs[i].call)
-			fput(dev->vqs[i].call);
-		vhost_vq_reset(dev, dev->vqs + i);
+		vhost_zerocopy_signal_used(dev->vqs[i]);
+
+		if (dev->vqs[i]->error_ctx)
+			eventfd_ctx_put(dev->vqs[i]->error_ctx);
+		if (dev->vqs[i]->error)
+			fput(dev->vqs[i]->error);
+		if (dev->vqs[i]->kick)
+			fput(dev->vqs[i]->kick);
+		if (dev->vqs[i]->call_ctx)
+			eventfd_ctx_put(dev->vqs[i]->call_ctx);
+		if (dev->vqs[i]->call)
+			fput(dev->vqs[i]->call);
+		vhost_vq_reset(dev, dev->vqs[i]);
 	}
 	vhost_dev_free_iovecs(dev);
 	if (dev->log_ctx)
@@ -477,11 +636,15 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 					locked ==
 						lockdep_is_held(&dev->mutex)));
 	RCU_INIT_POINTER(dev->memory, NULL);
+
+	/* fixme,It will be considered and fixed in next verion */
 	WARN_ON(!list_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
+	/* end*/
+
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -534,14 +697,14 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 
 	for (i = 0; i < d->nvqs; ++i) {
 		int ok;
-		mutex_lock(&d->vqs[i].mutex);
+		mutex_lock(&d->vqs[i]->mutex);
 		/* If ring is inactive, will check when it's enabled. */
-		if (d->vqs[i].private_data)
-			ok = vq_memory_access_ok(d->vqs[i].log_base, mem,
+		if (d->vqs[i]->private_data)
+			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem,
 						 log_all);
 		else
 			ok = 1;
-		mutex_unlock(&d->vqs[i].mutex);
+		mutex_unlock(&d->vqs[i]->mutex);
 		if (!ok)
 			return 0;
 	}
@@ -650,8 +813,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		return r;
 	if (idx >= d->nvqs)
 		return -ENOBUFS;
-
-	vq = d->vqs + idx;
+	vq = d->vqs[idx];
 
 	mutex_lock(&vq->mutex);
 
@@ -750,6 +912,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		vq->log_addr = a.log_guest_addr;
 		vq->used = (void __user *)(unsigned long)a.used_user_addr;
 		break;
+
 	case VHOST_SET_VRING_KICK:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -766,6 +929,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		} else
 			filep = eventfp;
 		break;
+
 	case VHOST_SET_VRING_CALL:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -863,7 +1027,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 		for (i = 0; i < d->nvqs; ++i) {
 			struct vhost_virtqueue *vq;
 			void __user *base = (void __user *)(unsigned long)p;
-			vq = d->vqs + i;
+			vq = d->vqs[i];
 			mutex_lock(&vq->mutex);
 			/* If ring is inactive, will check when it's enabled. */
 			if (vq->private_data && !vq_log_access_ok(d, vq, base))
@@ -890,9 +1054,9 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 		} else
 			filep = eventfp;
 		for (i = 0; i < d->nvqs; ++i) {
-			mutex_lock(&d->vqs[i].mutex);
-			d->vqs[i].log_ctx = d->log_ctx;
-			mutex_unlock(&d->vqs[i].mutex);
+			mutex_lock(&d->vqs[i]->mutex);
+			d->vqs[i]->log_ctx = d->log_ctx;
+			mutex_unlock(&d->vqs[i]->mutex);
 		}
 		if (ctx)
 			eventfd_ctx_put(ctx);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8de1fd5..12d4237 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,12 +13,13 @@
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
 
+#define VHOST_NUMA
 /* This is for zerocopy, used buffer len is set to 1 when lower device DMA
  * done */
 #define VHOST_DMA_DONE_LEN	1
 #define VHOST_DMA_CLEAR_LEN	0
 
-struct vhost_device;
+struct vhost_dev;
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -32,6 +33,8 @@ struct vhost_work {
 	unsigned		  done_seq;
 };
 
+struct vhost_sub_dev;
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -40,11 +43,13 @@ struct vhost_poll {
 	wait_queue_t              wait;
 	struct vhost_work	  work;
 	unsigned long		  mask;
-	struct vhost_dev	 *dev;
+	struct vhost_sub_dev *subdev;
 };
 
+void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+			    poll_table *pt);
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev);
+		     unsigned long mask, struct vhost_sub_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -70,7 +75,7 @@ void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
-
+	int node_id;
 	/* The actual ring of buffers. */
 	struct mutex mutex;
 	unsigned int num;
@@ -143,6 +148,14 @@ struct vhost_virtqueue {
 	struct vhost_ubuf_ref *ubufs;
 };
 
+struct vhost_sub_dev {
+	struct vhost_dev *owner;
+	int node_id;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
+};
+
 struct vhost_dev {
 	/* Readers use RCU to access memory table pointer
 	 * log base pointer and features.
@@ -151,16 +164,24 @@ struct vhost_dev {
 	struct mm_struct *mm;
 	struct mutex mutex;
 	unsigned acked_features;
-	struct vhost_virtqueue *vqs;
+	struct vhost_virtqueue **vqs;
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
-	spinlock_t work_lock;
-	struct list_head work_list;
-	struct task_struct *worker;
+	/* todo, change it to bitmap */
+	unsigned long allow_map;
+	unsigned long node_cnt;
+	unsigned long zcopy_mask;
+	struct vhost_sub_dev **sub_devs;
 };
 
-long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+int check_numa_bmp(unsigned long *numa_bmp, int sz);
+int vhost_dev_alloc_subdevs(struct vhost_dev *dev, unsigned long *numa_map,
+	int sz);
+void vhost_dev_free_subdevs(struct vhost_dev *dev);
+int vhost_dev_alloc_vqs(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
+	int cnt, int *vqs_map, int sz, vhost_work_fn_t *handle_kick);
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);
 void vhost_dev_cleanup(struct vhost_dev *, bool locked);
@@ -216,6 +237,6 @@ static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
 	return acked_features & (1 << bit);
 }
 
-void vhost_enable_zcopy(int vq);
+void vhost_enable_zcopy(struct vhost_dev *dev, int rx);
 
 #endif
diff --git a/include/linux/vhost.h b/include/linux/vhost.h
index e847f1e..d8c76f1 100644
--- a/include/linux/vhost.h
+++ b/include/linux/vhost.h
@@ -120,7 +120,7 @@ struct vhost_memory {
  * used for transmit.  Pass fd -1 to unbind from the socket and the transmit
  * device.  This can be used to stop the ring (e.g. for migration). */
 #define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
-
+#define VHOST_NET_SET_NUMA  _IOW(VHOST_VIRTIO, 0x31, unsigned long)
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
-- 
1.7.4.4

next prev parent reply	other threads:[~2012-05-17  9:20 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-17  9:20 [RFC:kvm] export host NUMA info to guest & make emulated device NUMA attr Liu Ping Fan
2012-05-17  9:20 ` [Qemu-devel] " Liu Ping Fan
2012-05-17  9:20 ` Liu Ping Fan
2012-05-17  9:20 ` Liu Ping Fan [this message]
2012-05-17  9:20   ` [Qemu-devel] [PATCH 1/2] [kvm/vhost]: make vhost support NUMA model Liu Ping Fan
2012-05-17  9:20 ` [PATCH 2/2] [kvm/vhost-net]: make vhost net own NUMA attribute Liu Ping Fan
2012-05-17  9:20   ` [Qemu-devel] " Liu Ping Fan
2012-05-17  9:20   ` Liu Ping Fan
2012-05-17  9:20 ` [PATCH 1/2] [kvm/virtio]: make virtio support NUMA attr Liu Ping Fan
2012-05-17  9:20   ` [Qemu-devel] " Liu Ping Fan
2012-05-17  9:20 ` [PATCH 2/2] [net/virtio_net]: make virtio_net support NUMA info Liu Ping Fan
2012-05-17  9:20   ` [Qemu-devel] " Liu Ping Fan
2012-05-18 16:14 ` [RFC:kvm] export host NUMA info to guest & make emulated device NUMA attr Shirley Ma
2012-05-18 16:14   ` [Qemu-devel] " Shirley Ma
2012-05-22  9:28   ` Liu ping fan
2012-05-22  9:28     ` [Qemu-devel] " Liu ping fan
2012-05-23 14:52     ` Andrew Theurer
2012-05-23 14:52       ` [Qemu-devel] " Andrew Theurer
2012-05-23 15:16       ` Michael S. Tsirkin
2012-05-23 15:16         ` [Qemu-devel] " Michael S. Tsirkin
2012-05-23 15:16         ` Michael S. Tsirkin
2012-05-25  3:29         ` Liu ping fan
2012-05-25  3:29           ` [Qemu-devel] " Liu ping fan
2012-05-25  3:29           ` Liu ping fan
2012-05-25  4:05       ` Liu ping fan
2012-05-25  4:05         ` [Qemu-devel] " Liu ping fan

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:51e4c1e dfblob:b0d2855 dfblob:8de1fd5 dfblob:12d4237
dfblob:e847f1e dfblob:d8c76f1 dfblob:51e4c1e dfblob:b0d2855
dfblob:8de1fd5 dfblob:12d4237 dfblob:e847f1e dfblob:d8c76f1 )
 OR (
bs:"[Qemu-devel] [PATCH 1/2] [kvm/vhost]: make vhost support NUMA model." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1337246456-30909-2-git-send-email-kernelfans@gmail.com \
    --to=kernelfans@gmail.com \
    --cc=anthony@codemonkey.ws \
    --cc=avi@redhat.com \
    --cc=krkumar2@in.ibm.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=qemu-devel@nongnu.org \
    --cc=rusty@rustcorp.com.au \
    --cc=ryanh@us.ibm.com \
    --cc=toml@us.ibm.com \
    --cc=vatsa@linux.vnet.ibm.com \
    --cc=xma@us.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.