Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v2 kernel 4/7] virtio-balloon: speed up inflate/deflate process
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, virtualization
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

The implementation of the current virtio-balloon is not very
efficient, the time spends on different stages of inflating
the balloon to 7GB of a 8GB idle guest:

a. allocating pages (6.5%)
b. sending PFNs to host (68.3%)
c. address translation (6.1%)
d. madvise (19%)

It takes about 4126ms for the inflating process to complete.
Debugging shows that the bottle neck are the stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we
can reduce the overhead in stage b quite a lot. Furthermore, we
can do the address translation and call madvise() with a bulk of
RAM pages, instead of the current page per page way, the overhead
of stage c and stage d can also be reduced a lot.

This patch is the kernel side implementation which is intended to
speed up the inflating & deflating process by adding a new feature
to the virtio-balloon device. With this new feature, inflating the
balloon to 7GB of a 8GB idle guest only takes 590ms, the
performance improvement is about 85%.

TODO: optimize stage a by allocating/freeing a chunk of pages
instead of a single page at a time.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 184 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 162 insertions(+), 22 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8d649a2..2d18ff6 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -41,10 +41,28 @@
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+/*
+ * VIRTIO_BALLOON_PFNS_LIMIT is used to limit the size of page bitmap
+ * to prevent a very large page bitmap, there are two reasons for this:
+ * 1) to save memory.
+ * 2) allocate a large bitmap may fail.
+ *
+ * The actual limit of pfn is determined by:
+ * pfn_limit = min(max_pfn, VIRTIO_BALLOON_PFNS_LIMIT);
+ *
+ * If system has more pages than VIRTIO_BALLOON_PFNS_LIMIT, we will scan
+ * the page list and send the PFNs with several times. To reduce the
+ * overhead of scanning the page list. VIRTIO_BALLOON_PFNS_LIMIT should
+ * be set with a value which can cover most cases.
+ */
+#define VIRTIO_BALLOON_PFNS_LIMIT ((32 * (1ULL << 30)) >> PAGE_SHIFT) /* 32GB */
+
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+extern unsigned long get_max_pfn(void);
+
 struct virtio_balloon {
 	struct virtio_device *vdev;
 	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -62,6 +80,15 @@ struct virtio_balloon {
 
 	/* Number of balloon pages we've told the Host we're not using. */
 	unsigned int num_pages;
+	/* Pointer of the bitmap header. */
+	void *bmap_hdr;
+	/* Bitmap and length used to tell the host the pages */
+	unsigned long *page_bitmap;
+	unsigned long bmap_len;
+	/* Pfn limit */
+	unsigned long pfn_limit;
+	/* Used to record the processed pfn range */
+	unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
 	/*
 	 * The pages we've told the Host we're not using are enqueued
 	 * at vb_dev_info->pages list.
@@ -105,12 +132,45 @@ static void balloon_ack(struct virtqueue *vq)
 	wake_up(&vb->acked);
 }
 
+static inline void init_pfn_range(struct virtio_balloon *vb)
+{
+	vb->min_pfn = ULONG_MAX;
+	vb->max_pfn = 0;
+}
+
+static inline void update_pfn_range(struct virtio_balloon *vb,
+				 struct page *page)
+{
+	unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+	if (balloon_pfn < vb->min_pfn)
+		vb->min_pfn = balloon_pfn;
+	if (balloon_pfn > vb->max_pfn)
+		vb->max_pfn = balloon_pfn;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
 	struct scatterlist sg;
 	unsigned int len;
 
-	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+		struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+		unsigned long bmap_len;
+
+		/* cmd and req_id are not used here, set them to 0 */
+		hdr->cmd = cpu_to_virtio16(vb->vdev, 0);
+		hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+		hdr->reserved = cpu_to_virtio16(vb->vdev, 0);
+		hdr->req_id = cpu_to_virtio64(vb->vdev, 0);
+		hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
+		bmap_len = min(vb->bmap_len,
+			(vb->end_pfn - vb->start_pfn) / BITS_PER_BYTE);
+		hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+		sg_init_one(&sg, hdr,
+			 sizeof(struct balloon_bmap_hdr) + bmap_len);
+	} else
+		sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
 
 	/* We should always be able to add one buffer to an empty queue. */
 	virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
@@ -118,7 +178,6 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 	/* When host has read buffer, this completes via balloon_ack */
 	wait_event(vb->acked, virtqueue_get_buf(vq, &len));
-
 }
 
 static void set_page_pfns(struct virtio_balloon *vb,
@@ -133,13 +192,53 @@ static void set_page_pfns(struct virtio_balloon *vb,
 					  page_to_balloon_pfn(page) + i);
 }
 
-static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
+static void set_page_bitmap(struct virtio_balloon *vb,
+			 struct list_head *pages, struct virtqueue *vq)
+{
+	unsigned long pfn;
+	struct page *page;
+	bool found;
+
+	vb->min_pfn = rounddown(vb->min_pfn, BITS_PER_LONG);
+	vb->max_pfn = roundup(vb->max_pfn, BITS_PER_LONG);
+	for (pfn = vb->min_pfn; pfn < vb->max_pfn;
+			pfn += vb->pfn_limit) {
+		vb->start_pfn = pfn + vb->pfn_limit;
+		vb->end_pfn = pfn;
+		memset(vb->page_bitmap, 0, vb->bmap_len);
+		found = false;
+		list_for_each_entry(page, pages, lru) {
+			unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+			if (balloon_pfn < pfn ||
+				 balloon_pfn >= pfn + vb->pfn_limit)
+				continue;
+			set_bit(balloon_pfn - pfn, vb->page_bitmap);
+			if (balloon_pfn > vb->end_pfn)
+				vb->end_pfn = balloon_pfn;
+			if (balloon_pfn < vb->start_pfn)
+				vb->start_pfn = balloon_pfn;
+			found = true;
+		}
+		if (found) {
+			vb->start_pfn = rounddown(vb->start_pfn, BITS_PER_LONG);
+			vb->end_pfn = roundup(vb->end_pfn, BITS_PER_LONG);
+			tell_host(vb, vq);
+		}
+	}
+}
+
+static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num,
+				 bool use_bmap)
 {
 	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
-	unsigned num_allocated_pages;
+	unsigned int num_allocated_pages;
 
-	/* We can only do one array worth at a time. */
-	num = min(num, ARRAY_SIZE(vb->pfns));
+	if (use_bmap)
+		init_pfn_range(vb);
+	else
+		/* We can only do one array worth at a time. */
+		num = min(num, ARRAY_SIZE(vb->pfns));
 
 	mutex_lock(&vb->balloon_lock);
 	for (vb->num_pfns = 0; vb->num_pfns < num;
@@ -154,7 +253,10 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 			msleep(200);
 			break;
 		}
-		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		if (use_bmap)
+			update_pfn_range(vb, page);
+		else
+			set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
 		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
@@ -163,8 +265,13 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 
 	num_allocated_pages = vb->num_pfns;
 	/* Did we get any? */
-	if (vb->num_pfns != 0)
-		tell_host(vb, vb->inflate_vq);
+	if (vb->num_pfns != 0) {
+		if (use_bmap)
+			set_page_bitmap(vb, &vb_dev_info->pages,
+					vb->inflate_vq);
+		else
+			tell_host(vb, vb->inflate_vq);
+	}
 	mutex_unlock(&vb->balloon_lock);
 
 	return num_allocated_pages;
@@ -184,15 +291,19 @@ static void release_pages_balloon(struct virtio_balloon *vb,
 	}
 }
 
-static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
+static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num,
+				bool use_bmap)
 {
-	unsigned num_freed_pages;
+	unsigned int num_freed_pages;
 	struct page *page;
 	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
 	LIST_HEAD(pages);
 
-	/* We can only do one array worth at a time. */
-	num = min(num, ARRAY_SIZE(vb->pfns));
+	if (use_bmap)
+		init_pfn_range(vb);
+	else
+		/* We can only do one array worth at a time. */
+		num = min(num, ARRAY_SIZE(vb->pfns));
 
 	mutex_lock(&vb->balloon_lock);
 	for (vb->num_pfns = 0; vb->num_pfns < num;
@@ -200,7 +311,10 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 		page = balloon_page_dequeue(vb_dev_info);
 		if (!page)
 			break;
-		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		if (use_bmap)
+			update_pfn_range(vb, page);
+		else
+			set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
 		list_add(&page->lru, &pages);
 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
 	}
@@ -211,9 +325,14 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	 * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
 	 * is true, we *have* to do it in this order
 	 */
-	if (vb->num_pfns != 0)
-		tell_host(vb, vb->deflate_vq);
-	release_pages_balloon(vb, &pages);
+	if (vb->num_pfns != 0) {
+		if (use_bmap)
+			set_page_bitmap(vb, &pages, vb->deflate_vq);
+		else
+			tell_host(vb, vb->deflate_vq);
+
+		release_pages_balloon(vb, &pages);
+	}
 	mutex_unlock(&vb->balloon_lock);
 	return num_freed_pages;
 }
@@ -347,13 +466,15 @@ static int virtballoon_oom_notify(struct notifier_block *self,
 	struct virtio_balloon *vb;
 	unsigned long *freed;
 	unsigned num_freed_pages;
+	bool use_bmap;
 
 	vb = container_of(self, struct virtio_balloon, nb);
 	if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
 		return NOTIFY_OK;
 
 	freed = parm;
-	num_freed_pages = leak_balloon(vb, oom_pages);
+	use_bmap = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+	num_freed_pages = leak_balloon(vb, oom_pages, use_bmap);
 	update_balloon_size(vb);
 	*freed += num_freed_pages;
 
@@ -373,15 +494,17 @@ static void update_balloon_size_func(struct work_struct *work)
 {
 	struct virtio_balloon *vb;
 	s64 diff;
+	bool use_bmap;
 
 	vb = container_of(work, struct virtio_balloon,
 			  update_balloon_size_work);
+	use_bmap = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
 	diff = towards_target(vb);
 
 	if (diff > 0)
-		diff -= fill_balloon(vb, diff);
+		diff -= fill_balloon(vb, diff, use_bmap);
 	else if (diff < 0)
-		diff += leak_balloon(vb, -diff);
+		diff += leak_balloon(vb, -diff, use_bmap);
 	update_balloon_size(vb);
 
 	if (diff)
@@ -489,7 +612,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 static int virtballoon_probe(struct virtio_device *vdev)
 {
 	struct virtio_balloon *vb;
-	int err;
+	int err, hdr_len;
 
 	if (!vdev->config->get) {
 		dev_err(&vdev->dev, "%s failure: config access disabled\n",
@@ -508,6 +631,18 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	spin_lock_init(&vb->stop_update_lock);
 	vb->stop_update = false;
 	vb->num_pages = 0;
+	vb->pfn_limit = VIRTIO_BALLOON_PFNS_LIMIT;
+	vb->pfn_limit = min(vb->pfn_limit, get_max_pfn());
+	vb->bmap_len = ALIGN(vb->pfn_limit, BITS_PER_LONG) /
+		 BITS_PER_BYTE + 2 * sizeof(unsigned long);
+	hdr_len = sizeof(struct balloon_bmap_hdr);
+	vb->bmap_hdr = kzalloc(hdr_len + vb->bmap_len, GFP_KERNEL);
+
+	/* Clear the feature bit if memory allocation fails */
+	if (!vb->bmap_hdr)
+		__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+	else
+		vb->page_bitmap = vb->bmap_hdr + hdr_len;
 	mutex_init(&vb->balloon_lock);
 	init_waitqueue_head(&vb->acked);
 	vb->vdev = vdev;
@@ -541,9 +676,12 @@ out:
 
 static void remove_common(struct virtio_balloon *vb)
 {
+	bool use_bmap;
+
+	use_bmap = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
 	/* There might be pages left in the balloon: free them. */
 	while (vb->num_pages)
-		leak_balloon(vb, vb->num_pages);
+		leak_balloon(vb, vb->num_pages, use_bmap);
 	update_balloon_size(vb);
 
 	/* Now we reset the device so we can clean up the queues. */
@@ -565,6 +703,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
 	cancel_work_sync(&vb->update_balloon_stats_work);
 
 	remove_common(vb);
+	kfree(vb->page_bitmap);
 	kfree(vb);
 }
 
@@ -603,6 +742,7 @@ static unsigned int features[] = {
 	VIRTIO_BALLOON_F_MUST_TELL_HOST,
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+	VIRTIO_BALLOON_F_PAGE_BITMAP,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 3/7] mm: add a function to get the max pfn
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, Andrew Morton, virtualization,
	Mel Gorman
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

Expose the function to get the max pfn, so it can be used in the
virtio-balloon device driver.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 mm/page_alloc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6903b69..2083b40 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4515,6 +4515,12 @@ void show_free_areas(unsigned int filter)
 	show_swap_cache_info();
 }
 
+unsigned long get_max_pfn(void)
+{
+	return max_pfn;
+}
+EXPORT_SYMBOL(get_max_pfn);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
 	zoneref->zone = zone;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 2/7] virtio-balloon: define new feature bit and page bitmap head
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, virtualization
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

Add a new feature which supports sending the page information with
a bitmap. The current implementation uses PFNs array, which is not
very efficient. Using bitmap can improve the performance of
inflating/deflating significantly

The page bitmap header will used to tell the host some information
about the page bitmap. e.g. the page size, page bitmap length and
start pfn.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 include/uapi/linux/virtio_balloon.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 343d7dd..d3b182a 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -34,6 +34,7 @@
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_PAGE_BITMAP	3 /* Send page info with bitmap */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -82,4 +83,22 @@ struct virtio_balloon_stat {
 	__virtio64 val;
 } __attribute__((packed));
 
+/* Page bitmap header structure */
+struct balloon_bmap_hdr {
+	/* Used to distinguish different request */
+	__virtio16 cmd;
+	/* Shift width of page in the bitmap */
+	__virtio16 page_shift;
+	/* flag used to identify different status */
+	__virtio16 flag;
+	/* Reserved */
+	__virtio16 reserved;
+	/* ID of the request */
+	__virtio64 req_id;
+	/* The pfn of 0 bit in the bitmap */
+	__virtio64 start_pfn;
+	/* The length of the bitmap, in bytes */
+	__virtio64 bmap_len;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 1/7] virtio-balloon: rework deflate to add page to a list
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, virtualization
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

will allow faster notifications using a bitmap down the road.
balloon_pfn_to_page() can be removed because it's useless.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 476c0e3..8d649a2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -98,12 +98,6 @@ static u32 page_to_balloon_pfn(struct page *page)
 	return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE;
 }
 
-static struct page *balloon_pfn_to_page(u32 pfn)
-{
-	BUG_ON(pfn % VIRTIO_BALLOON_PAGES_PER_PAGE);
-	return pfn_to_page(pfn / VIRTIO_BALLOON_PAGES_PER_PAGE);
-}
-
 static void balloon_ack(struct virtqueue *vq)
 {
 	struct virtio_balloon *vb = vq->vdev->priv;
@@ -176,18 +170,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 	return num_allocated_pages;
 }
 
-static void release_pages_balloon(struct virtio_balloon *vb)
+static void release_pages_balloon(struct virtio_balloon *vb,
+				 struct list_head *pages)
 {
-	unsigned int i;
-	struct page *page;
+	struct page *page, *next;
 
-	/* Find pfns pointing at start of each page, get pages and free them. */
-	for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-		page = balloon_pfn_to_page(virtio32_to_cpu(vb->vdev,
-							   vb->pfns[i]));
+	list_for_each_entry_safe(page, next, pages, lru) {
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
 			adjust_managed_page_count(page, 1);
+		list_del(&page->lru);
 		put_page(page); /* balloon reference */
 	}
 }
@@ -197,6 +189,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	unsigned num_freed_pages;
 	struct page *page;
 	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+	LIST_HEAD(pages);
 
 	/* We can only do one array worth at a time. */
 	num = min(num, ARRAY_SIZE(vb->pfns));
@@ -208,6 +201,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 		if (!page)
 			break;
 		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		list_add(&page->lru, &pages);
 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
 	}
 
@@ -219,7 +213,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	 */
 	if (vb->num_pfns != 0)
 		tell_host(vb, vb->deflate_vq);
-	release_pages_balloon(vb);
+	release_pages_balloon(vb, &pages);
 	mutex_unlock(&vb->balloon_lock);
 	return num_freed_pages;
 }
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 0/7] Extend virtio-balloon for fast (de)inflating & fast live migration
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	virtualization

This patch set contains two parts of changes to the virtio-balloon. 

One is the change for speeding up the inflating & deflating process,
the main idea of this optimization is to use bitmap to send the page
information to host instead of the PFNs, to reduce the overhead of
virtio data transmission, address translation and madvise(). This can
help to improve the performance by about 85%.

Another change is for speeding up live migration. By skipping process
guest's free pages in the first round of data copy, to reduce needless
data processing, this can help to save quite a lot of CPU cycles and
network bandwidth. We put guest's free page information in bitmap and
send it to host with the virt queue of virtio-balloon. For an idle 8GB
guest, this can help to shorten the total live migration time from 2Sec
to about 500ms in the 10Gbps network environment.  


Changes from v1 to v2:
    * Abandon the patch for dropping page cache.
    * Put some structures to uapi head file.
    * Use a new way to determine the page bitmap size.
    * Use a unified way to send the free page information with the bitmap 
    * Address the issues referred in MST's comments

Liang Li (7):
  virtio-balloon: rework deflate to add page to a list
  virtio-balloon: define new feature bit and page bitmap head
  mm: add a function to get the max pfn
  virtio-balloon: speed up inflate/deflate process
  virtio-balloon: define feature bit and head for misc virt queue
  mm: add the related functions to get free page info
  virtio-balloon: tell host vm's free page info

 drivers/virtio/virtio_balloon.c     | 306 +++++++++++++++++++++++++++++++-----
 include/uapi/linux/virtio_balloon.h |  41 +++++
 mm/page_alloc.c                     |  52 ++++++
 3 files changed, 359 insertions(+), 40 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH] virtio-blk: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-29  1:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: virtualization, linux-kernel, Michael S. Tsirkin
In-Reply-To: <20160628114553.GA18354@infradead.org>

On Tue, 06/28 04:45, Christoph Hellwig wrote:
> On Tue, Jun 28, 2016 at 10:39:15AM +0800, Fam Zheng wrote:
> > Userspace listens to the KOBJ_ADD uevent generated in add_disk. At that
> > point we haven't created the serial attribute file, therefore depending
> > on how fast udev reacts, the /dev/disk/by-id/ entry doesn't always get
> > created.
> > 
> > This race condition can be easily reproduced by hot plugging a number of
> > virtio-blk disks.
> > 
> > Also in systemd, there used to be a related workaround in udev rules
> > called 'WAIT_FOR="serial"', but it is removed in later versions.
> > 
> > Now let's generate a KOBJ_CHANGE event after the attributes are ready.
> 
> The same race is present in other drivers as well, e.g. nvme.  Please
> find a way to make this work properly without needing to hack every
> driver to send events manually.

OK, I'll take a look today!

Fam

^ permalink raw reply

* [PATCH v2 4/4] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner
From: Pan Xinhui @ 2016-06-28 14:43 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390
  Cc: dave, Pan Xinhui, peterz, mpe, boqun.feng, will.deacon,
	waiman.long, mingo, paulus, benh, schwidefsky, paulmck
In-Reply-To: <1467124991-13164-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

An over-committed guest with more vCPUs than pCPUs has a heavy overload in
the two spin_on_owner. This blames on the lock holder preemption issue.

Kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU is
currently running or not. So break the spin loops on true condition.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown]         [H] 0xc0000000000768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 kernel/locking/mutex.c      | 15 +++++++++++++--
 kernel/locking/rwsem-xadd.c | 16 +++++++++++++---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 79d2d76..ef0451b2 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,13 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 		 */
 		barrier();
 
-		if (!owner->on_cpu || need_resched()) {
+		/*
+		 * Use vcpu_is_preempted to detech lock holder preemption issue
+		 * and break. vcpu_is_preempted is a macro defined by false if
+		 * arch does not support vcpu preempted check,
+		 */
+		if (!owner->on_cpu || need_resched() ||
+				vcpu_is_preempted(task_cpu(owner))) {
 			ret = false;
 			break;
 		}
@@ -261,8 +267,13 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 
 	rcu_read_lock();
 	owner = READ_ONCE(lock->owner);
+
+	/*
+	 * As lock holder preemption issue, we both skip spinning if task not
+	 * on cpu or its cpu is preempted
+	 */
 	if (owner)
-		retval = owner->on_cpu;
+		retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 	rcu_read_unlock();
 	/*
 	 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c6..828ca7c 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -319,7 +319,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 		goto done;
 	}
 
-	ret = owner->on_cpu;
+	/*
+	 * As lock holder preemption issue, we both skip spinning if task not
+	 * on cpu or its cpu is preempted
+	 */
+	ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
 	rcu_read_unlock();
 	return ret;
@@ -340,8 +344,14 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 		 */
 		barrier();
 
-		/* abort spinning when need_resched or owner is not running */
-		if (!owner->on_cpu || need_resched()) {
+		/*
+		 * abort spinning when need_resched or owner is not running or
+		 * owner's cpu is preempted. vcpu_is_preempted is a macro
+		 * defined by false if arch does not support vcpu preempted
+		 * check
+		 */
+		if (!owner->on_cpu || need_resched() ||
+				vcpu_is_preempted(task_cpu(owner))) {
 			rcu_read_unlock();
 			return false;
 		}
-- 
2.4.11

^ permalink raw reply related

* [PATCH v2 3/4] locking/osq: Drop the overload of osq_lock()
From: Pan Xinhui @ 2016-06-28 14:43 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390
  Cc: dave, Pan Xinhui, peterz, mpe, boqun.feng, will.deacon,
	waiman.long, mingo, paulus, benh, schwidefsky, paulmck
In-Reply-To: <1467124991-13164-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

An over-committed guest with more vCPUs than pCPUs has a heavy overload in
osq_lock().

This is because vCPU A hold the osq lock and yield out, vCPU B wait per_cpu
node->locked to be set. IOW, vCPU B wait vCPU A to run and unlock the osq
lock.

Kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU is
currently running or not. So break the spin loops on true condition.

test case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

after patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 kernel/locking/osq_lock.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a3785..39d1385 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
 	return cpu_nr + 1;
 }
 
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+	return node->cpu - 1;
+}
+
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
 	int cpu_nr = encoded_cpu_val - 1;
@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	while (!READ_ONCE(node->locked)) {
 		/*
 		 * If we need to reschedule bail... so we can block.
+		 * Use vcpu_is_preempted to detech lock holder preemption issue
+		 * and break. vcpu_is_preempted is a macro defined by false if
+		 * arch does not support vcpu preempted check,
 		 */
-		if (need_resched())
+		if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
 			goto unqueue;
 
 		cpu_relax_lowlatency();
-- 
2.4.11

^ permalink raw reply related

* [PATCH v2 2/4] powerpc/spinlock: support vcpu preempted check
From: Pan Xinhui @ 2016-06-28 14:43 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390
  Cc: dave, Pan Xinhui, peterz, mpe, boqun.feng, will.deacon,
	waiman.long, mingo, paulus, benh, schwidefsky, paulmck
In-Reply-To: <1467124991-13164-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

This is to fix some lock holder preemption issues. Some other locks
implementation do a spin loop before acquiring the lock itself. Currently
kernel has an interface of bool vcpu_is_preempted(int cpu). It take the cpu
as parameter and return true if the cpu is preempted. Then kernel can break
the spin loops upon on the retval of vcpu_is_preempted.

As kernel has used this interface, So lets support it.

Only pSeries need supoort it. And the fact is powerNV are built into same
kernel image with pSeries. So we need return false if we are runnig as
powerNV. The another fact is that lppaca->yiled_count keeps zero on
powerNV. So we can just skip the machine type.

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/spinlock.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 523673d..3ac9fcb 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,24 @@
 #define SYNC_IO
 #endif
 
+/*
+ * This support kernel to check if one cpu is preempted or not.
+ * Then we can fix some lock holder preemption issue.
+ */
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+	/*
+	 * pSeries and powerNV can be built into same kernel image. In
+	 * principle we need return false directly if we are running as
+	 * powerNV. However the yield_count is always zero on powerNV, So
+	 * skip such machine type check
+	 */
+	return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
 	return lock.slock == 0;
-- 
2.4.11

^ permalink raw reply related

* [PATCH v2 1/4] kernel/sched: introduce vcpu preempted check interface
From: Pan Xinhui @ 2016-06-28 14:43 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390
  Cc: dave, Pan Xinhui, peterz, mpe, boqun.feng, will.deacon,
	waiman.long, mingo, paulus, benh, schwidefsky, paulmck
In-Reply-To: <1467124991-13164-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

This patch support to fix lock holder preemption issue.

For kernel users, we could use bool vcpu_is_preempted(int cpu) to detech if
one vcpu is preempted or not.

The default implementation is a macro defined by false. So compiler can
wrap it out if arch dose not support such vcpu pteempted check.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 include/linux/sched.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada..cbe0574 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3293,6 +3293,18 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/*
+ * In order to deal with a various lock holder preemption issues provide an
+ * interface to see if a vCPU is currently running or not.
+ *
+ * This allows us to terminate optimistic spin loops and block, analogous to
+ * the native optimistic spin heuristic of testing if the lock owner task is
+ * running or not.
+ */
+#ifndef vcpu_is_preempted
+#define vcpu_is_preempted(cpu)	false
+#endif
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-- 
2.4.11

^ permalink raw reply related

* [PATCH v2 0/4] implement vcpu preempted check
From: Pan Xinhui @ 2016-06-28 14:43 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390
  Cc: dave, Pan Xinhui, peterz, mpe, boqun.feng, will.deacon,
	waiman.long, mingo, paulus, benh, schwidefsky, paulmck

change fomr v1:
	a simplier definition of default vcpu_is_preempted
	skip mahcine type check on ppc, and add config. remove dedicated macro.
	add one patch to drop overload of rwsem_spin_on_owner and mutex_spin_on_owner. 
	add more comments
	thanks boqun and Peter's suggestion.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce interface bool vcpu_is_preempted(int cpu) and use it in some spin
loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_onwer variant also cause rcu stall before we apply this patch set

Pan Xinhui (4):
  kernel/sched: introduce vcpu preempted check interface
  powerpc/spinlock: support vcpu preempted check
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner

 arch/powerpc/include/asm/spinlock.h | 18 ++++++++++++++++++
 include/linux/sched.h               | 12 ++++++++++++
 kernel/locking/mutex.c              | 15 +++++++++++++--
 kernel/locking/osq_lock.c           | 10 +++++++++-
 kernel/locking/rwsem-xadd.c         | 16 +++++++++++++---
 5 files changed, 65 insertions(+), 6 deletions(-)

-- 
2.4.11

^ permalink raw reply

* Re: [PATCH] virtio-blk: Generate uevent after attribute available
From: Christoph Hellwig @ 2016-06-28 11:45 UTC (permalink / raw)
  To: Fam Zheng; +Cc: virtualization, linux-kernel, Michael S. Tsirkin
In-Reply-To: <20160628023915.6012-1-famz@redhat.com>

On Tue, Jun 28, 2016 at 10:39:15AM +0800, Fam Zheng wrote:
> Userspace listens to the KOBJ_ADD uevent generated in add_disk. At that
> point we haven't created the serial attribute file, therefore depending
> on how fast udev reacts, the /dev/disk/by-id/ entry doesn't always get
> created.
> 
> This race condition can be easily reproduced by hot plugging a number of
> virtio-blk disks.
> 
> Also in systemd, there used to be a related workaround in udev rules
> called 'WAIT_FOR="serial"', but it is removed in later versions.
> 
> Now let's generate a KOBJ_CHANGE event after the attributes are ready.

The same race is present in other drivers as well, e.g. nvme.  Please
find a way to make this work properly without needing to hack every
driver to send events manually.

^ permalink raw reply

* Re: [PATCH net-next V2] tun: introduce tx skb ring
From: Michael S. Tsirkin @ 2016-06-28  7:09 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, eric.dumazet, netdev, linux-kernel, virtualization, brouer,
	davem
In-Reply-To: <576B701F.2020105@redhat.com>

On Thu, Jun 23, 2016 at 01:14:07PM +0800, Jason Wang wrote:
> 
> 
> On 2016年06月23日 02:18, Michael S. Tsirkin wrote:
> > On Fri, Jun 17, 2016 at 03:41:20AM +0300, Michael S. Tsirkin wrote:
> > > >Would it help to have ptr_ring_resize that gets an array of
> > > >rings and resizes them both to same length?
> > OK, here it is. Untested so far, and no skb wrapper.
> > Pls let me know whether this is what you had in mind.
> 
> Exactly what I want.
> 
> Thanks

Ok and this for skb_array

-->
skb_array: add wrappers for resizing

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

--

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index c900708..7e01c1f 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -151,16 +151,24 @@ static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
 	return ptr_ring_init(&a->ring, size, 0, gfp);
 }
 
-void __skb_array_destroy_skb(void *ptr)
+static void __skb_array_destroy_skb(void *ptr)
 {
 	kfree_skb(ptr);
 }
 
-int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
+static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 {
 	return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
 }
 
+static inline int skb_raay_resize_multiple(struct skb_array **rings, int nrings,
+					   int size, gfp_t gfp)
+{
+	BUILD_BUG_ON(offsetof(struct skb_array, ring));
+	ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp,
+				 __skb_array_destroy_skb);
+}
+
 static inline void skb_array_cleanup(struct skb_array *a)
 {
 	ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related

* Re: [PATCH v6v3 02/12] mm: migrate: support non-lru movable page migration
From: Minchan Kim @ 2016-06-28  6:39 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
	Hugh Dickins, linux-kernel, dri-devel, virtualization,
	John Einar Reitan, linux-mm, Gioh Kim, Mel Gorman, Andrew Morton,
	Joonsoo Kim, Vlastimil Babka
In-Reply-To: <5770BEC5.3010807@linux.vnet.ibm.com>

On Mon, Jun 27, 2016 at 11:21:01AM +0530, Anshuman Khandual wrote:
> On 06/16/2016 11:07 AM, Minchan Kim wrote:
> > On Thu, Jun 16, 2016 at 09:12:07AM +0530, Anshuman Khandual wrote:
> >> On 06/16/2016 05:56 AM, Minchan Kim wrote:
> >>> On Wed, Jun 15, 2016 at 12:15:04PM +0530, Anshuman Khandual wrote:
> >>>> On 06/15/2016 08:02 AM, Minchan Kim wrote:
> >>>>> Hi,
> >>>>>
> >>>>> On Mon, Jun 13, 2016 at 03:08:19PM +0530, Anshuman Khandual wrote:
> >>>>>>> On 05/31/2016 05:31 AM, Minchan Kim wrote:
> >>>>>>>>> @@ -791,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >>>>>>>>>  	int rc = -EAGAIN;
> >>>>>>>>>  	int page_was_mapped = 0;
> >>>>>>>>>  	struct anon_vma *anon_vma = NULL;
> >>>>>>>>> +	bool is_lru = !__PageMovable(page);
> >>>>>>>>>  
> >>>>>>>>>  	if (!trylock_page(page)) {
> >>>>>>>>>  		if (!force || mode == MIGRATE_ASYNC)
> >>>>>>>>> @@ -871,6 +1002,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >>>>>>>>>  		goto out_unlock_both;
> >>>>>>>>>  	}
> >>>>>>>>>  
> >>>>>>>>> +	if (unlikely(!is_lru)) {
> >>>>>>>>> +		rc = move_to_new_page(newpage, page, mode);
> >>>>>>>>> +		goto out_unlock_both;
> >>>>>>>>> +	}
> >>>>>>>>> +
> >>>>>>>
> >>>>>>> Hello Minchan,
> >>>>>>>
> >>>>>>> I might be missing something here but does this implementation support the
> >>>>>>> scenario where these non LRU pages owned by the driver mapped as PTE into
> >>>>>>> process page table ? Because the "goto out_unlock_both" statement above
> >>>>>>> skips all the PTE unmap, putting a migration PTE and removing the migration
> >>>>>>> PTE steps.
> >>>>> You're right. Unfortunately, it doesn't support right now but surely,
> >>>>> it's my TODO after landing this work.
> >>>>>
> >>>>> Could you share your usecase?
> >>>>
> >>>> Sure.
> >>>
> >>> Thanks a lot!
> >>>
> >>>>
> >>>> My driver has privately managed non LRU pages which gets mapped into user space
> >>>> process page table through f_ops->mmap() and vmops->fault() which then updates
> >>>> the file RMAP (page->mapping->i_mmap) through page_add_file_rmap(page). One thing
> >>>
> >>> Hmm, page_add_file_rmap is not exported function. How does your driver can use it?
> >>
> >> Its not using the function directly, I just re-iterated the sequence of functions
> >> above. (do_set_pte -> page_add_file_rmap) gets called after we grab the page from
> >> driver through (__do_fault->vma->vm_ops->fault()).
> >>
> >>> Do you use vm_insert_pfn?
> >>> What type your vma is? VM_PFNMMAP or VM_MIXEDMAP?
> >>
> >> I dont use vm_insert_pfn(). Here is the sequence of events how the user space
> >> VMA gets the non LRU pages from the driver.
> >>
> >> - Driver registers a character device with 'struct file_operations' binding
> >> - Then the 'fops->mmap()' just binds the incoming 'struct vma' with a 'struct
> >>   vm_operations_struct' which provides the 'vmops->fault()' routine which
> >>   basically traps all page faults on the VMA and provides one page at a time
> >>   through a driver specific allocation routine which hands over non LRU pages
> >>
> >> The VMA is not anything special as such. Its what we get when we try to do a
> >> simple mmap() on a file descriptor pointing to a character device. I can
> >> figure out all the VM_* flags it holds after creation.
> >>
> >>>
> >>> I want to make dummy driver to simulate your case.
> >>
> >> Sure. I hope the above mentioned steps will help you but in case you need more
> >> information, please do let me know.
> > 
> > I got understood now. :)
> > I will test it with dummy driver and will Cc'ed when I send a patch.
> 
> Hello Minchan,
> 
> Do you have any updates on this ? The V7 of the series still has this limitation.
> Did you get a chance to test the driver out ? I am still concerned about how to
> handle the struct address_space override problem within the struct page.

Hi Anshuman,

Slow but I am working on that. :) However, as I said, I want to do it
after soft landing of current non-lru-no-mapped page migration to solve
current real field issues.

About the overriding problem of non-lru-mapped-page, I implemented dummy
driver as miscellaneous device and in test_mmap(file_operations.mmap),
I changed a_ops with my address_space_operations.

int test_mmap(struct file *filp, struct vm_area_struct *vma)
{
        filp->f_mapping->a_ops = &test_aops;
        vma->vm_ops = &test_vm_ops;
        vma->vm_private_data = filp->private_data;
        return 0;
}

test_aops should have *set_page_dirty* overriding.

static int test_set_pag_dirty(struct page *page)
{
        if (!PageDirty(page))
                SetPageDirty*page);
        return 0;
}

Otherwise, it goes BUG_ON during radix tree operation because
currently try_to_unmap is designed for file-lru pages which lives
in page cache so it propagates page table dirty bit to PG_dirty flag
of struct page by set_page_dirty. And set_page_dirty want to mark
dirty tag in radix tree node but it's character driver so the page
cache doesn't have it. That's why we encounter BUG_ON in radix tree
operation. Anyway, to test, I implemented set_page_dirty in my dummy
driver.

With only that, it doesn't work because I need to modify migrate.c to
work non-lru-mapped-page and changing PG_isolated flag which is
override of PG_reclaim which is cleared in set_page_dirty.

With that, it seems to work. But I'm not saying it's right model now
for device drivers. In runtime, replacing filp->f_mapping->a_ops with
custom a_ops of own driver seems to be hacky to me.
So, I'm considering now new pseudo fs "movable_inode" which will
support 

struct file *movable_inode_getfile(const char *name,
                        const struct file_operations *fop,
                        const struct address_space_operations *a_ops)
{
        struct path path;
        struct qstr this;
        struct inode *inode;
        struct super_block *sb;

        this.name = name;
        this.len = strlen(name);
        this.hash = 0;
        sb = movable_mnt.mnt_sb;
        patch.denty = d_alloc_pseudo(movable_inode_mnt->mnt_sb, &this);
        patch.mnt = mntget(movable_inode_mnt);
        
        inode = new_inode(sb);
        ..
        ..
        inode->i_mapping->a_ops = a_ops;
        d_instantiate(path.dentry, inode);

        return alloc_file(&path, FMODE_WRITE | FMODE_READ, f_op);
}

And in our driver, we can change vma->vm_file with new one.

int test_mmap(struct file *filp, struct vm_area_structd *vma)
{
        struct file *newfile = movable_inode_getfile("[test"],
                                filep->f_op, &test_aops);
        vma->vm_file = newfile;
        ..
        ..
}

When I read mmap_region in mm/mmap.c, it's reasonable usecase
which dirver's mmap changes vma->vm_file with own file.

Anyway, it needs many subtle changes in mm/vfs/driver side so
need to review from each maintainers related subsystem so I
want to not be hurry.

Thanks.

> 
> - Anshuman
> 

^ permalink raw reply

* [PATCH] virtio-blk: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-28  2:39 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, famz, Michael S. Tsirkin

Userspace listens to the KOBJ_ADD uevent generated in add_disk. At that
point we haven't created the serial attribute file, therefore depending
on how fast udev reacts, the /dev/disk/by-id/ entry doesn't always get
created.

This race condition can be easily reproduced by hot plugging a number of
virtio-blk disks.

Also in systemd, there used to be a related workaround in udev rules
called 'WAIT_FOR="serial"', but it is removed in later versions.

Now let's generate a KOBJ_CHANGE event after the attributes are ready.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/block/virtio_blk.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42758b5..5056007 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -567,6 +567,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
 	struct request_queue *q;
+	struct device *ddev;
 	int err, index;
 
 	u64 cap;
@@ -746,6 +747,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 					 &dev_attr_cache_type_ro);
 	if (err)
 		goto out_del_disk;
+	ddev = disk_to_dev(vblk->disk);
+	kobject_uevent(&ddev->kobj, KOBJ_CHANGE);
 	return 0;
 
 out_del_disk:
-- 
2.9.0

^ permalink raw reply related

* Re: [PATCH v5 0/6] Support calling functions on dedicated physical cpu
From: Juergen Gross @ 2016-06-27 10:34 UTC (permalink / raw)
  To: linux-kernel, xen-devel, david.vrabel
  Cc: jeremy, jdelvare, peterz, hpa, akataria, x86, virtualization,
	chrisw, mingo, Douglas_Warzecha, pali.rohar, boris.ostrovsky,
	tglx, linux
In-Reply-To: <1459952266-3687-1-git-send-email-jgross@suse.com>

On 06/04/16 16:17, Juergen Gross wrote:
> Some hardware (e.g. Dell Studio laptops) require special functions to
> be called on physical cpu 0 in order to avoid occasional hangs. When
> running as dom0 under Xen this could be achieved only via special boot
> parameters (vcpu pinning) limiting the hypervisor in it's scheduling
> decisions.
> 
> This patch series is adding a generic function to be able to temporarily
> pin a (virtual) cpu to a dedicated physical cpu for executing above
> mentioned functions on that specific cpu. The drivers (dcdbas and i8k)
> requiring this functionality are modified accordingly.

David, as I've got no reactions when asking for the last outstanding
Acks (patches 2 and 5) and neither Ingo nor Peter seem to be willing
to take the patches, I'd like to suggest we carry the series via the
xen tree. Are you okay with this suggestion?


Juergen

> 
> Changes in V5:
> - patch 3: rename and reshuffle parameters of smp_call_on_cpu() as requested
>   by Peter Zijlstra
> - patch 3: test target cpu to be online as requested by Peter Zijlstra
> - patch 4: less wordy messages as requested by David Vrabel
> 
> Changes in V4:
> - move patches 5 and 6 further up in the series
> - patch 2 (was 5): WARN_ONCE in case platform doesn't support pinning
>   as requested by Peter Zijlstra
> - patch 3 (was 2): change return value in case of illegal cpu as
>   requested by Peter Zijlstra
> - patch 3 (was 2): make pinning of vcpu an option as suggested by
>   Peter Zijlstra
> - patches 5 and 6 (were 3 and 4): add call to get_online_cpus()
> 
> Changes in V3:
> - use get_cpu()/put_cpu() as suggested by David Vrabel
> 
> Changes in V2:
> - instead of manipulating the allowed set of cpus use cpu specific
>   workqueue as requested by Peter Zijlstra
> - add include/linux/hypervisor.h to hide architecture specific stuff
>   from generic kernel code
> 
> Juergen Gross (6):
>   xen: sync xen header
>   virt, sched: add generic vcpu pinning support
>   smp: add function to execute a function synchronously on a cpu
>   xen: add xen_pin_vcpu() to support calling functions on a dedicated
>     pcpu
>   dcdbas: make use of smp_call_on_cpu()
>   hwmon: use smp_call_on_cpu() for dell-smm i8k
> 
>  MAINTAINERS                       |   1 +
>  arch/x86/include/asm/hypervisor.h |   4 ++
>  arch/x86/kernel/cpu/hypervisor.c  |  11 +++++
>  arch/x86/xen/enlighten.c          |  40 +++++++++++++++
>  drivers/firmware/dcdbas.c         |  51 +++++++++----------
>  drivers/hwmon/dell-smm-hwmon.c    |  35 +++++++------
>  include/linux/hypervisor.h        |  17 +++++++
>  include/linux/smp.h               |   3 ++
>  include/xen/interface/sched.h     | 100 +++++++++++++++++++++++++++++++-------
>  kernel/smp.c                      |  51 +++++++++++++++++++
>  kernel/up.c                       |  18 +++++++
>  11 files changed, 273 insertions(+), 58 deletions(-)
>  create mode 100644 include/linux/hypervisor.h
> 

^ permalink raw reply

* Re: [PATCH v6v3 02/12] mm: migrate: support non-lru movable page migration
From: Anshuman Khandual @ 2016-06-27  5:51 UTC (permalink / raw)
  To: Minchan Kim, Anshuman Khandual
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
	Hugh Dickins, linux-kernel, dri-devel, virtualization,
	John Einar Reitan, linux-mm, Gioh Kim, Mel Gorman, Andrew Morton,
	Joonsoo Kim, Vlastimil Babka
In-Reply-To: <20160616053754.GQ17127@bbox>

On 06/16/2016 11:07 AM, Minchan Kim wrote:
> On Thu, Jun 16, 2016 at 09:12:07AM +0530, Anshuman Khandual wrote:
>> On 06/16/2016 05:56 AM, Minchan Kim wrote:
>>> On Wed, Jun 15, 2016 at 12:15:04PM +0530, Anshuman Khandual wrote:
>>>> On 06/15/2016 08:02 AM, Minchan Kim wrote:
>>>>> Hi,
>>>>>
>>>>> On Mon, Jun 13, 2016 at 03:08:19PM +0530, Anshuman Khandual wrote:
>>>>>>> On 05/31/2016 05:31 AM, Minchan Kim wrote:
>>>>>>>>> @@ -791,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>>>>>>>>>  	int rc = -EAGAIN;
>>>>>>>>>  	int page_was_mapped = 0;
>>>>>>>>>  	struct anon_vma *anon_vma = NULL;
>>>>>>>>> +	bool is_lru = !__PageMovable(page);
>>>>>>>>>  
>>>>>>>>>  	if (!trylock_page(page)) {
>>>>>>>>>  		if (!force || mode == MIGRATE_ASYNC)
>>>>>>>>> @@ -871,6 +1002,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>>>>>>>>>  		goto out_unlock_both;
>>>>>>>>>  	}
>>>>>>>>>  
>>>>>>>>> +	if (unlikely(!is_lru)) {
>>>>>>>>> +		rc = move_to_new_page(newpage, page, mode);
>>>>>>>>> +		goto out_unlock_both;
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>
>>>>>>> Hello Minchan,
>>>>>>>
>>>>>>> I might be missing something here but does this implementation support the
>>>>>>> scenario where these non LRU pages owned by the driver mapped as PTE into
>>>>>>> process page table ? Because the "goto out_unlock_both" statement above
>>>>>>> skips all the PTE unmap, putting a migration PTE and removing the migration
>>>>>>> PTE steps.
>>>>> You're right. Unfortunately, it doesn't support right now but surely,
>>>>> it's my TODO after landing this work.
>>>>>
>>>>> Could you share your usecase?
>>>>
>>>> Sure.
>>>
>>> Thanks a lot!
>>>
>>>>
>>>> My driver has privately managed non LRU pages which gets mapped into user space
>>>> process page table through f_ops->mmap() and vmops->fault() which then updates
>>>> the file RMAP (page->mapping->i_mmap) through page_add_file_rmap(page). One thing
>>>
>>> Hmm, page_add_file_rmap is not exported function. How does your driver can use it?
>>
>> Its not using the function directly, I just re-iterated the sequence of functions
>> above. (do_set_pte -> page_add_file_rmap) gets called after we grab the page from
>> driver through (__do_fault->vma->vm_ops->fault()).
>>
>>> Do you use vm_insert_pfn?
>>> What type your vma is? VM_PFNMMAP or VM_MIXEDMAP?
>>
>> I dont use vm_insert_pfn(). Here is the sequence of events how the user space
>> VMA gets the non LRU pages from the driver.
>>
>> - Driver registers a character device with 'struct file_operations' binding
>> - Then the 'fops->mmap()' just binds the incoming 'struct vma' with a 'struct
>>   vm_operations_struct' which provides the 'vmops->fault()' routine which
>>   basically traps all page faults on the VMA and provides one page at a time
>>   through a driver specific allocation routine which hands over non LRU pages
>>
>> The VMA is not anything special as such. Its what we get when we try to do a
>> simple mmap() on a file descriptor pointing to a character device. I can
>> figure out all the VM_* flags it holds after creation.
>>
>>>
>>> I want to make dummy driver to simulate your case.
>>
>> Sure. I hope the above mentioned steps will help you but in case you need more
>> information, please do let me know.
> 
> I got understood now. :)
> I will test it with dummy driver and will Cc'ed when I send a patch.

Hello Minchan,

Do you have any updates on this ? The V7 of the series still has this limitation.
Did you get a chance to test the driver out ? I am still concerned about how to
handle the struct address_space override problem within the struct page.

- Anshuman

^ permalink raw reply

* [PATCH] virtio: Return correct errno for function init_vq's failure
From: Minfei Huang @ 2016-06-27  2:09 UTC (permalink / raw)
  To: mst; +Cc: Minfei Huang, linux-kernel, Minfei Huang, virtualization

The error number -ENOENT or 0 will be returned, if we can not allocate
more memory in function init_vq. If host can support multiple virtual
queues, and we fails to allocate necessary memory structures for vq,
kernel may crash due to incorrect returning.

To fix it, kernel will return correct value in init_vq.

Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Signed-off-by: Minfei Huang <minfei.hmf@alibaba-inc.com>
---
 drivers/block/virtio_blk.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42758b5..40ecb2b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -393,11 +393,10 @@ static int init_vq(struct virtio_blk *vblk)
 	if (err)
 		num_vqs = 1;
 
+	err = -ENOMEM;
 	vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
-	if (!vblk->vqs) {
-		err = -ENOMEM;
+	if (!vblk->vqs)
 		goto out;
-	}
 
 	names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
 	if (!names)
-- 
2.7.4 (Apple Git-66)

^ permalink raw reply related

* Re: DMA from stack in virtio_net and virtio_console
From: Michael S. Tsirkin @ 2016-06-26  2:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Amit Shah, Network Development, Alex Williamson,
	Linux Virtualization
In-Reply-To: <CALCETrXLQ7ajLaB4FENsg8XmRtjko10BqXR+KNcL7vuVeDrS8w@mail.gmail.com>

On Fri, Jun 24, 2016 at 06:45:50PM -0700, Andy Lutomirski wrote:
> virtio_net does DMA on the stack when it calls sg_init_one in
> virtio_set_queues, virtnet_vlan_rx_add_vid, and
> virtnet_vlan_rx_kill_vid.  Michael, I think these are examples we
> missed somehow when fixing these issues earlier on.
> 
> virtio_console does it here:
> 
> sg_init_one(sg, &cpkt, sizeof(cpkt));
> 
> This will cause problems on some architectures (Xen at the very least,
> and it'll cause more subtle problems on other architectures if they
> start using the DMA API), and it will blow up horribly with virtually
> mapped stacks.
> 
> Could you fix these, please?
> 
> Thanks,
> Andy

Will do, thanks for the report.

-- 
MST

^ permalink raw reply

* DMA from stack in virtio_net and virtio_console
From: Andy Lutomirski @ 2016-06-25  1:45 UTC (permalink / raw)
  To: Michael S. Tsirkin, Network Development, Amit Shah,
	Linux Virtualization, Jason Wang, Alex Williamson

virtio_net does DMA on the stack when it calls sg_init_one in
virtio_set_queues, virtnet_vlan_rx_add_vid, and
virtnet_vlan_rx_kill_vid.  Michael, I think these are examples we
missed somehow when fixing these issues earlier on.

virtio_console does it here:

sg_init_one(sg, &cpkt, sizeof(cpkt));

This will cause problems on some architectures (Xen at the very least,
and it'll cause more subtle problems on other architectures if they
start using the DMA API), and it will blow up horribly with virtually
mapped stacks.

Could you fix these, please?

Thanks,
Andy

^ permalink raw reply

* [PATCH V2 3/3] vhost: device IOTLB API
From: Jason Wang @ 2016-06-23  6:04 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel; +Cc: vkaplans, wexu, peterx
In-Reply-To: <1466661872-29165-1-git-send-email-jasowang@redhat.com>

This patch tries to implement an device IOTLB for vhost. This could be
used with for co-operation with userspace(qemu) implementation of DMA
remapping.

The idea is simple, cache the translation in a software device IOTLB
(which was implemented as interval tree) in vhost and use vhost_net
file descriptor for reporting IOTLB miss and IOTLB
update/invalidation. When vhost meets an IOTLB miss, the fault
address, size and access could be read from the file. After userspace
finishes the translation, it write the translated address to the
vhost_net file to update the device IOTLB.

When device IOTLB (VHOST_F_DEVICE_IOTLB) is enabled all vq address
set by ioctl were treated as iova instead of virtual address and the
accessing could only be done through IOTLB instead of direct
userspace memory access. Before each rounds or vq processing, all vq
metadata were prefetched in device IOTLB to make sure no translation
fault happens during vq processing.

In most cases, virtqueue were mapped contiguous even in virtual
address. So the IOTLB translation for virtqueue itself maybe a little
bit slower. We can add fast path on top of this patch.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c        |  50 +++-
 drivers/vhost/vhost.c      | 634 ++++++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/vhost.h      |  32 ++-
 include/uapi/linux/vhost.h |  28 ++
 4 files changed, 697 insertions(+), 47 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index fc66956..4ccebad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 enum {
 	VHOST_NET_FEATURES = VHOST_FEATURES |
 			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
-			 (1ULL << VIRTIO_NET_F_MRG_RXBUF)
+			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+			 (1ULL << VHOST_F_DEVICE_IOTLB)
 };
 
 enum {
@@ -334,7 +335,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 {
 	unsigned long uninitialized_var(endtime);
 	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				    out_num, in_num, NULL, NULL);
+				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
 		preempt_disable();
@@ -344,7 +345,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 			cpu_relax_lowlatency();
 		preempt_enable();
 		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					out_num, in_num, NULL, NULL);
+				      out_num, in_num, NULL, NULL);
 	}
 
 	return r;
@@ -377,6 +378,9 @@ static void handle_tx(struct vhost_net *net)
 	if (!sock)
 		goto out;
 
+	if (!vq_iotlb_prefetch(vq))
+		goto out;
+
 	vhost_disable_notify(&net->dev, vq);
 
 	hdr_size = nvq->vhost_hlen;
@@ -638,6 +642,10 @@ static void handle_rx(struct vhost_net *net)
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
+
+	if (!vq_iotlb_prefetch(vq))
+		goto out;
+
 	vhost_disable_notify(&net->dev, vq);
 	vhost_net_disable_vq(net, vq);
 
@@ -1086,6 +1094,11 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 		mutex_unlock(&n->dev.mutex);
 		return -EFAULT;
 	}
+	if ((features & (1ULL << VHOST_F_DEVICE_IOTLB))) {
+		if (vhost_init_device_iotlb(&n->dev, true))
+			return -EFAULT;
+	}
+
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].vq.mutex);
 		n->vqs[i].vq.acked_features = features;
@@ -1168,9 +1181,40 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
 }
 #endif
 
+static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct vhost_net *n = file->private_data;
+	struct vhost_dev *dev = &n->dev;
+	int noblock = file->f_flags & O_NONBLOCK;
+
+	return vhost_chr_read_iter(dev, to, noblock);
+}
+
+static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
+					struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct vhost_net *n = file->private_data;
+	struct vhost_dev *dev = &n->dev;
+
+	return vhost_chr_write_iter(dev, from);
+}
+
+static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait)
+{
+	struct vhost_net *n = file->private_data;
+	struct vhost_dev *dev = &n->dev;
+
+	return vhost_chr_poll(file, dev, wait);
+}
+
 static const struct file_operations vhost_net_fops = {
 	.owner          = THIS_MODULE,
 	.release        = vhost_net_release,
+	.read_iter      = vhost_net_chr_read_iter,
+	.write_iter     = vhost_net_chr_write_iter,
+	.poll           = vhost_net_chr_poll,
 	.unlocked_ioctl = vhost_net_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = vhost_net_compat_ioctl,
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 166e779..11d2f55 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -35,6 +35,10 @@ static ushort max_mem_regions = 64;
 module_param(max_mem_regions, ushort, 0444);
 MODULE_PARM_DESC(max_mem_regions,
 	"Maximum number of memory regions in memory map. (default: 64)");
+static int max_iotlb_entries = 2048;
+module_param(max_iotlb_entries, int, 0444);
+MODULE_PARM_DESC(max_iotlb_entries,
+	"Maximum number of iotlb entries. (default: 2048)");
 
 enum {
 	VHOST_MEMORY_F_LOG = 0x1,
@@ -309,6 +313,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vhost_disable_cross_endian(vq);
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
+	vq->iotlb = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -413,9 +418,14 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->log_ctx = NULL;
 	dev->log_file = NULL;
 	dev->umem = NULL;
+	dev->iotlb = NULL;
 	dev->mm = NULL;
 	spin_lock_init(&dev->work_lock);
 	INIT_LIST_HEAD(&dev->work_list);
+	init_waitqueue_head(&dev->wait);
+	INIT_LIST_HEAD(&dev->read_list);
+	INIT_LIST_HEAD(&dev->pending_list);
+	spin_lock_init(&dev->iotlb_lock);
 	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
@@ -563,6 +573,15 @@ void vhost_dev_stop(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_stop);
 
+static void vhost_umem_free(struct vhost_umem *umem,
+			    struct vhost_umem_node *node)
+{
+	vhost_umem_interval_tree_remove(node, &umem->umem_tree);
+	list_del(&node->link);
+	kfree(node);
+	umem->numem--;
+}
+
 static void vhost_umem_clean(struct vhost_umem *umem)
 {
 	struct vhost_umem_node *node, *tmp;
@@ -570,14 +589,31 @@ static void vhost_umem_clean(struct vhost_umem *umem)
 	if (!umem)
 		return;
 
-	list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
-		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
-		list_del(&node->link);
-		kvfree(node);
-	}
+	list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
+		vhost_umem_free(umem, node);
+
 	kvfree(umem);
 }
 
+static void vhost_clear_msg(struct vhost_dev *dev)
+{
+	struct vhost_msg_node *node, *n;
+
+	spin_lock(&dev->iotlb_lock);
+
+	list_for_each_entry_safe(node, n, &dev->read_list, node) {
+		list_del(&node->node);
+		kfree(node);
+	}
+
+	list_for_each_entry_safe(node, n, &dev->pending_list, node) {
+		list_del(&node->node);
+		kfree(node);
+	}
+
+	spin_unlock(&dev->iotlb_lock);
+}
+
 /* Caller should have device mutex if and only if locked is set */
 void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 {
@@ -606,6 +642,10 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 	/* No one will access memory at this point */
 	vhost_umem_clean(dev->umem);
 	dev->umem = NULL;
+	vhost_umem_clean(dev->iotlb);
+	dev->iotlb = NULL;
+	vhost_clear_msg(dev);
+	wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
 	WARN_ON(!list_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
@@ -681,28 +721,379 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
 	return 1;
 }
 
-#define vhost_put_user(vq, x, ptr)  __put_user(x, ptr)
+static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
+			  struct iovec iov[], int iov_size, int access);
 
 static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
 			      const void *from, unsigned size)
 {
-	return __copy_to_user(to, from, size);
-}
+	int ret;
 
-#define vhost_get_user(vq, x, ptr) __get_user(x, ptr)
+	if (!vq->iotlb)
+		return __copy_to_user(to, from, size);
+	else {
+		/* This function should be called after iotlb
+		 * prefetch, which means we're sure that all vq
+		 * could be access through iotlb. So -EAGAIN should
+		 * not happen in this case.
+		 */
+		/* TODO: more fast path */
+		struct iov_iter t;
+		ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
+				     ARRAY_SIZE(vq->iotlb_iov),
+				     VHOST_ACCESS_WO);
+		if (ret < 0)
+			goto out;
+		iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
+		ret = copy_to_iter(from, size, &t);
+		if (ret == size)
+			ret = 0;
+	}
+out:
+	return ret;
+}
 
 static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
 				void *from, unsigned size)
 {
-	return __copy_from_user(to, from, size);
+	int ret;
+
+	if (!vq->iotlb)
+		return __copy_from_user(to, from, size);
+	else {
+		/* This function should be called after iotlb
+		 * prefetch, which means we're sure that vq
+		 * could be access through iotlb. So -EAGAIN should
+		 * not happen in this case.
+		 */
+		/* TODO: more fast path */
+		struct iov_iter f;
+		ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
+				     ARRAY_SIZE(vq->iotlb_iov),
+				     VHOST_ACCESS_RO);
+		if (ret < 0) {
+			vq_err(vq, "IOTLB translation failure: uaddr "
+			       "%p size 0x%llx\n", from,
+			       (unsigned long long) size);
+			goto out;
+		}
+		iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);
+		ret = copy_from_iter(to, size, &f);
+		if (ret == size)
+			ret = 0;
+	}
+
+out:
+	return ret;
+}
+
+static void __user *__vhost_get_user(struct vhost_virtqueue *vq,
+				     void *addr, unsigned size)
+{
+	int ret;
+
+	/* This function should be called after iotlb
+	 * prefetch, which means we're sure that vq
+	 * could be access through iotlb. So -EAGAIN should
+	 * not happen in this case.
+	 */
+	/* TODO: more fast path */
+	ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
+			     ARRAY_SIZE(vq->iotlb_iov),
+			     VHOST_ACCESS_RO);
+	if (ret < 0) {
+		vq_err(vq, "IOTLB translation failure: uaddr "
+			"%p size 0x%llx\n", addr,
+			(unsigned long long) size);
+		return NULL;
+	}
+
+	if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
+		vq_err(vq, "Non atomic userspace memory access: uaddr "
+			"%p size 0x%llx\n", addr,
+			(unsigned long long) size);
+		return NULL;
+	}
+
+	return vq->iotlb_iov[0].iov_base;
+}
+
+#define vhost_put_user(vq, x, ptr) \
+({ \
+	int ret = -EFAULT; \
+	if (!vq->iotlb) { \
+		ret = __put_user(x, ptr); \
+	} else { \
+		__typeof__(ptr) to = \
+			(__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \
+		if (to != NULL) \
+			ret = __put_user(x, to); \
+		else \
+			ret = -EFAULT;	\
+	} \
+	ret; \
+})
+
+#define vhost_get_user(vq, x, ptr) \
+({ \
+	int ret; \
+	if (!vq->iotlb) { \
+		ret = __get_user(x, ptr); \
+	} else { \
+		__typeof__(ptr) from = \
+			(__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \
+		if (from != NULL) \
+			ret = __get_user(x, from); \
+		else \
+			ret = -EFAULT; \
+	} \
+	ret; \
+})
+
+static void vhost_dev_lock_vqs(struct vhost_dev *d)
+{
+	int i = 0;
+	for (i = 0; i < d->nvqs; ++i)
+		mutex_lock(&d->vqs[i]->mutex);
+}
+
+static void vhost_dev_unlock_vqs(struct vhost_dev *d)
+{
+	int i = 0;
+	for (i = 0; i < d->nvqs; ++i)
+		mutex_unlock(&d->vqs[i]->mutex);
+}
+
+static int vhost_new_umem_range(struct vhost_umem *umem,
+				u64 start, u64 size, u64 end,
+				u64 userspace_addr, int perm)
+{
+	struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);
+
+	if (!node)
+		return -ENOMEM;
+
+	if (umem->numem == max_iotlb_entries) {
+		tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
+		vhost_umem_free(umem, tmp);
+	}
+
+	node->start = start;
+	node->size = size;
+	node->last = end;
+	node->userspace_addr = userspace_addr;
+	node->perm = perm;
+	INIT_LIST_HEAD(&node->link);
+	list_add_tail(&node->link, &umem->umem_list);
+	vhost_umem_interval_tree_insert(node, &umem->umem_tree);
+	umem->numem++;
+
+	return 0;
+}
+
+static void vhost_del_umem_range(struct vhost_umem *umem,
+				 u64 start, u64 end)
+{
+	struct vhost_umem_node *node;
+
+	while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+							   start, end)))
+		vhost_umem_free(umem, node);
+}
+
+static void vhost_iotlb_notify_vq(struct vhost_dev *d,
+				  struct vhost_iotlb_msg *msg)
+{
+	struct vhost_msg_node *node, *n;
+
+	spin_lock(&d->iotlb_lock);
+
+	list_for_each_entry_safe(node, n, &d->pending_list, node) {
+		struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
+		if (msg->iova <= vq_msg->iova &&
+		    msg->iova + msg->size - 1 > vq_msg->iova &&
+		    vq_msg->type == VHOST_IOTLB_MISS) {
+			vhost_poll_queue(&node->vq->poll);
+			list_del(&node->node);
+			kfree(node);
+		}
+	}
+
+	spin_unlock(&d->iotlb_lock);
+}
+
+static int umem_access_ok(u64 uaddr, u64 size, int access)
+{
+	if ((access & VHOST_ACCESS_RO) &&
+	    !access_ok(VERIFY_READ, uaddr, size))
+		return -EFAULT;
+	if ((access & VHOST_ACCESS_WO) &&
+	    !access_ok(VERIFY_WRITE, uaddr, size))
+		return -EFAULT;
+	return 0;
+}
+
+int vhost_process_iotlb_msg(struct vhost_dev *dev,
+			    struct vhost_iotlb_msg *msg)
+{
+	int ret = 0;
+
+	vhost_dev_lock_vqs(dev);
+	switch (msg->type) {
+	case VHOST_IOTLB_UPDATE:
+		if (!dev->iotlb) {
+			ret = -EFAULT;
+			break;
+		}
+		if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
+			ret = -EFAULT;
+			break;
+		}
+		if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
+					 msg->iova + msg->size - 1,
+					 msg->uaddr, msg->perm)) {
+			ret = -ENOMEM;
+			break;
+		}
+		vhost_iotlb_notify_vq(dev, msg);
+		break;
+	case VHOST_IOTLB_INVALIDATE:
+		vhost_del_umem_range(dev->iotlb, msg->iova,
+				     msg->iova + msg->size - 1);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	vhost_dev_unlock_vqs(dev);
+	return ret;
+}
+ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
+			     struct iov_iter *from)
+{
+	struct vhost_msg_node node;
+	unsigned size = sizeof(struct vhost_msg);
+	size_t ret;
+	int err;
+
+	if (iov_iter_count(from) < size)
+		return 0;
+	ret = copy_from_iter(&node.msg, size, from);
+	if (ret != size)
+		goto done;
+
+	switch (node.msg.type) {
+	case VHOST_IOTLB_MSG:
+		err = vhost_process_iotlb_msg(dev, &node.msg.iotlb);
+		if (err)
+			ret = err;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+done:
+	return ret;
+}
+EXPORT_SYMBOL(vhost_chr_write_iter);
+
+unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,
+			    poll_table *wait)
+{
+	unsigned int mask = 0;
+
+	poll_wait(file, &dev->wait, wait);
+
+	if (!list_empty(&dev->read_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+EXPORT_SYMBOL(vhost_chr_poll);
+
+ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
+			    int noblock)
+{
+	DEFINE_WAIT(wait);
+	struct vhost_msg_node *node;
+	ssize_t ret = 0;
+	unsigned size = sizeof(struct vhost_msg);
+
+	if (iov_iter_count(to) < size)
+		return 0;
+
+	while (1) {
+		if (!noblock)
+			prepare_to_wait(&dev->wait, &wait,
+					TASK_INTERRUPTIBLE);
+
+		node = vhost_dequeue_msg(dev, &dev->read_list);
+		if (node)
+			break;
+		if (noblock) {
+			ret = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		if (!dev->iotlb) {
+			ret = -EBADFD;
+			break;
+		}
+
+		schedule();
+	}
+
+	if (!noblock)
+		finish_wait(&dev->wait, &wait);
+
+	if (node) {
+		ret = copy_to_iter(&node->msg, size, to);
+
+		if (ret != size || node->msg.type != VHOST_IOTLB_MISS) {
+			kfree(node);
+			return ret;
+		}
+
+		vhost_enqueue_msg(dev, &dev->pending_list, node);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
+
+static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
+{
+	struct vhost_dev *dev = vq->dev;
+	struct vhost_msg_node *node;
+	struct vhost_iotlb_msg *msg;
+
+	node = vhost_new_msg(vq, VHOST_IOTLB_MISS);
+	if (!node)
+		return -ENOMEM;
+
+	msg = &node->msg.iotlb;
+	msg->type = VHOST_IOTLB_MISS;
+	msg->iova = iova;
+	msg->perm = access;
+
+	vhost_enqueue_msg(dev, &dev->read_list, node);
+
+	return 0;
 }
 
 static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 			struct vring_desc __user *desc,
 			struct vring_avail __user *avail,
 			struct vring_used __user *used)
+
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
 	return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
 	       access_ok(VERIFY_READ, avail,
 			 sizeof *avail + num * sizeof *avail->ring + s) &&
@@ -710,6 +1101,54 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 			sizeof *used + num * sizeof *used->ring + s);
 }
 
+static int iotlb_access_ok(struct vhost_virtqueue *vq,
+			   int access, u64 addr, u64 len)
+{
+	const struct vhost_umem_node *node;
+	struct vhost_umem *umem = vq->iotlb;
+	u64 s = 0, size;
+
+	while (len > s) {
+		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+							   addr,
+							   addr + len - 1);
+		if (node == NULL || node->start > addr) {
+			vhost_iotlb_miss(vq, addr, access);
+			return false;
+		} else if (!(node->perm & access)) {
+			/* Report the possible access violation by
+			 * request another translation from userspace.
+			 */
+			return false;
+		}
+
+		size = node->size - addr + node->start;
+		s += size;
+		addr += size;
+	}
+
+	return true;
+}
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+	unsigned int num = vq->num;
+
+	if (!vq->iotlb)
+		return 1;
+
+	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof *vq->desc) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
+			       sizeof *vq->avail +
+			       num * sizeof *vq->avail->ring + s) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
+			       sizeof *vq->used +
+			       num * sizeof *vq->used->ring + s);
+}
+EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
+
 /* Can we log writes? */
 /* Caller should have device mutex but not vq mutex */
 int vhost_log_access_ok(struct vhost_dev *dev)
@@ -736,16 +1175,35 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
 /* Caller should have vq mutex and device mutex */
 int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 {
+	if (vq->iotlb) {
+		/* When device IOTLB was used, the access validation
+		 * will be validated during prefetching.
+		 */
+		return 1;
+	}
 	return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) &&
 		vq_log_access_ok(vq, vq->log_base);
 }
 EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
 
+static struct vhost_umem *vhost_umem_alloc(void)
+{
+	struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem));
+
+	if (!umem)
+		return NULL;
+
+	umem->umem_tree = RB_ROOT;
+	umem->numem = 0;
+	INIT_LIST_HEAD(&umem->umem_list);
+
+	return umem;
+}
+
 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 {
 	struct vhost_memory mem, *newmem;
 	struct vhost_memory_region *region;
-	struct vhost_umem_node *node;
 	struct vhost_umem *newumem, *oldumem;
 	unsigned long size = offsetof(struct vhost_memory, regions);
 	int i;
@@ -767,28 +1225,23 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 		return -EFAULT;
 	}
 
-	newumem = vhost_kvzalloc(sizeof(*newumem));
+	newumem = vhost_umem_alloc();
 	if (!newumem) {
 		kvfree(newmem);
 		return -ENOMEM;
 	}
 
-	newumem->umem_tree = RB_ROOT;
-	INIT_LIST_HEAD(&newumem->umem_list);
-
 	for (region = newmem->regions;
 	     region < newmem->regions + mem.nregions;
 	     region++) {
-		node = vhost_kvzalloc(sizeof(*node));
-		if (!node)
+		if (vhost_new_umem_range(newumem,
+					 region->guest_phys_addr,
+					 region->memory_size,
+					 region->guest_phys_addr +
+					 region->memory_size - 1,
+					 region->userspace_addr,
+					 VHOST_ACCESS_RW))
 			goto err;
-		node->start = region->guest_phys_addr;
-		node->size = region->memory_size;
-		node->last = node->start + node->size - 1;
-		node->userspace_addr = region->userspace_addr;
-		INIT_LIST_HEAD(&node->link);
-		list_add_tail(&node->link, &newumem->umem_list);
-		vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
 	}
 
 	if (!memory_access_ok(d, newumem, 0))
@@ -1032,6 +1485,30 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
 }
 EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
 
+int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
+{
+	struct vhost_umem *niotlb, *oiotlb;
+	int i;
+
+	niotlb = vhost_umem_alloc();
+	if (!niotlb)
+		return -ENOMEM;
+
+	oiotlb = d->iotlb;
+	d->iotlb = niotlb;
+
+	for (i = 0; i < d->nvqs; ++i) {
+		mutex_lock(&d->vqs[i]->mutex);
+		d->vqs[i]->iotlb = niotlb;
+		mutex_unlock(&d->vqs[i]->mutex);
+	}
+
+	vhost_umem_clean(oiotlb);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
+
 /* Caller must have device mutex */
 long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 {
@@ -1246,15 +1723,20 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 	if (r)
 		goto err;
 	vq->signalled_used_valid = false;
-	if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {
+	if (!vq->iotlb &&
+	    !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {
 		r = -EFAULT;
 		goto err;
 	}
 	r = vhost_get_user(vq, last_used_idx, &vq->used->idx);
-	if (r)
+	if (r) {
+		vq_err(vq, "Can't access used idx at %p\n",
+		       &vq->used->idx);
 		goto err;
+	}
 	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
 	return 0;
+
 err:
 	vq->is_le = is_le;
 	return r;
@@ -1262,10 +1744,11 @@ err:
 EXPORT_SYMBOL_GPL(vhost_vq_init_access);
 
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
-			  struct iovec iov[], int iov_size)
+			  struct iovec iov[], int iov_size, int access)
 {
 	const struct vhost_umem_node *node;
-	struct vhost_umem *umem = vq->umem;
+	struct vhost_dev *dev = vq->dev;
+	struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
 	struct iovec *_iov;
 	u64 s = 0;
 	int ret = 0;
@@ -1276,12 +1759,21 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 			ret = -ENOBUFS;
 			break;
 		}
+
 		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
 							addr, addr + len - 1);
 		if (node == NULL || node->start > addr) {
-			ret = -EFAULT;
+			if (umem != dev->iotlb) {
+				ret = -EFAULT;
+				break;
+			}
+			ret = -EAGAIN;
+			break;
+		} else if (!(node->perm & access)) {
+			ret = -EPERM;
 			break;
 		}
+
 		_iov = iov + ret;
 		size = node->size - addr + node->start;
 		_iov->iov_len = min((u64)len - s, size);
@@ -1292,6 +1784,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 		++ret;
 	}
 
+	if (ret == -EAGAIN)
+		vhost_iotlb_miss(vq, addr, access);
 	return ret;
 }
 
@@ -1326,7 +1820,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
 	unsigned int i = 0, count, found = 0;
 	u32 len = vhost32_to_cpu(vq, indirect->len);
 	struct iov_iter from;
-	int ret;
+	int ret, access;
 
 	/* Sanity check */
 	if (unlikely(len % sizeof desc)) {
@@ -1338,9 +1832,10 @@ static int get_indirect(struct vhost_virtqueue *vq,
 	}
 
 	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
-			     UIO_MAXIOV);
+			     UIO_MAXIOV, VHOST_ACCESS_RO);
 	if (unlikely(ret < 0)) {
-		vq_err(vq, "Translation failure %d in indirect.\n", ret);
+		if (ret != -EAGAIN)
+			vq_err(vq, "Translation failure %d in indirect.\n", ret);
 		return ret;
 	}
 	iov_iter_init(&from, READ, vq->indirect, ret, len);
@@ -1378,16 +1873,22 @@ static int get_indirect(struct vhost_virtqueue *vq,
 			return -EINVAL;
 		}
 
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+			access = VHOST_ACCESS_WO;
+		else
+			access = VHOST_ACCESS_RO;
+
 		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
 				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
-				     iov_size - iov_count);
+				     iov_size - iov_count, access);
 		if (unlikely(ret < 0)) {
-			vq_err(vq, "Translation failure %d indirect idx %d\n",
-			       ret, i);
+			if (ret != -EAGAIN)
+				vq_err(vq, "Translation failure %d indirect idx %d\n",
+					ret, i);
 			return ret;
 		}
 		/* If this is an input descriptor, increment that count. */
-		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
+		if (access == VHOST_ACCESS_WO) {
 			*in_num += ret;
 			if (unlikely(log)) {
 				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
@@ -1426,7 +1927,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	u16 last_avail_idx;
 	__virtio16 avail_idx;
 	__virtio16 ring_head;
-	int ret;
+	int ret, access;
 
 	/* Check it isn't doing very strange things with descriptor numbers. */
 	last_avail_idx = vq->last_avail_idx;
@@ -1500,22 +2001,28 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 					   out_num, in_num,
 					   log, log_num, &desc);
 			if (unlikely(ret < 0)) {
-				vq_err(vq, "Failure detected "
-				       "in indirect descriptor at idx %d\n", i);
+				if (ret != -EAGAIN)
+					vq_err(vq, "Failure detected "
+						"in indirect descriptor at idx %d\n", i);
 				return ret;
 			}
 			continue;
 		}
 
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+			access = VHOST_ACCESS_WO;
+		else
+			access = VHOST_ACCESS_RO;
 		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
 				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
-				     iov_size - iov_count);
+				     iov_size - iov_count, access);
 		if (unlikely(ret < 0)) {
-			vq_err(vq, "Translation failure %d descriptor idx %d\n",
-			       ret, i);
+			if (ret != -EAGAIN)
+				vq_err(vq, "Translation failure %d descriptor idx %d\n",
+					ret, i);
 			return ret;
 		}
-		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
+		if (access == VHOST_ACCESS_WO) {
 			/* If this is an input descriptor,
 			 * increment that count. */
 			*in_num += ret;
@@ -1781,6 +2288,47 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(vhost_disable_notify);
 
+/* Create a new message. */
+struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
+{
+	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
+	if (!node)
+		return NULL;
+	node->vq = vq;
+	node->msg.type = type;
+	return node;
+}
+EXPORT_SYMBOL_GPL(vhost_new_msg);
+
+void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
+		       struct vhost_msg_node *node)
+{
+	spin_lock(&dev->iotlb_lock);
+	list_add_tail(&node->node, head);
+	spin_unlock(&dev->iotlb_lock);
+
+	wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
+}
+EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
+
+struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
+					 struct list_head *head)
+{
+	struct vhost_msg_node *node = NULL;
+
+	spin_lock(&dev->iotlb_lock);
+	if (!list_empty(head)) {
+		node = list_first_entry(head, struct vhost_msg_node,
+					node);
+		list_del(&node->node);
+	}
+	spin_unlock(&dev->iotlb_lock);
+
+	return node;
+}
+EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
+
+
 static int __init vhost_init(void)
 {
 	return 0;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b93b6a0..8601fc6 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -63,13 +63,15 @@ struct vhost_umem_node {
 	__u64 last;
 	__u64 size;
 	__u64 userspace_addr;
-	__u64 flags_padding;
+	__u32 perm;
+	__u32 flags_padding;
 	__u64 __subtree_last;
 };
 
 struct vhost_umem {
 	struct rb_root umem_tree;
 	struct list_head umem_list;
+	int numem;
 };
 
 /* The virtqueue structure describes a queue attached to a device. */
@@ -117,10 +119,12 @@ struct vhost_virtqueue {
 	u64 log_addr;
 
 	struct iovec iov[UIO_MAXIOV];
+	struct iovec iotlb_iov[64];
 	struct iovec *indirect;
 	struct vring_used_elem *heads;
 	/* Protected by virtqueue mutex. */
 	struct vhost_umem *umem;
+	struct vhost_umem *iotlb;
 	void *private_data;
 	u64 acked_features;
 	/* Log write descriptors */
@@ -137,6 +141,12 @@ struct vhost_virtqueue {
 	u32 busyloop_timeout;
 };
 
+struct vhost_msg_node {
+  struct vhost_msg msg;
+  struct vhost_virtqueue *vq;
+  struct list_head node;
+};
+
 struct vhost_dev {
 	struct mm_struct *mm;
 	struct mutex mutex;
@@ -148,6 +158,11 @@ struct vhost_dev {
 	struct list_head work_list;
 	struct task_struct *worker;
 	struct vhost_umem *umem;
+	struct vhost_umem *iotlb;
+	spinlock_t iotlb_lock;
+	struct list_head read_list;
+	struct list_head pending_list;
+	wait_queue_head_t wait;
 };
 
 void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
@@ -184,6 +199,21 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
+
+struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
+void vhost_enqueue_msg(struct vhost_dev *dev,
+		       struct list_head *head,
+		       struct vhost_msg_node *node);
+struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
+					 struct list_head *head);
+unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,
+			    poll_table *wait);
+ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
+			    int noblock);
+ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
+			     struct iov_iter *from);
+int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 61a8777..8cb0a65 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -47,6 +47,32 @@ struct vhost_vring_addr {
 	__u64 log_guest_addr;
 };
 
+/* no alignment requirement */
+struct vhost_iotlb_msg {
+	__u64 iova;
+	__u64 size;
+	__u64 uaddr;
+#define VHOST_ACCESS_RO      0x1
+#define VHOST_ACCESS_WO      0x2
+#define VHOST_ACCESS_RW      0x3
+	__u8 perm;
+#define VHOST_IOTLB_MISS           1
+#define VHOST_IOTLB_UPDATE         2
+#define VHOST_IOTLB_INVALIDATE     3
+#define VHOST_IOTLB_ACCESS_FAIL    4
+	__u8 type;
+};
+
+#define VHOST_IOTLB_MSG 0x1
+
+struct vhost_msg {
+	int type;
+	union {
+		struct vhost_iotlb_msg iotlb;
+		__u8 padding[64];
+	};
+};
+
 struct vhost_memory_region {
 	__u64 guest_phys_addr;
 	__u64 memory_size; /* bytes */
@@ -146,6 +172,8 @@ struct vhost_memory {
 #define VHOST_F_LOG_ALL 26
 /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
 #define VHOST_NET_F_VIRTIO_NET_HDR 27
+/* Vhost have device IOTLB */
+#define VHOST_F_DEVICE_IOTLB 63
 
 /* VHOST_SCSI specific definitions */
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V2 2/3] vhost: convert pre sorted vhost memory array to interval tree
From: Jason Wang @ 2016-06-23  6:04 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel; +Cc: vkaplans, wexu, peterx
In-Reply-To: <1466661872-29165-1-git-send-email-jasowang@redhat.com>

Current pre-sorted memory region array has some limitations for future
device IOTLB conversion:

1) need extra work for adding and removing a single region, and it's
   expected to be slow because of sorting or memory re-allocation.
2) need extra work of removing a large range which may intersect
   several regions with different size.
3) need trick for a replacement policy like LRU

To overcome the above shortcomings, this patch convert it to interval
tree which can easily address the above issue with almost no extra
work.

The patch could be used for:

- Extend the current API and only let the userspace to send diffs of
  memory table.
- Simplify Device IOTLB implementation.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   |   8 +--
 drivers/vhost/vhost.c | 182 ++++++++++++++++++++++++++++----------------------
 drivers/vhost/vhost.h |  27 ++++++--
 3 files changed, 128 insertions(+), 89 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1d3e45f..fc66956 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1038,20 +1038,20 @@ static long vhost_net_reset_owner(struct vhost_net *n)
 	struct socket *tx_sock = NULL;
 	struct socket *rx_sock = NULL;
 	long err;
-	struct vhost_memory *memory;
+	struct vhost_umem *umem;
 
 	mutex_lock(&n->dev.mutex);
 	err = vhost_dev_check_owner(&n->dev);
 	if (err)
 		goto done;
-	memory = vhost_dev_reset_owner_prepare();
-	if (!memory) {
+	umem = vhost_dev_reset_owner_prepare();
+	if (!umem) {
 		err = -ENOMEM;
 		goto done;
 	}
 	vhost_net_stop(n, &tx_sock, &rx_sock);
 	vhost_net_flush(n);
-	vhost_dev_reset_owner(&n->dev, memory);
+	vhost_dev_reset_owner(&n->dev, umem);
 	vhost_net_vq_reset(n);
 done:
 	mutex_unlock(&n->dev.mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9f2a63a..166e779 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -27,6 +27,7 @@
 #include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <linux/interval_tree_generic.h>
 
 #include "vhost.h"
 
@@ -42,6 +43,10 @@ enum {
 #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
+INTERVAL_TREE_DEFINE(struct vhost_umem_node,
+		     rb, __u64, __subtree_last,
+		     START, LAST, , vhost_umem_interval_tree);
+
 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
 static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
 {
@@ -300,10 +305,10 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
-	vq->memory = NULL;
 	vhost_reset_is_le(vq);
 	vhost_disable_cross_endian(vq);
 	vq->busyloop_timeout = 0;
+	vq->umem = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -407,7 +412,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	mutex_init(&dev->mutex);
 	dev->log_ctx = NULL;
 	dev->log_file = NULL;
-	dev->memory = NULL;
+	dev->umem = NULL;
 	dev->mm = NULL;
 	spin_lock_init(&dev->work_lock);
 	INIT_LIST_HEAD(&dev->work_list);
@@ -512,27 +517,36 @@ err_mm:
 }
 EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
 
-struct vhost_memory *vhost_dev_reset_owner_prepare(void)
+static void *vhost_kvzalloc(unsigned long size)
 {
-	return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
+	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+
+	if (!n)
+		n = vzalloc(size);
+	return n;
+}
+
+struct vhost_umem *vhost_dev_reset_owner_prepare(void)
+{
+	return vhost_kvzalloc(sizeof(struct vhost_umem));
 }
 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
 
 /* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
 {
 	int i;
 
 	vhost_dev_cleanup(dev, true);
 
 	/* Restore memory to default empty mapping. */
-	memory->nregions = 0;
-	dev->memory = memory;
+	INIT_LIST_HEAD(&umem->umem_list);
+	dev->umem = umem;
 	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 	 * VQs aren't running.
 	 */
 	for (i = 0; i < dev->nvqs; ++i)
-		dev->vqs[i]->memory = memory;
+		dev->vqs[i]->umem = umem;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
 
@@ -549,6 +563,21 @@ void vhost_dev_stop(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_stop);
 
+static void vhost_umem_clean(struct vhost_umem *umem)
+{
+	struct vhost_umem_node *node, *tmp;
+
+	if (!umem)
+		return;
+
+	list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
+		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
+		list_del(&node->link);
+		kvfree(node);
+	}
+	kvfree(umem);
+}
+
 /* Caller should have device mutex if and only if locked is set */
 void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 {
@@ -575,8 +604,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 		fput(dev->log_file);
 	dev->log_file = NULL;
 	/* No one will access memory at this point */
-	kvfree(dev->memory);
-	dev->memory = NULL;
+	vhost_umem_clean(dev->umem);
+	dev->umem = NULL;
 	WARN_ON(!list_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
@@ -602,25 +631,25 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 }
 
 /* Caller should have vq mutex and device mutex. */
-static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
+static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
 			       int log_all)
 {
-	int i;
+	struct vhost_umem_node *node;
 
-	if (!mem)
+	if (!umem)
 		return 0;
 
-	for (i = 0; i < mem->nregions; ++i) {
-		struct vhost_memory_region *m = mem->regions + i;
-		unsigned long a = m->userspace_addr;
-		if (m->memory_size > ULONG_MAX)
+	list_for_each_entry(node, &umem->umem_list, link) {
+		unsigned long a = node->userspace_addr;
+
+		if (node->size > ULONG_MAX)
 			return 0;
 		else if (!access_ok(VERIFY_WRITE, (void __user *)a,
-				    m->memory_size))
+				    node->size))
 			return 0;
 		else if (log_all && !log_access_ok(log_base,
-						   m->guest_phys_addr,
-						   m->memory_size))
+						   node->start,
+						   node->size))
 			return 0;
 	}
 	return 1;
@@ -628,7 +657,7 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
 
 /* Can we switch to this memory table? */
 /* Caller should have device mutex but not vq mutex */
-static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
+static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
 			    int log_all)
 {
 	int i;
@@ -641,7 +670,8 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
 		/* If ring is inactive, will check when it's enabled. */
 		if (d->vqs[i]->private_data)
-			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
+			ok = vq_memory_access_ok(d->vqs[i]->log_base,
+						 umem, log);
 		else
 			ok = 1;
 		mutex_unlock(&d->vqs[i]->mutex);
@@ -684,7 +714,7 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 /* Caller should have device mutex but not vq mutex */
 int vhost_log_access_ok(struct vhost_dev *dev)
 {
-	return memory_access_ok(dev, dev->memory, 1);
+	return memory_access_ok(dev, dev->umem, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_log_access_ok);
 
@@ -695,7 +725,7 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
-	return vq_memory_access_ok(log_base, vq->memory,
+	return vq_memory_access_ok(log_base, vq->umem,
 				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
 		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
 					sizeof *vq->used +
@@ -711,28 +741,12 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
 
-static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
-{
-	const struct vhost_memory_region *r1 = p1, *r2 = p2;
-	if (r1->guest_phys_addr < r2->guest_phys_addr)
-		return 1;
-	if (r1->guest_phys_addr > r2->guest_phys_addr)
-		return -1;
-	return 0;
-}
-
-static void *vhost_kvzalloc(unsigned long size)
-{
-	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
-
-	if (!n)
-		n = vzalloc(size);
-	return n;
-}
-
 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 {
-	struct vhost_memory mem, *newmem, *oldmem;
+	struct vhost_memory mem, *newmem;
+	struct vhost_memory_region *region;
+	struct vhost_umem_node *node;
+	struct vhost_umem *newumem, *oldumem;
 	unsigned long size = offsetof(struct vhost_memory, regions);
 	int i;
 
@@ -752,24 +766,52 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 		kvfree(newmem);
 		return -EFAULT;
 	}
-	sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
-		vhost_memory_reg_sort_cmp, NULL);
 
-	if (!memory_access_ok(d, newmem, 0)) {
+	newumem = vhost_kvzalloc(sizeof(*newumem));
+	if (!newumem) {
 		kvfree(newmem);
-		return -EFAULT;
+		return -ENOMEM;
 	}
-	oldmem = d->memory;
-	d->memory = newmem;
+
+	newumem->umem_tree = RB_ROOT;
+	INIT_LIST_HEAD(&newumem->umem_list);
+
+	for (region = newmem->regions;
+	     region < newmem->regions + mem.nregions;
+	     region++) {
+		node = vhost_kvzalloc(sizeof(*node));
+		if (!node)
+			goto err;
+		node->start = region->guest_phys_addr;
+		node->size = region->memory_size;
+		node->last = node->start + node->size - 1;
+		node->userspace_addr = region->userspace_addr;
+		INIT_LIST_HEAD(&node->link);
+		list_add_tail(&node->link, &newumem->umem_list);
+		vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
+	}
+
+	if (!memory_access_ok(d, newumem, 0))
+		goto err;
+
+	oldumem = d->umem;
+	d->umem = newumem;
 
 	/* All memory accesses are done under some VQ mutex. */
 	for (i = 0; i < d->nvqs; ++i) {
 		mutex_lock(&d->vqs[i]->mutex);
-		d->vqs[i]->memory = newmem;
+		d->vqs[i]->umem = newumem;
 		mutex_unlock(&d->vqs[i]->mutex);
 	}
-	kvfree(oldmem);
+
+	kvfree(newmem);
+	vhost_umem_clean(oldumem);
 	return 0;
+
+err:
+	vhost_umem_clean(newumem);
+	kvfree(newmem);
+	return -EFAULT;
 }
 
 long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
@@ -1072,28 +1114,6 @@ done:
 }
 EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
 
-static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
-						     __u64 addr, __u32 len)
-{
-	const struct vhost_memory_region *reg;
-	int start = 0, end = mem->nregions;
-
-	while (start < end) {
-		int slot = start + (end - start) / 2;
-		reg = mem->regions + slot;
-		if (addr >= reg->guest_phys_addr)
-			end = slot;
-		else
-			start = slot + 1;
-	}
-
-	reg = mem->regions + start;
-	if (addr >= reg->guest_phys_addr &&
-		reg->guest_phys_addr + reg->memory_size > addr)
-		return reg;
-	return NULL;
-}
-
 /* TODO: This is really inefficient.  We need something like get_user()
  * (instruction directly accesses the data, with an exception table entry
  * returning -EFAULT). See Documentation/x86/exception-tables.txt.
@@ -1244,29 +1264,29 @@ EXPORT_SYMBOL_GPL(vhost_vq_init_access);
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 			  struct iovec iov[], int iov_size)
 {
-	const struct vhost_memory_region *reg;
-	struct vhost_memory *mem;
+	const struct vhost_umem_node *node;
+	struct vhost_umem *umem = vq->umem;
 	struct iovec *_iov;
 	u64 s = 0;
 	int ret = 0;
 
-	mem = vq->memory;
 	while ((u64)len > s) {
 		u64 size;
 		if (unlikely(ret >= iov_size)) {
 			ret = -ENOBUFS;
 			break;
 		}
-		reg = find_region(mem, addr, len);
-		if (unlikely(!reg)) {
+		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+							addr, addr + len - 1);
+		if (node == NULL || node->start > addr) {
 			ret = -EFAULT;
 			break;
 		}
 		_iov = iov + ret;
-		size = reg->memory_size - addr + reg->guest_phys_addr;
+		size = node->size - addr + node->start;
 		_iov->iov_len = min((u64)len - s, size);
 		_iov->iov_base = (void __user *)(unsigned long)
-			(reg->userspace_addr + addr - reg->guest_phys_addr);
+			(node->userspace_addr + addr - node->start);
 		s += size;
 		addr += size;
 		++ret;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d36d8be..b93b6a0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -53,6 +53,25 @@ struct vhost_log {
 	u64 len;
 };
 
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+
+struct vhost_umem_node {
+	struct rb_node rb;
+	struct list_head link;
+	__u64 start;
+	__u64 last;
+	__u64 size;
+	__u64 userspace_addr;
+	__u64 flags_padding;
+	__u64 __subtree_last;
+};
+
+struct vhost_umem {
+	struct rb_root umem_tree;
+	struct list_head umem_list;
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -101,7 +120,7 @@ struct vhost_virtqueue {
 	struct iovec *indirect;
 	struct vring_used_elem *heads;
 	/* Protected by virtqueue mutex. */
-	struct vhost_memory *memory;
+	struct vhost_umem *umem;
 	void *private_data;
 	u64 acked_features;
 	/* Log write descriptors */
@@ -119,7 +138,6 @@ struct vhost_virtqueue {
 };
 
 struct vhost_dev {
-	struct vhost_memory *memory;
 	struct mm_struct *mm;
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
@@ -129,14 +147,15 @@ struct vhost_dev {
 	spinlock_t work_lock;
 	struct list_head work_list;
 	struct task_struct *worker;
+	struct vhost_umem *umem;
 };
 
 void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
 long vhost_dev_set_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
-struct vhost_memory *vhost_dev_reset_owner_prepare(void);
-void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *);
+struct vhost_umem *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
 void vhost_dev_cleanup(struct vhost_dev *, bool locked);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V2 1/3] vhost: introduce vhost memory accessors
From: Jason Wang @ 2016-06-23  6:04 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel; +Cc: vkaplans, wexu, peterx
In-Reply-To: <1466661872-29165-1-git-send-email-jasowang@redhat.com>

This patch introduces vhost memory accessors which were just wrappers
for userspace address access helpers. This is a requirement for vhost
device iotlb implementation which will add iotlb translations in those
accessors.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 669fef1..9f2a63a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -651,6 +651,22 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 	return 1;
 }
 
+#define vhost_put_user(vq, x, ptr)  __put_user(x, ptr)
+
+static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
+			      const void *from, unsigned size)
+{
+	return __copy_to_user(to, from, size);
+}
+
+#define vhost_get_user(vq, x, ptr) __get_user(x, ptr)
+
+static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
+				void *from, unsigned size)
+{
+	return __copy_from_user(to, from, size);
+}
+
 static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 			struct vring_desc __user *desc,
 			struct vring_avail __user *avail,
@@ -1156,7 +1172,8 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
-	if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 0)
+	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
+			   &vq->used->flags) < 0)
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1174,7 +1191,8 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
-	if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq)))
+	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
+			   vhost_avail_event(vq)))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -1212,7 +1230,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 		r = -EFAULT;
 		goto err;
 	}
-	r = __get_user(last_used_idx, &vq->used->idx);
+	r = vhost_get_user(vq, last_used_idx, &vq->used->idx);
 	if (r)
 		goto err;
 	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
@@ -1392,7 +1410,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
 	/* Check it isn't doing very strange things with descriptor numbers. */
 	last_avail_idx = vq->last_avail_idx;
-	if (unlikely(__get_user(avail_idx, &vq->avail->idx))) {
+	if (unlikely(vhost_get_user(vq, avail_idx, &vq->avail->idx))) {
 		vq_err(vq, "Failed to access avail idx at %p\n",
 		       &vq->avail->idx);
 		return -EFAULT;
@@ -1414,8 +1432,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
 	/* Grab the next descriptor number they're advertising, and increment
 	 * the index we've seen. */
-	if (unlikely(__get_user(ring_head,
-				&vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
+	if (unlikely(vhost_get_user(vq, ring_head,
+		     &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
 		vq_err(vq, "Failed to read head: idx %d address %p\n",
 		       last_avail_idx,
 		       &vq->avail->ring[last_avail_idx % vq->num]);
@@ -1450,7 +1468,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			       i, vq->num, head);
 			return -EINVAL;
 		}
-		ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
+		ret = vhost_copy_from_user(vq, &desc, vq->desc + i,
+					   sizeof desc);
 		if (unlikely(ret)) {
 			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
 			       i, vq->desc + i);
@@ -1538,15 +1557,15 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
 	if (count == 1) {
-		if (__put_user(heads[0].id, &used->id)) {
+		if (vhost_put_user(vq, heads[0].id, &used->id)) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
-		if (__put_user(heads[0].len, &used->len)) {
+		if (vhost_put_user(vq, heads[0].len, &used->len)) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
-	} else if (__copy_to_user(used, heads, count * sizeof *used)) {
+	} else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
 		vq_err(vq, "Failed to write used");
 		return -EFAULT;
 	}
@@ -1590,7 +1609,8 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
-	if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {
+	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+			   &vq->used->idx)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
@@ -1622,7 +1642,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
 		__virtio16 flags;
-		if (__get_user(flags, &vq->avail->flags)) {
+		if (vhost_get_user(vq, flags, &vq->avail->flags)) {
 			vq_err(vq, "Failed to get flags");
 			return true;
 		}
@@ -1636,7 +1656,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	if (unlikely(!v))
 		return true;
 
-	if (__get_user(event, vhost_used_event(vq))) {
+	if (vhost_get_user(vq, event, vhost_used_event(vq))) {
 		vq_err(vq, "Failed to get used event idx");
 		return true;
 	}
@@ -1678,7 +1698,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	__virtio16 avail_idx;
 	int r;
 
-	r = __get_user(avail_idx, &vq->avail->idx);
+	r = vhost_get_user(vq, avail_idx, &vq->avail->idx);
 	if (r)
 		return false;
 
@@ -1713,7 +1733,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
 	smp_mb();
-	r = __get_user(avail_idx, &vq->avail->idx);
+	r = vhost_get_user(vq, avail_idx, &vq->avail->idx);
 	if (r) {
 		vq_err(vq, "Failed to check avail idx at %p: %d\n",
 		       &vq->avail->idx, r);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V2 0/3] basic device IOTLB support for vhost_net
From: Jason Wang @ 2016-06-23  6:04 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel; +Cc: vkaplans, wexu, peterx

This patch tries to implement an device IOTLB for vhost. This could be
used with for co-operation with userspace IOMMU implementation (qemu)
for a secure DMA environment (DMAR) in guest.

The idea is simple. When vhost meets an IOTLB miss, it will request
the assistance of userspace to do the translation, this is done
through:

- when there's a IOTLB miss, it will notify userspace through
  vhost_net fd and then userspace read the fault address, size and
  access from vhost fd.
- userspace write the translation result back to vhost fd, vhost can
  then update its IOTLB.

The codes were optimized for fixed mapping users e.g dpdk in guest. It
will be slow if dynamic mappings were used in guest. We could do
optimizations on top.

The codes were designed to be architecture independent. It should be
easily ported to any architecture.

Stress tested with l2fwd/vfio in guest with 4K/2M/1G page size. On 1G
hugepage case, 100% TLB hit rate were noticed.

Have a benchmark on this. Test was done with l2fwd in guest.For 2MB
page, no difference in 64B performance and I notice a 4%-5% drop for
1500B performance compare to UIO in guest. We can add some shortcut to
bypass the IOTLB for virtqueue accessing, but I think it's better to
do this on top.

Changes from V1:
- Fix i386 build warnings
- Drop access paramter for vhost_get_vq_desc() (fix VHOST SCSI build error)

Changes from RFC V3:
- rebase on latest
- minor tweak on commit log
- use VHOST_ACCESS_RO instead of VHOST_ACCESS_WO in vhost_copy_from_user()
- switch to use atomic userspace access helper in vhost_get/put_user()
- remove debug codes in vhost_iotlb_miss()
- use FIFO instead of FILO when doing TLB replacement
- fix unbalanced lock in vhost_process_iotlb_msg()

Changes from RFC V2:
- introduce memory accessors for vhost
- switch from ioctls to oridinary file read/write for iotlb miss and
  updating
- do not assume virtqueue were virtually mapped contiguously, all
  virtqueue access were done throug IOTLB
- verify memory access during IOTLB update and fail early
- introduce a module parameter for the size of IOTLB

Changes from RFC V1:
- support any size/range of updating and invalidation through
  introducing the interval tree.
- convert from per device iotlb request to per virtqueue iotlb
  request, this solves the possible deadlock in V1.
- read/write permission check support.

Please review.

Jason Wang (3):
  vhost: introduce vhost memory accessors
  vhost: convert pre sorted vhost memory array to interval tree
  vhost: device IOTLB API

 drivers/vhost/net.c        |  58 +++-
 drivers/vhost/vhost.c      | 806 +++++++++++++++++++++++++++++++++++++++------
 drivers/vhost/vhost.h      |  57 +++-
 include/uapi/linux/vhost.h |  28 ++
 4 files changed, 829 insertions(+), 120 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH net-next V2] tun: introduce tx skb ring
From: Jason Wang @ 2016-06-23  5:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, eric.dumazet, netdev, linux-kernel, virtualization, brouer,
	davem
In-Reply-To: <20160622211620-mutt-send-email-mst@redhat.com>



On 2016年06月23日 02:18, Michael S. Tsirkin wrote:
> On Fri, Jun 17, 2016 at 03:41:20AM +0300, Michael S. Tsirkin wrote:
>> >Would it help to have ptr_ring_resize that gets an array of
>> >rings and resizes them both to same length?
> OK, here it is. Untested so far, and no skb wrapper.
> Pls let me know whether this is what you had in mind.

Exactly what I want.

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox