Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v2 3/4] virtio_balloon: introduce migration primitives to balloon pages
From: Rafael Aquini @ 2012-06-28 21:49 UTC (permalink / raw)
  To: linux-mm
  Cc: Rik van Riel, Rafael Aquini, Konrad Rzeszutek Wilk,
	Michael S. Tsirkin, linux-kernel, virtualization, Andi Kleen,
	Andrew Morton
In-Reply-To: <cover.1340916058.git.aquini@redhat.com>

This patch makes balloon pages movable at allocation time and introduces the
infrastructure needed to perform the balloon page migration operation.

Signed-off-by: Rafael Aquini <aquini@redhat.com>
---
 drivers/virtio/virtio_balloon.c |   96 ++++++++++++++++++++++++++++++++++++++-
 include/linux/virtio_balloon.h  |    6 +++
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index d47c5c2..53386aa 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -27,6 +27,8 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -140,8 +142,9 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
 	spin_lock(&vb->pfn_list_lock);
 	for (vb->num_pfns = 0; vb->num_pfns < num;
 	     vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
-					__GFP_NOMEMALLOC | __GFP_NOWARN);
+		struct page *page = alloc_page(GFP_HIGHUSER_MOVABLE |
+						__GFP_NORETRY | __GFP_NOWARN |
+						__GFP_NOMEMALLOC);
 		if (!page) {
 			if (printk_ratelimit())
 				dev_printk(KERN_INFO, &vb->vdev->dev,
@@ -154,6 +157,7 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
 		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
 		totalram_pages--;
 		list_add(&page->lru, &vb->pages);
+		page->mapping = balloon_mapping;
 	}
 
 	/* Didn't get any?  Oh well. */
@@ -195,6 +199,7 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num)
 	for (vb->num_pfns = 0; vb->num_pfns < num;
 	     vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 		page = list_first_entry(&vb->pages, struct page, lru);
+		page->mapping = NULL;
 		list_del(&page->lru);
 		set_page_pfns(vb->pfns + vb->num_pfns, page);
 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
@@ -365,6 +370,77 @@ static int init_vqs(struct virtio_balloon *vb)
 	return 0;
 }
 
+/*
+ * Populate balloon_mapping->a_ops->migratepage method to perform the balloon
+ * page migration task.
+ *
+ * After a ballooned page gets isolated by compaction procedures, this is the
+ * function that performs the page migration on behalf of move_to_new_page(),
+ * when the last calls (page)->mapping->a_ops->migratepage.
+ *
+ * Page migration for virtio balloon is done in a simple swap fashion which
+ * follows these two steps:
+ *  1) insert newpage into vb->pages list and update the host about it;
+ *  2) update the host about the removed old page from vb->pages list;
+ */
+int virtballoon_migratepage(struct address_space *mapping,
+		struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+	struct virtio_balloon *vb = (void *)mapping->backing_dev_info;
+	struct scatterlist sg;
+
+	/* balloon's page migration 1st step */
+	spin_lock(&vb->pfn_list_lock);
+	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
+	list_add(&newpage->lru, &vb->pages);
+	set_page_pfns(vb->pfns, newpage);
+	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+	spin_unlock(&vb->pfn_list_lock);
+	tell_host(vb, vb->inflate_vq, &sg);
+
+	/* balloon's page migration 2nd step */
+	spin_lock(&vb->pfn_list_lock);
+	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
+	set_page_pfns(vb->pfns, page);
+	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+	spin_unlock(&vb->pfn_list_lock);
+	tell_host(vb, vb->deflate_vq, &sg);
+
+	return 0;
+}
+
+/*
+ * Populate balloon_mapping->a_ops->invalidatepage method to help compaction on
+ * isolating a page from the balloon page list.
+ */
+void virtballoon_isolatepage(struct page *page, unsigned long mode)
+{
+	struct address_space *mapping = page->mapping;
+	struct virtio_balloon *vb = (void *)mapping->backing_dev_info;
+	spin_lock(&vb->pfn_list_lock);
+	list_del(&page->lru);
+	spin_unlock(&vb->pfn_list_lock);
+}
+
+/*
+ * Populate balloon_mapping->a_ops->freepage method to help compaction on
+ * re-inserting an isolated page into the balloon page list.
+ */
+void virtballoon_putbackpage(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct virtio_balloon *vb = (void *)mapping->backing_dev_info;
+	spin_lock(&vb->pfn_list_lock);
+	list_add(&page->lru, &vb->pages);
+	spin_unlock(&vb->pfn_list_lock);
+}
+
+static const struct address_space_operations virtio_balloon_aops = {
+	.migratepage = virtballoon_migratepage,
+	.invalidatepage = virtballoon_isolatepage,
+	.freepage = virtballoon_putbackpage,
+};
+
 static int virtballoon_probe(struct virtio_device *vdev)
 {
 	struct virtio_balloon *vb;
@@ -384,6 +460,19 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	vb->vdev = vdev;
 	vb->need_stats_update = 0;
 
+	/* Init the ballooned page->mapping special balloon_mapping */
+	balloon_mapping = kmalloc(sizeof(*balloon_mapping), GFP_KERNEL);
+	if (!balloon_mapping) {
+		err = -ENOMEM;
+		goto out_free_mapping;
+	}
+
+	INIT_RADIX_TREE(&balloon_mapping->page_tree, GFP_ATOMIC | __GFP_NOWARN);
+	INIT_LIST_HEAD(&balloon_mapping->i_mmap_nonlinear);
+	spin_lock_init(&balloon_mapping->tree_lock);
+	balloon_mapping->a_ops = &virtio_balloon_aops;
+	balloon_mapping->backing_dev_info = (void *)vb;
+
 	err = init_vqs(vb);
 	if (err)
 		goto out_free_vb;
@@ -398,6 +487,8 @@ static int virtballoon_probe(struct virtio_device *vdev)
 
 out_del_vqs:
 	vdev->config->del_vqs(vdev);
+out_free_mapping:
+	kfree(balloon_mapping);
 out_free_vb:
 	kfree(vb);
 out:
@@ -424,6 +515,7 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev)
 	kthread_stop(vb->thread);
 	remove_common(vb);
 	kfree(vb);
+	kfree(balloon_mapping);
 }
 
 #ifdef CONFIG_PM
diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h
index 652dc8b..db21300 100644
--- a/include/linux/virtio_balloon.h
+++ b/include/linux/virtio_balloon.h
@@ -56,4 +56,10 @@ struct virtio_balloon_stat {
 	u64 val;
 } __attribute__((packed));
 
+#if defined(CONFIG_COMPACTION)
+extern struct address_space *balloon_mapping;
+#else
+struct address_space *balloon_mapping;
+#endif
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH v2 4/4] mm: add vm event counters for balloon pages compaction
From: Rafael Aquini @ 2012-06-28 21:49 UTC (permalink / raw)
  To: linux-mm
  Cc: Rik van Riel, Rafael Aquini, Konrad Rzeszutek Wilk,
	Michael S. Tsirkin, linux-kernel, virtualization, Andi Kleen,
	Andrew Morton
In-Reply-To: <cover.1340916058.git.aquini@redhat.com>

This patch is only for testing report purposes and shall be dropped in case of
the rest of this patchset getting accepted for merging.

Signed-off-by: Rafael Aquini <aquini@redhat.com>
---
 drivers/virtio/virtio_balloon.c |    1 +
 include/linux/vm_event_item.h   |    2 ++
 mm/compaction.c                 |    1 +
 mm/migrate.c                    |    6 ++++--
 mm/vmstat.c                     |    4 ++++
 5 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 53386aa..c4a929d 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -406,6 +406,7 @@ int virtballoon_migratepage(struct address_space *mapping,
 	spin_unlock(&vb->pfn_list_lock);
 	tell_host(vb, vb->deflate_vq, &sg);
 
+	count_vm_event(COMPACTBALLOONMIGRATED);
 	return 0;
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 06f8e38..e330c5a 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -40,6 +40,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_COMPACTION
 		COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
 		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
+		COMPACTBALLOONMIGRATED, COMPACTBALLOONFAILED,
+		COMPACTBALLOONISOLATED, COMPACTBALLOONFREED,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
diff --git a/mm/compaction.c b/mm/compaction.c
index 6c6e572..650cdda 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -947,6 +947,7 @@ bool isolate_balloon_page(struct page *page)
 			if (is_balloon_page(page) && (page_count(page) == 2)) {
 				page->mapping->a_ops->invalidatepage(page, 0);
 				unlock_page(page);
+				count_vm_event(COMPACTBALLOONISOLATED);
 				return true;
 			}
 			unlock_page(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 59c7bc5..5838719 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -78,9 +78,10 @@ void putback_lru_pages(struct list_head *l)
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (unlikely(is_balloon_page(page)))
+		if (unlikely(is_balloon_page(page))) {
+			count_vm_event(COMPACTBALLOONFAILED);
 			WARN_ON(!putback_balloon_page(page));
-		else
+		} else
 			putback_lru_page(page);
 	}
 }
@@ -878,6 +879,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 				    page_is_file_cache(page));
 		put_page(page);
 		__free_page(page);
+		count_vm_event(COMPACTBALLOONFREED);
 		return rc;
 	}
 out:
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9..3b7109f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -767,6 +767,10 @@ const char * const vmstat_text[] = {
 	"compact_stall",
 	"compact_fail",
 	"compact_success",
+	"compact_balloon_migrated",
+	"compact_balloon_failed",
+	"compact_balloon_isolated",
+	"compact_balloon_freed",
 #endif
 
 #ifdef CONFIG_HUGETLB_PAGE
-- 
1.7.10.2

^ permalink raw reply related

* Re: [patch net-next 1/4] net: introduce new priv_flag indicating iface capable of change mac when running
From: Ben Hutchings @ 2012-06-28 22:17 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, netdev, shimoda.hiroaki, virtualization, danny.kukawka,
	edumazet, davem
In-Reply-To: <1340892639-1111-2-git-send-email-jpirko@redhat.com>

On Thu, 2012-06-28 at 16:10 +0200, Jiri Pirko wrote:
> Introduce IFF_LIFE_ADDR_CHANGE priv_flag and use it to disable
> netif_running() check in eth_mac_addr()
>
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/if.h |    2 ++
>  net/ethernet/eth.c |    2 +-
>  2 files changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/if.h b/include/linux/if.h
> index f995c66..fd9ee7c 100644
> --- a/include/linux/if.h
> +++ b/include/linux/if.h
> @@ -81,6 +81,8 @@
>  #define IFF_UNICAST_FLT	0x20000		/* Supports unicast filtering	*/
>  #define IFF_TEAM_PORT	0x40000		/* device used as team port */
>  #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
> +#define IFF_LIFE_ADDR_CHANGE 0x100000	/* device supports hardware address
> +					 * change when it's running */
[...]

Any device that has IFF_UNICAST_FLT can update the unicast MAC filter
while it's running; doesn't that go hand-in-hand with being able to
handle changes to the primary MAC address?  Is the new flag really
necessary at all?

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH v2 0/4] make balloon pages movable by compaction
From: Minchan Kim @ 2012-06-29  1:37 UTC (permalink / raw)
  To: Rafael Aquini
  Cc: Rik van Riel, Konrad Rzeszutek Wilk, Michael S. Tsirkin,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <cover.1340916058.git.aquini@redhat.com>

Hi Rafael,

On 06/29/2012 06:49 AM, Rafael Aquini wrote:

> This patchset follows the main idea discussed at 2012 LSFMMS section:
> "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/


Could you summarize the problem, solution instead of link URL in cover-letter?
IIUC, the problem is that it is hard to get contiguous memory in guest-side 
after ballooning happens because guest-side memory could be very fragmented
by ballooned page. It makes THP page allocation of guest-side very poor success ratio.

The solution is that when memory ballooning happens, we allocates ballooned page
as a movable page in guest-side because they can be migrated easily so compaction of
guest-side could put together them into either side so that we can get contiguous memory.
For it, compaction should be aware of ballooned page.

Right?

-- 
Kind regards,
Minchan Kim

^ permalink raw reply

* Re: [PATCH v2 0/4] make balloon pages movable by compaction
From: Rafael Aquini @ 2012-06-29  3:51 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Rik van Riel, Konrad Rzeszutek Wilk, Michael S. Tsirkin,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <4FED06C8.1090003@kernel.org>

On Fri, Jun 29, 2012 at 10:37:12AM +0900, Minchan Kim wrote:
> Hi Rafael,
> 
> On 06/29/2012 06:49 AM, Rafael Aquini wrote:
> 
> > This patchset follows the main idea discussed at 2012 LSFMMS section:
> > "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/
> 
> 
> Could you summarize the problem, solution instead of link URL in cover-letter?
> IIUC, the problem is that it is hard to get contiguous memory in guest-side 
> after ballooning happens because guest-side memory could be very fragmented
> by ballooned page. It makes THP page allocation of guest-side very poor success ratio.
> 
> The solution is that when memory ballooning happens, we allocates ballooned page
> as a movable page in guest-side because they can be migrated easily so compaction of
> guest-side could put together them into either side so that we can get contiguous memory.
> For it, compaction should be aware of ballooned page.
> 
> Right?
>
Yes, you surely got it correct, sir. 

Thanks Minchan, for taking time to provide me such feedback. I'll rework commit
messages to make them more elucidative, yet concise for the next submission.

Please, let me know if you have other concerns I shall be addressing here.

Best regards!

^ permalink raw reply

* Re: [PATCH v2 0/4] make balloon pages movable by compaction
From: Rusty Russell @ 2012-06-29  4:31 UTC (permalink / raw)
  To: linux-mm
  Cc: Rik van Riel, Rafael Aquini, Michael S. Tsirkin,
	Konrad Rzeszutek Wilk, linux-kernel, virtualization, Andi Kleen,
	Andrew Morton
In-Reply-To: <cover.1340916058.git.aquini@redhat.com>

On Thu, 28 Jun 2012 18:49:38 -0300, Rafael Aquini <aquini@redhat.com> wrote:
> This patchset follows the main idea discussed at 2012 LSFMMS section:
> "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/
> 
> to introduce the required changes to the virtio_balloon driver, as well as
> changes to the core compaction & migration bits, in order to allow
> memory balloon pages become movable within a guest.
> 
> Rafael Aquini (4):
>   mm: introduce compaction and migration for virtio ballooned pages
>   virtio_balloon: handle concurrent accesses to virtio_balloon struct
>     elements
>   virtio_balloon: introduce migration primitives to balloon pages
>   mm: add vm event counters for balloon pages compaction
> 
>  drivers/virtio/virtio_balloon.c |  142 +++++++++++++++++++++++++++++++++++----
>  include/linux/mm.h              |   16 +++++
>  include/linux/virtio_balloon.h  |    6 ++
>  include/linux/vm_event_item.h   |    2 +
>  mm/compaction.c                 |  111 ++++++++++++++++++++++++------
>  mm/migrate.c                    |   32 ++++++++-
>  mm/vmstat.c                     |    4 ++
>  7 files changed, 280 insertions(+), 33 deletions(-)
> 
> 
> V2: address Mel Gorman's review comments

If Mel is happy, I am happy.  Seems sensible that the virtio_baloon
changes go in at the same time as the mm changes, so:

Acked-by: Rusty Russell <rusty@rustcorp.com.au>

Cheers,
Rusty.

^ permalink raw reply

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Minchan Kim @ 2012-06-29  5:32 UTC (permalink / raw)
  To: Rafael Aquini
  Cc: Rik van Riel, Michael S. Tsirkin, Konrad Rzeszutek Wilk,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <d0f33a6492501a0d420abbf184f9b956cff3e3fc.1340916058.git.aquini@redhat.com>

On 06/29/2012 06:49 AM, Rafael Aquini wrote:

> This patch introduces the helper functions as well as the necessary changes
> to teach compaction and migration bits how to cope with pages which are
> part of a guest memory balloon, in order to make them movable by memory
> compaction procedures.
> 
> Signed-off-by: Rafael Aquini <aquini@redhat.com>


Just a few comment but not critical. :)

> ---
>  include/linux/mm.h |   16 ++++++++
>  mm/compaction.c    |  110 +++++++++++++++++++++++++++++++++++++++++++---------
>  mm/migrate.c       |   30 +++++++++++++-
>  3 files changed, 136 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b36d08c..35568fc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1629,5 +1629,21 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
>  static inline bool page_is_guard(struct page *page) { return false; }
>  #endif /* CONFIG_DEBUG_PAGEALLOC */
>  
> +#if (defined(CONFIG_VIRTIO_BALLOON) || \
> +	defined(CONFIG_VIRTIO_BALLOON_MODULE)) && defined(CONFIG_COMPACTION)
> +extern bool isolate_balloon_page(struct page *);
> +extern bool putback_balloon_page(struct page *);
> +extern struct address_space *balloon_mapping;
> +
> +static inline bool is_balloon_page(struct page *page)
> +{
> +        return (page->mapping == balloon_mapping) ? true : false;
> +}


What lock should it protect?

> +#else
> +static inline bool is_balloon_page(struct page *page)       { return false; }
> +static inline bool isolate_balloon_page(struct page *page)  { return false; }
> +static inline bool putback_balloon_page(struct page *page)  { return false; }
> +#endif /* (VIRTIO_BALLOON || VIRTIO_BALLOON_MODULE) && COMPACTION */
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_MM_H */
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 7ea259d..6c6e572 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -14,6 +14,7 @@
>  #include <linux/backing-dev.h>
>  #include <linux/sysctl.h>
>  #include <linux/sysfs.h>
> +#include <linux/export.h>
>  #include "internal.h"
>  
>  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> @@ -312,32 +313,40 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			continue;
>  		}
>  
> -		if (!PageLRU(page))
> -			continue;
> -
>  		/*
> -		 * PageLRU is set, and lru_lock excludes isolation,
> -		 * splitting and collapsing (collapsing has already
> -		 * happened if PageLRU is set).
> +		 * It is possible to migrate LRU pages and balloon pages.
> +		 * Skip any other type of page.
>  		 */
> -		if (PageTransHuge(page)) {
> -			low_pfn += (1 << compound_order(page)) - 1;
> -			continue;
> -		}
> +		if (likely(PageLRU(page))) {


We can't make sure it is likely because there might be so many pages for kernel.

> +			/*
> +			 * PageLRU is set, and lru_lock excludes isolation,
> +			 * splitting and collapsing (collapsing has already
> +			 * happened if PageLRU is set).
> +			 */
> +			if (PageTransHuge(page)) {
> +				low_pfn += (1 << compound_order(page)) - 1;
> +				continue;
> +			}
>  
> -		if (!cc->sync)
> -			mode |= ISOLATE_ASYNC_MIGRATE;
> +			if (!cc->sync)
> +				mode |= ISOLATE_ASYNC_MIGRATE;
>  
> -		lruvec = mem_cgroup_page_lruvec(page, zone);
> +			lruvec = mem_cgroup_page_lruvec(page, zone);
>  
> -		/* Try isolate the page */
> -		if (__isolate_lru_page(page, mode) != 0)
> -			continue;
> +			/* Try isolate the page */
> +			if (__isolate_lru_page(page, mode) != 0)
> +				continue;
>  
> -		VM_BUG_ON(PageTransCompound(page));
> +			VM_BUG_ON(PageTransCompound(page));
> +
> +			/* Successfully isolated */
> +			del_page_from_lru_list(page, lruvec, page_lru(page));
> +		} else if (is_balloon_page(page)) {
> +			if (!isolate_balloon_page(page))
> +				continue;
> +		} else
> +			continue;
>  
> -		/* Successfully isolated */
> -		del_page_from_lru_list(page, lruvec, page_lru(page));
>  		list_add(&page->lru, migratelist);
>  		cc->nr_migratepages++;
>  		nr_isolated++;
> @@ -903,4 +912,67 @@ void compaction_unregister_node(struct node *node)
>  }
>  #endif /* CONFIG_SYSFS && CONFIG_NUMA */
>  
> +#if defined(CONFIG_VIRTIO_BALLOON) || defined(CONFIG_VIRTIO_BALLOON_MODULE)
> +/*
> + * Balloon pages special page->mapping.
> + * users must properly allocate and initialize an instance of balloon_mapping,
> + * and set it as the page->mapping for balloon enlisted page instances.
> + *
> + * address_space_operations necessary methods for ballooned pages:
> + *   .migratepage    - used to perform balloon's page migration (as is)
> + *   .invalidatepage - used to isolate a page from balloon's page list
> + *   .freepage       - used to reinsert an isolated page to balloon's page list
> + */
> +struct address_space *balloon_mapping;
> +EXPORT_SYMBOL_GPL(balloon_mapping);
> +
> +/* __isolate_lru_page() counterpart for a ballooned page */
> +bool isolate_balloon_page(struct page *page)
> +{
> +	if (WARN_ON(!is_balloon_page(page)))
> +		return false;
> +
> +	if (likely(get_page_unless_zero(page))) {
> +		/*
> +		 * We can race against move_to_new_page() & __unmap_and_move().
> +		 * If we stumble across a locked balloon page and succeed on
> +		 * isolating it, the result tends to be disastrous.
> +		 */
> +		if (likely(trylock_page(page))) {
> +			/*
> +			 * A ballooned page, by default, has just one refcount.
> +			 * Prevent concurrent compaction threads from isolating
> +			 * an already isolated balloon page.
> +			 */
> +			if (is_balloon_page(page) && (page_count(page) == 2)) {
> +				page->mapping->a_ops->invalidatepage(page, 0);


Could you add more meaningful name wrapping raw invalidatepage?
But I don't know what is proper name. ;)


> +				unlock_page(page);
> +				return true;
> +			}
> +			unlock_page(page);
> +		}
> +		/* Drop refcount taken for this already isolated page */
> +		put_page(page);
> +	}
> +	return false;
> +}
> +
> +/* putback_lru_page() counterpart for a ballooned page */
> +bool putback_balloon_page(struct page *page)
> +{
> +	if (WARN_ON(!is_balloon_page(page)))
> +		return false;
> +
> +	if (likely(trylock_page(page))) {
> +		if(is_balloon_page(page)) {
> +			page->mapping->a_ops->freepage(page);


Ditto.

> +			put_page(page);
> +			unlock_page(page);
> +			return true;
> +		}
> +		unlock_page(page);
> +	}
> +	return false;
> +}
> +#endif /* CONFIG_VIRTIO_BALLOON || CONFIG_VIRTIO_BALLOON_MODULE */
>  #endif /* CONFIG_COMPACTION */
> diff --git a/mm/migrate.c b/mm/migrate.c
> index be26d5c..59c7bc5 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -78,7 +78,10 @@ void putback_lru_pages(struct list_head *l)
>  		list_del(&page->lru);
>  		dec_zone_page_state(page, NR_ISOLATED_ANON +
>  				page_is_file_cache(page));
> -		putback_lru_page(page);
> +		if (unlikely(is_balloon_page(page)))
> +			WARN_ON(!putback_balloon_page(page));
> +		else
> +			putback_lru_page(page);
>  	}
>  }
>  
> @@ -783,6 +786,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>  		}
>  	}
>  
> +	if (is_balloon_page(page)) {
> +		/*
> +		 * A ballooned page does not need any special attention from
> +		 * physical to virtual reverse mapping procedures.
> +		 * Skip any attempt to unmap PTEs or to remap swap cache,
> +		 * in order to avoid burning cycles at rmap level.
> +		 */
> +		remap_swapcache = 0;
> +		goto skip_unmap;
> +	}
> +
>  	/*
>  	 * Corner case handling:
>  	 * 1. When a new swap-cache page is read into, it is added to the LRU
> @@ -852,6 +866,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
>  			goto out;
>  
>  	rc = __unmap_and_move(page, newpage, force, offlining, mode);
> +
> +	if (is_balloon_page(newpage)) {
> +		/*
> +		 * A ballooned page has been migrated already. Now, it is the
> +		 * time to wrap-up counters, handle the old page back to Buddy
> +		 * and return.
> +		 */
> +		list_del(&page->lru);
> +		dec_zone_page_state(page, NR_ISOLATED_ANON +
> +				    page_is_file_cache(page));
> +		put_page(page);
> +		__free_page(page);
> +		return rc;
> +	}
>  out:
>  	if (rc != -EAGAIN) {
>  		/*



-- 
Kind regards,
Minchan Kim

^ permalink raw reply

* Re: [patch net-next 1/4] net: introduce new priv_flag indicating iface capable of change mac when running
From: Jiri Pirko @ 2012-06-29  6:41 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: mst, netdev, shimoda.hiroaki, virtualization, danny.kukawka,
	edumazet, davem
In-Reply-To: <1340921854.2577.16.camel@bwh-desktop.uk.solarflarecom.com>

Fri, Jun 29, 2012 at 12:17:34AM CEST, bhutchings@solarflare.com wrote:
>On Thu, 2012-06-28 at 16:10 +0200, Jiri Pirko wrote:
>> Introduce IFF_LIFE_ADDR_CHANGE priv_flag and use it to disable
>> netif_running() check in eth_mac_addr()
>>
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> ---
>>  include/linux/if.h |    2 ++
>>  net/ethernet/eth.c |    2 +-
>>  2 files changed, 3 insertions(+), 1 deletion(-)
>> 
>> diff --git a/include/linux/if.h b/include/linux/if.h
>> index f995c66..fd9ee7c 100644
>> --- a/include/linux/if.h
>> +++ b/include/linux/if.h
>> @@ -81,6 +81,8 @@
>>  #define IFF_UNICAST_FLT	0x20000		/* Supports unicast filtering	*/
>>  #define IFF_TEAM_PORT	0x40000		/* device used as team port */
>>  #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
>> +#define IFF_LIFE_ADDR_CHANGE 0x100000	/* device supports hardware address
>> +					 * change when it's running */
>[...]
>
>Any device that has IFF_UNICAST_FLT can update the unicast MAC filter
>while it's running; doesn't that go hand-in-hand with being able to
>handle changes to the primary MAC address?  Is the new flag really
>necessary at all?

Hmm, this makes sense. But, can you guarantee that all devices behave like this?

Also, there are many devices that does not support unicast filtering
and yet they support updating mac adress while running.

Jirka

>
>Ben.
>
>-- 
>Ben Hutchings, Staff Engineer, Solarflare
>Not speaking for my employer; that's the marketing department's job.
>They asked us to note that Solarflare product names are trademarked.
>

^ permalink raw reply

* [patch net-next v2 0/4] net: introduce and use IFF_LIFE_ADDR_CHANGE
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem

three drivers updated, but this can be used in many others.

v1->v2:
%s/LIFE/LIVE

Jiri Pirko (4):
  net: introduce new priv_flag indicating iface capable of change mac
    when running
  virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
  team: use IFF_LIVE_ADDR_CHANGE priv_flag
  dummy: use IFF_LIVE_ADDR_CHANGE priv_flag

 drivers/net/dummy.c      |   15 ++-------------
 drivers/net/team/team.c  |    9 +++++----
 drivers/net/virtio_net.c |   11 +++++------
 include/linux/if.h       |    2 ++
 net/ethernet/eth.c       |    2 +-
 5 files changed, 15 insertions(+), 24 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* [patch net-next v2 1/4] net: introduce new priv_flag indicating iface capable of change mac when running
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Introduce IFF_LIVE_ADDR_CHANGE priv_flag and use it to disable
netif_running() check in eth_mac_addr()

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/if.h |    2 ++
 net/ethernet/eth.c |    2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/if.h b/include/linux/if.h
index f995c66..1ec407b 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -81,6 +81,8 @@
 #define IFF_UNICAST_FLT	0x20000		/* Supports unicast filtering	*/
 #define IFF_TEAM_PORT	0x40000		/* device used as team port */
 #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
+#define IFF_LIVE_ADDR_CHANGE 0x100000	/* device supports hardware address
+					 * change when it's running */
 
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 36e5880..db6a6c1 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -283,7 +283,7 @@ int eth_mac_addr(struct net_device *dev, void *p)
 {
 	struct sockaddr *addr = p;
 
-	if (netif_running(dev))
+	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
 		return -EBUSY;
 	if (!is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 2/4] virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/virtio_net.c |   11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 36a16d5..1db445b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -679,12 +679,11 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct virtio_device *vdev = vi->vdev;
-	struct sockaddr *addr = p;
+	int ret;
 
-	if (!is_valid_ether_addr(addr->sa_data))
-		return -EADDRNOTAVAIL;
-	memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
+	ret = eth_mac_addr(dev, p);
+	if (ret)
+		return ret;
 
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
 		vdev->config->set(vdev, offsetof(struct virtio_net_config, mac),
@@ -1063,7 +1062,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 		return -ENOMEM;
 
 	/* Set up network device as normal. */
-	dev->priv_flags |= IFF_UNICAST_FLT;
+	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
 	dev->netdev_ops = &virtnet_netdev;
 	dev->features = NETIF_F_HIGHDMA;
 
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 3/4] team: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/team/team.c |    9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 89853c3..9b94f53 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1188,10 +1188,11 @@ static int team_set_mac_address(struct net_device *dev, void *p)
 {
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
-	struct sockaddr *addr = p;
+	int err;
 
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
-	memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+	err = eth_mac_addr(dev, p);
+	if (err)
+		return err;
 	rcu_read_lock();
 	list_for_each_entry_rcu(port, &team->port_list, list)
 		if (team->ops.port_change_mac)
@@ -1393,7 +1394,7 @@ static void team_setup(struct net_device *dev)
 	 * bring us to promisc mode in case a unicast addr is added.
 	 * Let this up to underlay drivers.
 	 */
-	dev->priv_flags |= IFF_UNICAST_FLT;
+	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
 
 	dev->features |= NETIF_F_LLTX;
 	dev->features |= NETIF_F_GRO;
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 4/4] dummy: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/dummy.c |   15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index bab0158..9d6a067 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -40,18 +40,6 @@
 
 static int numdummies = 1;
 
-static int dummy_set_address(struct net_device *dev, void *p)
-{
-	struct sockaddr *sa = p;
-
-	if (!is_valid_ether_addr(sa->sa_data))
-		return -EADDRNOTAVAIL;
-
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
-	memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
-	return 0;
-}
-
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
 {
@@ -118,7 +106,7 @@ static const struct net_device_ops dummy_netdev_ops = {
 	.ndo_start_xmit		= dummy_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= set_multicast_list,
-	.ndo_set_mac_address	= dummy_set_address,
+	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_get_stats64	= dummy_get_stats64,
 };
 
@@ -134,6 +122,7 @@ static void dummy_setup(struct net_device *dev)
 	dev->tx_queue_len = 0;
 	dev->flags |= IFF_NOARP;
 	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 	dev->features	|= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO;
 	dev->features	|= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
 	eth_hw_addr_random(dev);
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Mel Gorman @ 2012-06-29 15:31 UTC (permalink / raw)
  To: Rafael Aquini
  Cc: Rik van Riel, Konrad Rzeszutek Wilk, Michael S. Tsirkin,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <d0f33a6492501a0d420abbf184f9b956cff3e3fc.1340916058.git.aquini@redhat.com>

On Thu, Jun 28, 2012 at 06:49:39PM -0300, Rafael Aquini wrote:
> This patch introduces the helper functions as well as the necessary changes
> to teach compaction and migration bits how to cope with pages which are
> part of a guest memory balloon, in order to make them movable by memory
> compaction procedures.
> 
> Signed-off-by: Rafael Aquini <aquini@redhat.com>

I have two minor comments but it is not critical they be addressed. If you
doa V3 then fix it but if it is picked up as it is, I'm ok with that.
From a compaction point of view;

Acked-by: Mel Gorman <mel@csn.ul.ie>

> ---
>  include/linux/mm.h |   16 ++++++++
>  mm/compaction.c    |  110 +++++++++++++++++++++++++++++++++++++++++++---------
>  mm/migrate.c       |   30 +++++++++++++-
>  3 files changed, 136 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b36d08c..35568fc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1629,5 +1629,21 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
>  static inline bool page_is_guard(struct page *page) { return false; }
>  #endif /* CONFIG_DEBUG_PAGEALLOC */
>  
> +#if (defined(CONFIG_VIRTIO_BALLOON) || \
> +	defined(CONFIG_VIRTIO_BALLOON_MODULE)) && defined(CONFIG_COMPACTION)
> +extern bool isolate_balloon_page(struct page *);
> +extern bool putback_balloon_page(struct page *);
> +extern struct address_space *balloon_mapping;
> +
> +static inline bool is_balloon_page(struct page *page)
> +{
> +        return (page->mapping == balloon_mapping) ? true : false;
> +}
> +#else
> +static inline bool is_balloon_page(struct page *page)       { return false; }
> +static inline bool isolate_balloon_page(struct page *page)  { return false; }
> +static inline bool putback_balloon_page(struct page *page)  { return false; }
> +#endif /* (VIRTIO_BALLOON || VIRTIO_BALLOON_MODULE) && COMPACTION */
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_MM_H */

isolate_balloon_page is only used in compaction.c and could declared static
to compaction.c. You could move them all between struct compact_control
and release_pages to avoid forward declarations.

> <SNIP>
> +/* putback_lru_page() counterpart for a ballooned page */
> +bool putback_balloon_page(struct page *page)
> +{
> +	if (WARN_ON(!is_balloon_page(page)))
> +		return false;
> +
> +	if (likely(trylock_page(page))) {
> +		if(is_balloon_page(page)) {

Stick a space in there. Run checkpatch.pl if you haven't already. I suspect
it would have caught this.

As I said, not worth spinning a V3 for :)

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Rafael Aquini @ 2012-06-29 17:36 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Rik van Riel, Michael S. Tsirkin, Konrad Rzeszutek Wilk,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <4FED3DDB.1000903@kernel.org>

On Fri, Jun 29, 2012 at 02:32:11PM +0900, Minchan Kim wrote:
> On 06/29/2012 06:49 AM, Rafael Aquini wrote:
> 
> > This patch introduces the helper functions as well as the necessary changes
> > to teach compaction and migration bits how to cope with pages which are
> > part of a guest memory balloon, in order to make them movable by memory
> > compaction procedures.
> > 
> > Signed-off-by: Rafael Aquini <aquini@redhat.com>
> 
> 
> Just a few comment but not critical. :)
> 
> > ---
> >  include/linux/mm.h |   16 ++++++++
> >  mm/compaction.c    |  110 +++++++++++++++++++++++++++++++++++++++++++---------
> >  mm/migrate.c       |   30 +++++++++++++-
> >  3 files changed, 136 insertions(+), 20 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index b36d08c..35568fc 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1629,5 +1629,21 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
> >  static inline bool page_is_guard(struct page *page) { return false; }
> >  #endif /* CONFIG_DEBUG_PAGEALLOC */
> >  
> > +#if (defined(CONFIG_VIRTIO_BALLOON) || \
> > +	defined(CONFIG_VIRTIO_BALLOON_MODULE)) && defined(CONFIG_COMPACTION)
> > +extern bool isolate_balloon_page(struct page *);
> > +extern bool putback_balloon_page(struct page *);
> > +extern struct address_space *balloon_mapping;
> > +
> > +static inline bool is_balloon_page(struct page *page)
> > +{
> > +        return (page->mapping == balloon_mapping) ? true : false;
> > +}
> 
> 
> What lock should it protect?
> 
I'm afraid I didn't quite get what you meant by that question. If you were
referring to lock protection to the address_space balloon_mapping, we don't need
it. balloon_mapping, once initialized lives forever (as long as driver is
loaded, actually) as a static reference that just helps us on identifying pages 
that are enlisted in a memory balloon as well as it keeps the callback pointers 
to functions that will make those pages mobility magic happens.



> > +#else
> > +static inline bool is_balloon_page(struct page *page)       { return false; }
> > +static inline bool isolate_balloon_page(struct page *page)  { return false; }
> > +static inline bool putback_balloon_page(struct page *page)  { return false; }
> > +#endif /* (VIRTIO_BALLOON || VIRTIO_BALLOON_MODULE) && COMPACTION */
> > +
> >  #endif /* __KERNEL__ */
> >  #endif /* _LINUX_MM_H */
> > diff --git a/mm/compaction.c b/mm/compaction.c
> > index 7ea259d..6c6e572 100644
> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -14,6 +14,7 @@
> >  #include <linux/backing-dev.h>
> >  #include <linux/sysctl.h>
> >  #include <linux/sysfs.h>
> > +#include <linux/export.h>
> >  #include "internal.h"
> >  
> >  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> > @@ -312,32 +313,40 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >  			continue;
> >  		}
> >  
> > -		if (!PageLRU(page))
> > -			continue;
> > -
> >  		/*
> > -		 * PageLRU is set, and lru_lock excludes isolation,
> > -		 * splitting and collapsing (collapsing has already
> > -		 * happened if PageLRU is set).
> > +		 * It is possible to migrate LRU pages and balloon pages.
> > +		 * Skip any other type of page.
> >  		 */
> > -		if (PageTransHuge(page)) {
> > -			low_pfn += (1 << compound_order(page)) - 1;
> > -			continue;
> > -		}
> > +		if (likely(PageLRU(page))) {
> 
> 
> We can't make sure it is likely because there might be so many pages for kernel.
> 
I thought that by that far in codepath that would be the likelihood since most
pages of an ordinary workload will be at LRU lists. Following that idea, it
sounded neat to hint the compiler to not branch for that block. But, if in the
end that is just a "bad hint", I'll get rid of it right away.


> > +			/*
> > +			 * PageLRU is set, and lru_lock excludes isolation,
> > +			 * splitting and collapsing (collapsing has already
> > +			 * happened if PageLRU is set).
> > +			 */
> > +			if (PageTransHuge(page)) {
> > +				low_pfn += (1 << compound_order(page)) - 1;
> > +				continue;
> > +			}
> >  
> > -		if (!cc->sync)
> > -			mode |= ISOLATE_ASYNC_MIGRATE;
> > +			if (!cc->sync)
> > +				mode |= ISOLATE_ASYNC_MIGRATE;
> >  
> > -		lruvec = mem_cgroup_page_lruvec(page, zone);
> > +			lruvec = mem_cgroup_page_lruvec(page, zone);
> >  
> > -		/* Try isolate the page */
> > -		if (__isolate_lru_page(page, mode) != 0)
> > -			continue;
> > +			/* Try isolate the page */
> > +			if (__isolate_lru_page(page, mode) != 0)
> > +				continue;
> >  
> > -		VM_BUG_ON(PageTransCompound(page));
> > +			VM_BUG_ON(PageTransCompound(page));
> > +
> > +			/* Successfully isolated */
> > +			del_page_from_lru_list(page, lruvec, page_lru(page));
> > +		} else if (is_balloon_page(page)) {
> > +			if (!isolate_balloon_page(page))
> > +				continue;
> > +		} else
> > +			continue;
> >  
> > -		/* Successfully isolated */
> > -		del_page_from_lru_list(page, lruvec, page_lru(page));
> >  		list_add(&page->lru, migratelist);
> >  		cc->nr_migratepages++;
> >  		nr_isolated++;
> > @@ -903,4 +912,67 @@ void compaction_unregister_node(struct node *node)
> >  }
> >  #endif /* CONFIG_SYSFS && CONFIG_NUMA */
> >  
> > +#if defined(CONFIG_VIRTIO_BALLOON) || defined(CONFIG_VIRTIO_BALLOON_MODULE)
> > +/*
> > + * Balloon pages special page->mapping.
> > + * users must properly allocate and initialize an instance of balloon_mapping,
> > + * and set it as the page->mapping for balloon enlisted page instances.
> > + *
> > + * address_space_operations necessary methods for ballooned pages:
> > + *   .migratepage    - used to perform balloon's page migration (as is)
> > + *   .invalidatepage - used to isolate a page from balloon's page list
> > + *   .freepage       - used to reinsert an isolated page to balloon's page list
> > + */
> > +struct address_space *balloon_mapping;
> > +EXPORT_SYMBOL_GPL(balloon_mapping);
> > +
> > +/* __isolate_lru_page() counterpart for a ballooned page */
> > +bool isolate_balloon_page(struct page *page)
> > +{
> > +	if (WARN_ON(!is_balloon_page(page)))
> > +		return false;
> > +
> > +	if (likely(get_page_unless_zero(page))) {
> > +		/*
> > +		 * We can race against move_to_new_page() & __unmap_and_move().
> > +		 * If we stumble across a locked balloon page and succeed on
> > +		 * isolating it, the result tends to be disastrous.
> > +		 */
> > +		if (likely(trylock_page(page))) {
> > +			/*
> > +			 * A ballooned page, by default, has just one refcount.
> > +			 * Prevent concurrent compaction threads from isolating
> > +			 * an already isolated balloon page.
> > +			 */
> > +			if (is_balloon_page(page) && (page_count(page) == 2)) {
> > +				page->mapping->a_ops->invalidatepage(page, 0);
> 
> 
> Could you add more meaningful name wrapping raw invalidatepage?
> But I don't know what is proper name. ;)
> 
If I understood you correctely, your suggestion is to add two extra callback
pointers to struct address_space_operations, instead of re-using those which are
already there, and are suitable for the mission. Is this really necessary? It
seems just like unecessary bloat to struct address_space_operations, IMHO.


> 
> > +				unlock_page(page);
> > +				return true;
> > +			}
> > +			unlock_page(page);
> > +		}
> > +		/* Drop refcount taken for this already isolated page */
> > +		put_page(page);
> > +	}
> > +	return false;
> > +}
> > +
> > +/* putback_lru_page() counterpart for a ballooned page */
> > +bool putback_balloon_page(struct page *page)
> > +{
> > +	if (WARN_ON(!is_balloon_page(page)))
> > +		return false;
> > +
> > +	if (likely(trylock_page(page))) {
> > +		if(is_balloon_page(page)) {
> > +			page->mapping->a_ops->freepage(page);
> 
> 
> Ditto.
> 
> > +			put_page(page);
> > +			unlock_page(page);
> > +			return true;
> > +		}
> > +		unlock_page(page);
> > +	}
> > +	return false;
> > +}
> > +#endif /* CONFIG_VIRTIO_BALLOON || CONFIG_VIRTIO_BALLOON_MODULE */
> >  #endif /* CONFIG_COMPACTION */
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index be26d5c..59c7bc5 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -78,7 +78,10 @@ void putback_lru_pages(struct list_head *l)
> >  		list_del(&page->lru);
> >  		dec_zone_page_state(page, NR_ISOLATED_ANON +
> >  				page_is_file_cache(page));
> > -		putback_lru_page(page);
> > +		if (unlikely(is_balloon_page(page)))
> > +			WARN_ON(!putback_balloon_page(page));
> > +		else
> > +			putback_lru_page(page);
> >  	}
> >  }
> >  
> > @@ -783,6 +786,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
> >  		}
> >  	}
> >  
> > +	if (is_balloon_page(page)) {
> > +		/*
> > +		 * A ballooned page does not need any special attention from
> > +		 * physical to virtual reverse mapping procedures.
> > +		 * Skip any attempt to unmap PTEs or to remap swap cache,
> > +		 * in order to avoid burning cycles at rmap level.
> > +		 */
> > +		remap_swapcache = 0;
> > +		goto skip_unmap;
> > +	}
> > +
> >  	/*
> >  	 * Corner case handling:
> >  	 * 1. When a new swap-cache page is read into, it is added to the LRU
> > @@ -852,6 +866,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
> >  			goto out;
> >  
> >  	rc = __unmap_and_move(page, newpage, force, offlining, mode);
> > +
> > +	if (is_balloon_page(newpage)) {
> > +		/*
> > +		 * A ballooned page has been migrated already. Now, it is the
> > +		 * time to wrap-up counters, handle the old page back to Buddy
> > +		 * and return.
> > +		 */
> > +		list_del(&page->lru);
> > +		dec_zone_page_state(page, NR_ISOLATED_ANON +
> > +				    page_is_file_cache(page));
> > +		put_page(page);
> > +		__free_page(page);
> > +		return rc;
> > +	}
> >  out:
> >  	if (rc != -EAGAIN) {
> >  		/*
> 
> 
> 
> -- 
> Kind regards,
> Minchan Kim

^ permalink raw reply

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Rafael Aquini @ 2012-06-29 17:43 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Rik van Riel, Konrad Rzeszutek Wilk, Michael S. Tsirkin,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <20120629153157.GB13141@csn.ul.ie>

On Fri, Jun 29, 2012 at 04:31:57PM +0100, Mel Gorman wrote:
> On Thu, Jun 28, 2012 at 06:49:39PM -0300, Rafael Aquini wrote:
> > This patch introduces the helper functions as well as the necessary changes
> > to teach compaction and migration bits how to cope with pages which are
> > part of a guest memory balloon, in order to make them movable by memory
> > compaction procedures.
> > 
> > Signed-off-by: Rafael Aquini <aquini@redhat.com>
> 
> I have two minor comments but it is not critical they be addressed. If you
> doa V3 then fix it but if it is picked up as it is, I'm ok with that.
> From a compaction point of view;
> 
> Acked-by: Mel Gorman <mel@csn.ul.ie>
> 
Thanks Mel!

> > ---
> >  include/linux/mm.h |   16 ++++++++
> >  mm/compaction.c    |  110 +++++++++++++++++++++++++++++++++++++++++++---------
> >  mm/migrate.c       |   30 +++++++++++++-
> >  3 files changed, 136 insertions(+), 20 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index b36d08c..35568fc 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1629,5 +1629,21 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
> >  static inline bool page_is_guard(struct page *page) { return false; }
> >  #endif /* CONFIG_DEBUG_PAGEALLOC */
> >  
> > +#if (defined(CONFIG_VIRTIO_BALLOON) || \
> > +	defined(CONFIG_VIRTIO_BALLOON_MODULE)) && defined(CONFIG_COMPACTION)
> > +extern bool isolate_balloon_page(struct page *);
> > +extern bool putback_balloon_page(struct page *);
> > +extern struct address_space *balloon_mapping;
> > +
> > +static inline bool is_balloon_page(struct page *page)
> > +{
> > +        return (page->mapping == balloon_mapping) ? true : false;
> > +}
> > +#else
> > +static inline bool is_balloon_page(struct page *page)       { return false; }
> > +static inline bool isolate_balloon_page(struct page *page)  { return false; }
> > +static inline bool putback_balloon_page(struct page *page)  { return false; }
> > +#endif /* (VIRTIO_BALLOON || VIRTIO_BALLOON_MODULE) && COMPACTION */
> > +
> >  #endif /* __KERNEL__ */
> >  #endif /* _LINUX_MM_H */
> 
> isolate_balloon_page is only used in compaction.c and could declared static
> to compaction.c. You could move them all between struct compact_control
> and release_pages to avoid forward declarations.
> 
Sounds good, will do it.


> > <SNIP>
> > +/* putback_lru_page() counterpart for a ballooned page */
> > +bool putback_balloon_page(struct page *page)
> > +{
> > +	if (WARN_ON(!is_balloon_page(page)))
> > +		return false;
> > +
> > +	if (likely(trylock_page(page))) {
> > +		if(is_balloon_page(page)) {
> 
> Stick a space in there. Run checkpatch.pl if you haven't already. I suspect
> it would have caught this.
> 
Ugh! Sorry, that one has completely escaped me... :( I'll fix it right away.


> As I said, not worth spinning a V3 for :)
> 
I'll respin a v3 to address your suggestions and a couple extra points Minchan
has raised. If anything else pops up to your eyes, please, let me know.

Rafael

^ permalink raw reply

* Re: [PATCH v2 0/4] make balloon pages movable by compaction
From: Rafael Aquini @ 2012-06-29 17:46 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Rik van Riel, Michael S. Tsirkin, Konrad Rzeszutek Wilk,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <87r4syzqkn.fsf@rustcorp.com.au>

On Fri, Jun 29, 2012 at 02:01:52PM +0930, Rusty Russell wrote:
> On Thu, 28 Jun 2012 18:49:38 -0300, Rafael Aquini <aquini@redhat.com> wrote:
> > This patchset follows the main idea discussed at 2012 LSFMMS section:
> > "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/
> > 
> > to introduce the required changes to the virtio_balloon driver, as well as
> > changes to the core compaction & migration bits, in order to allow
> > memory balloon pages become movable within a guest.
> > 
> > Rafael Aquini (4):
> >   mm: introduce compaction and migration for virtio ballooned pages
> >   virtio_balloon: handle concurrent accesses to virtio_balloon struct
> >     elements
> >   virtio_balloon: introduce migration primitives to balloon pages
> >   mm: add vm event counters for balloon pages compaction
> > 
> >  drivers/virtio/virtio_balloon.c |  142 +++++++++++++++++++++++++++++++++++----
> >  include/linux/mm.h              |   16 +++++
> >  include/linux/virtio_balloon.h  |    6 ++
> >  include/linux/vm_event_item.h   |    2 +
> >  mm/compaction.c                 |  111 ++++++++++++++++++++++++------
> >  mm/migrate.c                    |   32 ++++++++-
> >  mm/vmstat.c                     |    4 ++
> >  7 files changed, 280 insertions(+), 33 deletions(-)
> > 
> > 
> > V2: address Mel Gorman's review comments
> 
> If Mel is happy, I am happy.  Seems sensible that the virtio_baloon
> changes go in at the same time as the mm changes, so:
> 
> Acked-by: Rusty Russell <rusty@rustcorp.com.au>

Thanks Rusty!

I'll be respinning a v3 to address some extra suggestions Mel did, so if you
have any concern that you'd like me to address, do not hesitate on letting me
know! 

Cheers!
Rafael

^ permalink raw reply

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Minchan Kim @ 2012-06-29 22:03 UTC (permalink / raw)
  To: Rafael Aquini
  Cc: Rik van Riel, Konrad Rzeszutek Wilk, Michael S. Tsirkin,
	linux-kernel, virtualization, linux-mm, Minchan Kim, Andi Kleen,
	Andrew Morton
In-Reply-To: <20120629173653.GA1774@t510.redhat.com>

Hi Rafael,

On Fri, Jun 29, 2012 at 02:36:54PM -0300, Rafael Aquini wrote:
> On Fri, Jun 29, 2012 at 02:32:11PM +0900, Minchan Kim wrote:
> > On 06/29/2012 06:49 AM, Rafael Aquini wrote:
> > 
> > > This patch introduces the helper functions as well as the necessary changes
> > > to teach compaction and migration bits how to cope with pages which are
> > > part of a guest memory balloon, in order to make them movable by memory
> > > compaction procedures.
> > > 
> > > Signed-off-by: Rafael Aquini <aquini@redhat.com>
> > 
> > 
> > Just a few comment but not critical. :)
> > 
> > > ---
> > >  include/linux/mm.h |   16 ++++++++
> > >  mm/compaction.c    |  110 +++++++++++++++++++++++++++++++++++++++++++---------
> > >  mm/migrate.c       |   30 +++++++++++++-
> > >  3 files changed, 136 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index b36d08c..35568fc 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -1629,5 +1629,21 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
> > >  static inline bool page_is_guard(struct page *page) { return false; }
> > >  #endif /* CONFIG_DEBUG_PAGEALLOC */
> > >  
> > > +#if (defined(CONFIG_VIRTIO_BALLOON) || \
> > > +	defined(CONFIG_VIRTIO_BALLOON_MODULE)) && defined(CONFIG_COMPACTION)
> > > +extern bool isolate_balloon_page(struct page *);
> > > +extern bool putback_balloon_page(struct page *);
> > > +extern struct address_space *balloon_mapping;
> > > +
> > > +static inline bool is_balloon_page(struct page *page)
> > > +{
> > > +        return (page->mapping == balloon_mapping) ? true : false;
> > > +}
> > 
> > 
> > What lock should it protect?
> > 
> I'm afraid I didn't quite get what you meant by that question. If you were
> referring to lock protection to the address_space balloon_mapping, we don't need
> it. balloon_mapping, once initialized lives forever (as long as driver is
> loaded, actually) as a static reference that just helps us on identifying pages 
> that are enlisted in a memory balloon as well as it keeps the callback pointers 
> to functions that will make those pages mobility magic happens.

Thanks. That's what I want to know.
If anyone(like me don't know of ballooning in detail) see this, it would be very helpful.

> 
> 
> 
> > > +#else
> > > +static inline bool is_balloon_page(struct page *page)       { return false; }
> > > +static inline bool isolate_balloon_page(struct page *page)  { return false; }
> > > +static inline bool putback_balloon_page(struct page *page)  { return false; }
> > > +#endif /* (VIRTIO_BALLOON || VIRTIO_BALLOON_MODULE) && COMPACTION */
> > > +
> > >  #endif /* __KERNEL__ */
> > >  #endif /* _LINUX_MM_H */
> > > diff --git a/mm/compaction.c b/mm/compaction.c
> > > index 7ea259d..6c6e572 100644
> > > --- a/mm/compaction.c
> > > +++ b/mm/compaction.c
> > > @@ -14,6 +14,7 @@
> > >  #include <linux/backing-dev.h>
> > >  #include <linux/sysctl.h>
> > >  #include <linux/sysfs.h>
> > > +#include <linux/export.h>
> > >  #include "internal.h"
> > >  
> > >  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> > > @@ -312,32 +313,40 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> > >  			continue;
> > >  		}
> > >  
> > > -		if (!PageLRU(page))
> > > -			continue;
> > > -
> > >  		/*
> > > -		 * PageLRU is set, and lru_lock excludes isolation,
> > > -		 * splitting and collapsing (collapsing has already
> > > -		 * happened if PageLRU is set).
> > > +		 * It is possible to migrate LRU pages and balloon pages.
> > > +		 * Skip any other type of page.
> > >  		 */
> > > -		if (PageTransHuge(page)) {
> > > -			low_pfn += (1 << compound_order(page)) - 1;
> > > -			continue;
> > > -		}
> > > +		if (likely(PageLRU(page))) {
> > 
> > 
> > We can't make sure it is likely because there might be so many pages for kernel.
> > 
> I thought that by that far in codepath that would be the likelihood since most
> pages of an ordinary workload will be at LRU lists. Following that idea, it
> sounded neat to hint the compiler to not branch for that block. But, if in the
> end that is just a "bad hint", I'll get rid of it right away.

Yeb. I hope you remove this.
If you want really, it should be separated patch because it's not related to your
series.

> 
> 
> > > +			/*
> > > +			 * PageLRU is set, and lru_lock excludes isolation,
> > > +			 * splitting and collapsing (collapsing has already
> > > +			 * happened if PageLRU is set).
> > > +			 */
> > > +			if (PageTransHuge(page)) {
> > > +				low_pfn += (1 << compound_order(page)) - 1;
> > > +				continue;
> > > +			}
> > >  
> > > -		if (!cc->sync)
> > > -			mode |= ISOLATE_ASYNC_MIGRATE;
> > > +			if (!cc->sync)
> > > +				mode |= ISOLATE_ASYNC_MIGRATE;
> > >  
> > > -		lruvec = mem_cgroup_page_lruvec(page, zone);
> > > +			lruvec = mem_cgroup_page_lruvec(page, zone);
> > >  
> > > -		/* Try isolate the page */
> > > -		if (__isolate_lru_page(page, mode) != 0)
> > > -			continue;
> > > +			/* Try isolate the page */
> > > +			if (__isolate_lru_page(page, mode) != 0)
> > > +				continue;
> > >  
> > > -		VM_BUG_ON(PageTransCompound(page));
> > > +			VM_BUG_ON(PageTransCompound(page));
> > > +
> > > +			/* Successfully isolated */
> > > +			del_page_from_lru_list(page, lruvec, page_lru(page));
> > > +		} else if (is_balloon_page(page)) {
> > > +			if (!isolate_balloon_page(page))
> > > +				continue;
> > > +		} else
> > > +			continue;
> > >  
> > > -		/* Successfully isolated */
> > > -		del_page_from_lru_list(page, lruvec, page_lru(page));
> > >  		list_add(&page->lru, migratelist);
> > >  		cc->nr_migratepages++;
> > >  		nr_isolated++;
> > > @@ -903,4 +912,67 @@ void compaction_unregister_node(struct node *node)
> > >  }
> > >  #endif /* CONFIG_SYSFS && CONFIG_NUMA */
> > >  
> > > +#if defined(CONFIG_VIRTIO_BALLOON) || defined(CONFIG_VIRTIO_BALLOON_MODULE)
> > > +/*
> > > + * Balloon pages special page->mapping.
> > > + * users must properly allocate and initialize an instance of balloon_mapping,
> > > + * and set it as the page->mapping for balloon enlisted page instances.
> > > + *
> > > + * address_space_operations necessary methods for ballooned pages:
> > > + *   .migratepage    - used to perform balloon's page migration (as is)
> > > + *   .invalidatepage - used to isolate a page from balloon's page list
> > > + *   .freepage       - used to reinsert an isolated page to balloon's page list
> > > + */
> > > +struct address_space *balloon_mapping;
> > > +EXPORT_SYMBOL_GPL(balloon_mapping);
> > > +
> > > +/* __isolate_lru_page() counterpart for a ballooned page */
> > > +bool isolate_balloon_page(struct page *page)
> > > +{
> > > +	if (WARN_ON(!is_balloon_page(page)))
> > > +		return false;
> > > +
> > > +	if (likely(get_page_unless_zero(page))) {
> > > +		/*
> > > +		 * We can race against move_to_new_page() & __unmap_and_move().
> > > +		 * If we stumble across a locked balloon page and succeed on
> > > +		 * isolating it, the result tends to be disastrous.
> > > +		 */
> > > +		if (likely(trylock_page(page))) {
> > > +			/*
> > > +			 * A ballooned page, by default, has just one refcount.
> > > +			 * Prevent concurrent compaction threads from isolating
> > > +			 * an already isolated balloon page.
> > > +			 */
> > > +			if (is_balloon_page(page) && (page_count(page) == 2)) {
> > > +				page->mapping->a_ops->invalidatepage(page, 0);
> > 
> > 
> > Could you add more meaningful name wrapping raw invalidatepage?
> > But I don't know what is proper name. ;)
> > 
> If I understood you correctely, your suggestion is to add two extra callback
> pointers to struct address_space_operations, instead of re-using those which are
> already there, and are suitable for the mission. Is this really necessary? It
> seems just like unecessary bloat to struct address_space_operations, IMHO.

I meant this. :)

void isolate_page_from_balloonlist(struct page* page)
{
	page->mapping->a_ops->invalidatepage(page, 0);
}

	if (is_balloon_page(page) && (page_count(page) == 2)) {
		isolate_page_from_balloonlist(page);
	}

Thanks!

> -- 
> Kind regards,
> Minchan Kim

^ permalink raw reply

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Rafael Aquini @ 2012-06-30  1:34 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Rik van Riel, Michael S. Tsirkin, Konrad Rzeszutek Wilk,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <20120629220333.GA2079@barrios>

Howdy Minchan,

On Sat, Jun 30, 2012 at 07:03:33AM +0900, Minchan Kim wrote:
> > > > +static inline bool is_balloon_page(struct page *page)
> > > > +{
> > > > +        return (page->mapping == balloon_mapping) ? true : false;
> > > > +}
> > > 
> > > 
> > > What lock should it protect?
> > > 
> > I'm afraid I didn't quite get what you meant by that question. If you were
> > referring to lock protection to the address_space balloon_mapping, we don't need
> > it. balloon_mapping, once initialized lives forever (as long as driver is
> > loaded, actually) as a static reference that just helps us on identifying pages 
> > that are enlisted in a memory balloon as well as it keeps the callback pointers 
> > to functions that will make those pages mobility magic happens.
> 
> Thanks. That's what I want to know.
> If anyone(like me don't know of ballooning in detail) see this, it would be very helpful.
> 
Good point! I'll make sure this explanation gets properly registered either at commit log or
at a comment along with balloon_mapping declaration, then.



> > > > +		if (likely(PageLRU(page))) {
> > > 
> > > 
> > > We can't make sure it is likely because there might be so many pages for kernel.
> > > 
> > I thought that by that far in codepath that would be the likelihood since most
> > pages of an ordinary workload will be at LRU lists. Following that idea, it
> > sounded neat to hint the compiler to not branch for that block. But, if in the
> > end that is just a "bad hint", I'll get rid of it right away.
> 
> Yeb. I hope you remove this.
> If you want really, it should be separated patch because it's not related to your
> series.
> 
That will be removed, then.



> > > > +/* __isolate_lru_page() counterpart for a ballooned page */
> > > > +bool isolate_balloon_page(struct page *page)
> > > > +{
> > > > +	if (WARN_ON(!is_balloon_page(page)))
> > > > +		return false;
> > > > +
> > > > +	if (likely(get_page_unless_zero(page))) {
> > > > +		/*
> > > > +		 * We can race against move_to_new_page() & __unmap_and_move().
> > > > +		 * If we stumble across a locked balloon page and succeed on
> > > > +		 * isolating it, the result tends to be disastrous.
> > > > +		 */
> > > > +		if (likely(trylock_page(page))) {
> > > > +			/*
> > > > +			 * A ballooned page, by default, has just one refcount.
> > > > +			 * Prevent concurrent compaction threads from isolating
> > > > +			 * an already isolated balloon page.
> > > > +			 */
> > > > +			if (is_balloon_page(page) && (page_count(page) == 2)) {
> > > > +				page->mapping->a_ops->invalidatepage(page, 0);
> > > 
> > > 
> > > Could you add more meaningful name wrapping raw invalidatepage?
> > > But I don't know what is proper name. ;)
> > > 
> > If I understood you correctely, your suggestion is to add two extra callback
> > pointers to struct address_space_operations, instead of re-using those which are
> > already there, and are suitable for the mission. Is this really necessary? It
> > seems just like unecessary bloat to struct address_space_operations, IMHO.
> 
> I meant this. :)
> 
> void isolate_page_from_balloonlist(struct page* page)
> {
> 	page->mapping->a_ops->invalidatepage(page, 0);
> }
> 
> 	if (is_balloon_page(page) && (page_count(page) == 2)) {
> 		isolate_page_from_balloonlist(page);
> 	}
> 
Humm, my feelings on your approach here: just an unecessary indirection that
doesn't bring the desired code readability improvement.
If the header comment statement on balloon_mapping->a_ops is not clear enough 
on those methods usage for ballooned pages:

..... 
/*
 * Balloon pages special page->mapping.
 * users must properly allocate and initialize an instance of balloon_mapping,
 * and set it as the page->mapping for balloon enlisted page instances.
 *
 * address_space_operations necessary methods for ballooned pages:
 *   .migratepage    - used to perform balloon's page migration (as is)
 *   .invalidatepage - used to isolate a page from balloon's page list
 *   .freepage       - used to reinsert an isolated page to balloon's page list
 */
struct address_space *balloon_mapping;
EXPORT_SYMBOL_GPL(balloon_mapping);
.....

I can add an extra commentary, to recollect folks about that usage, next to the
points where those callbacks are used at isolate_balloon_page() &
putback_balloon_page(). What do you think?


> Thanks!
> 
Thank you for such attention and valuable feedback! Have a nice weekend!

Rafael

^ permalink raw reply

* Re: [patch net-next v2 0/4] net: introduce and use IFF_LIFE_ADDR_CHANGE
From: David Miller @ 2012-06-30  8:08 UTC (permalink / raw)
  To: jpirko
  Cc: mst, netdev, shimoda.hiroaki, virtualization, danny.kukawka,
	edumazet
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

From: Jiri Pirko <jpirko@redhat.com>
Date: Fri, 29 Jun 2012 17:10:04 +0200

> three drivers updated, but this can be used in many others.
> 
> v1->v2:
> %s/LIFE/LIVE
> 
> Jiri Pirko (4):
>   net: introduce new priv_flag indicating iface capable of change mac
>     when running
>   virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
>   team: use IFF_LIVE_ADDR_CHANGE priv_flag
>   dummy: use IFF_LIVE_ADDR_CHANGE priv_flag

Applied, thanks Jiri.

^ permalink raw reply

* RFD: virtio balloon API use (was Re: [PATCH 5 of 5] virtio: expose added descriptors immediately)
From: Michael S. Tsirkin @ 2012-07-01  9:20 UTC (permalink / raw)
  To: Rusty Russell; +Cc: virtualization, Rafael Aquini, kvm, linux-kernel
In-Reply-To: <f3831c1617c86a51d875.1320306173@localhost6.localdomain6>

On Thu, Nov 03, 2011 at 06:12:53PM +1030, Rusty Russell wrote:
> A virtio driver does virtqueue_add_buf() multiple times before finally
> calling virtqueue_kick(); previously we only exposed the added buffers
> in the virtqueue_kick() call.  This means we don't need a memory
> barrier in virtqueue_add_buf(), but it reduces concurrency as the
> device (ie. host) can't see the buffers until the kick.
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

Looking at recent mm compaction patches made me look at locking
in balloon closely. And I noticed the referenced patch (commit
ee7cd8981e15bcb365fc762afe3fc47b8242f630 upstream) interacts strangely
with virtio balloon; balloon currently does:

static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
{
        struct scatterlist sg;

        sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);

        init_completion(&vb->acked);

        /* We should always be able to add one buffer to an empty queue. */
        if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
                BUG();
        virtqueue_kick(vq);

        /* When host has read buffer, this completes via balloon_ack */
        wait_for_completion(&vb->acked);
}


While vq callback does:

static void balloon_ack(struct virtqueue *vq)
{
        struct virtio_balloon *vb;
        unsigned int len;

        vb = virtqueue_get_buf(vq, &len);
        if (vb)
                complete(&vb->acked);
}


So virtqueue_get_buf might now run concurrently with virtqueue_kick.
I audited both and this seems safe in practice but I think
we need to either declare this legal at the API level
or add locking in driver.

Further, is there a guarantee that we never get
spurious callbacks? We currently check ring not empty
but esp for non shared MSI this might not be needed.
If a spurious callback triggers, virtqueue_get_buf can run
concurrently with virtqueue_add_buf which is known to be racy.
Again I think this is currently safe as no spurious callbacks in
practice but should we guarantee no spurious callbacks at the API level
or add locking in driver?

-- 
MST

^ permalink raw reply

* Re: [RFC V2 PATCH 4/4] virtio-net: add multiqueue support
From: Michael S. Tsirkin @ 2012-07-01  9:43 UTC (permalink / raw)
  To: Jason Wang
  Cc: krkumar2, habanero, aliguori, mashirle, kvm, qemu-devel,
	virtualization, tahm, jwhan
In-Reply-To: <20120625100449.8096.65545.stgit@amd-6168-8-1.englab.nay.redhat.com>

On Mon, Jun 25, 2012 at 06:04:49PM +0800, Jason Wang wrote:
> This patch let the virtio-net can transmit and recevie packets through multiuple
> VLANClientStates and abstract them as multiple virtqueues to guest. A new
> parameter 'queues' were introduced to specify the number of queue pairs.
> 
> The main goal for vhost support is to let the multiqueue could be used without
> changes in vhost code. So each vhost_net structure were used to track a single
> VLANClientState and two virtqueues in the past. As multiple VLANClientState were
> stored in the NICState, we can infer the correspond VLANClientState from this
> and queue_index easily.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Can this patch be split up?
1. extend vhost API to allow multiqueue and minimally tweak virtio
2. add real multiqueue for virtio

Hmm?

> ---
>  hw/vhost.c      |   58 ++++---
>  hw/vhost.h      |    1 
>  hw/vhost_net.c  |    7 +
>  hw/vhost_net.h  |    2 
>  hw/virtio-net.c |  461 +++++++++++++++++++++++++++++++++++++------------------
>  hw/virtio-net.h |    3 
>  6 files changed, 355 insertions(+), 177 deletions(-)
> 
> diff --git a/hw/vhost.c b/hw/vhost.c
> index 43664e7..6318bb2 100644
> --- a/hw/vhost.c
> +++ b/hw/vhost.c
> @@ -620,11 +620,12 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
>  {
>      target_phys_addr_t s, l, a;
>      int r;
> +    int vhost_vq_index = (idx > 2 ? idx - 1 : idx) % dev->nvqs;
>      struct vhost_vring_file file = {
> -        .index = idx,
> +	.index = vhost_vq_index
>      };
>      struct vhost_vring_state state = {
> -        .index = idx,
> +        .index = vhost_vq_index
>      };
>      struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
>  
> @@ -670,11 +671,12 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
>          goto fail_alloc_ring;
>      }
>  
> -    r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
> +    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
>      if (r < 0) {
>          r = -errno;
>          goto fail_alloc;
>      }
> +
>      file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
>      r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
>      if (r) {
> @@ -715,7 +717,7 @@ static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
>                                      unsigned idx)
>  {
>      struct vhost_vring_state state = {
> -        .index = idx,
> +        .index = (idx > 2 ? idx - 1 : idx) % dev->nvqs,
>      };
>      int r;
>      r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
> @@ -829,7 +831,9 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
>      }
>  
>      for (i = 0; i < hdev->nvqs; ++i) {
> -        r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, true);
> +        r = vdev->binding->set_host_notifier(vdev->binding_opaque,
> +					     hdev->start_idx + i,
> +					     true);
>          if (r < 0) {
>              fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
>              goto fail_vq;
> @@ -839,7 +843,9 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
>      return 0;
>  fail_vq:
>      while (--i >= 0) {
> -        r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
> +        r = vdev->binding->set_host_notifier(vdev->binding_opaque,
> +					     hdev->start_idx + i,
> +					     false);
>          if (r < 0) {
>              fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
>              fflush(stderr);
> @@ -860,7 +866,9 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
>      int i, r;
>  
>      for (i = 0; i < hdev->nvqs; ++i) {
> -        r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
> +        r = vdev->binding->set_host_notifier(vdev->binding_opaque,
> +					     hdev->start_idx + i,
> +					     false);
>          if (r < 0) {
>              fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
>              fflush(stderr);
> @@ -874,15 +882,17 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
>  {
>      int i, r;
>      if (!vdev->binding->set_guest_notifiers) {
> -        fprintf(stderr, "binding does not support guest notifiers\n");
> +        fprintf(stderr, "binding does not support guest notifier\n");
>          r = -ENOSYS;
>          goto fail;
>      }
>  
> -    r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, true);
> -    if (r < 0) {
> -        fprintf(stderr, "Error binding guest notifier: %d\n", -r);
> -        goto fail_notifiers;
> +    if (hdev->start_idx == 0) {
> +        r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, true);
> +        if (r < 0) {
> +            fprintf(stderr, "Error binding guest notifier: %d\n", -r);
> +            goto fail_notifiers;
> +        }
>      }
>  
>      r = vhost_dev_set_features(hdev, hdev->log_enabled);
> @@ -898,7 +908,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
>          r = vhost_virtqueue_init(hdev,
>                                   vdev,
>                                   hdev->vqs + i,
> -                                 i);
> +                                 hdev->start_idx + i);
>          if (r < 0) {
>              goto fail_vq;
>          }
> @@ -925,11 +935,13 @@ fail_vq:
>          vhost_virtqueue_cleanup(hdev,
>                                  vdev,
>                                  hdev->vqs + i,
> -                                i);
> +                                hdev->start_idx + i);
>      }
> +    i = hdev->nvqs;
>  fail_mem:
>  fail_features:
> -    vdev->binding->set_guest_notifiers(vdev->binding_opaque, false);
> +    if (hdev->start_idx == 0)
> +        vdev->binding->set_guest_notifiers(vdev->binding_opaque, false);
>  fail_notifiers:
>  fail:
>      return r;
> @@ -944,18 +956,22 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>          vhost_virtqueue_cleanup(hdev,
>                                  vdev,
>                                  hdev->vqs + i,
> -                                i);
> +                                hdev->start_idx + i);
>      }
> +
>      for (i = 0; i < hdev->n_mem_sections; ++i) {
>          vhost_sync_dirty_bitmap(hdev, &hdev->mem_sections[i],
>                                  0, (target_phys_addr_t)~0x0ull);
>      }
> -    r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, false);
> -    if (r < 0) {
> -        fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", r);
> -        fflush(stderr);
> +
> +    if (hdev->start_idx == 0) {
> +	r = vdev->binding->set_guest_notifiers(vdev->binding_opaque, false);
> +	if (r < 0) {
> +	    fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", r);
> +	    fflush(stderr);
> +	}
> +	assert (r >= 0);
>      }
> -    assert (r >= 0);
>  
>      hdev->started = false;
>      g_free(hdev->log);
> diff --git a/hw/vhost.h b/hw/vhost.h
> index 80e64df..fa5357a 100644
> --- a/hw/vhost.h
> +++ b/hw/vhost.h
> @@ -34,6 +34,7 @@ struct vhost_dev {
>      MemoryRegionSection *mem_sections;
>      struct vhost_virtqueue *vqs;
>      int nvqs;
> +    int start_idx;
>      unsigned long long features;
>      unsigned long long acked_features;
>      unsigned long long backend_features;
> diff --git a/hw/vhost_net.c b/hw/vhost_net.c
> index f672e9d..73a72bb 100644
> --- a/hw/vhost_net.c
> +++ b/hw/vhost_net.c
> @@ -138,13 +138,15 @@ bool vhost_net_query(VHostNetState *net, VirtIODevice *dev)
>  }
>  
>  int vhost_net_start(struct vhost_net *net,
> -                    VirtIODevice *dev)
> +                    VirtIODevice *dev,
> +                    int start_idx)
>  {
>      struct vhost_vring_file file = { };
>      int r;
>  
>      net->dev.nvqs = 2;
>      net->dev.vqs = net->vqs;
> +    net->dev.start_idx = start_idx;
>  
>      r = vhost_dev_enable_notifiers(&net->dev, dev);
>      if (r < 0) {
> @@ -227,7 +229,8 @@ bool vhost_net_query(VHostNetState *net, VirtIODevice *dev)
>  }
>  
>  int vhost_net_start(struct vhost_net *net,
> -		    VirtIODevice *dev)
> +                    VirtIODevice *dev,
> +                    int start_idx)
>  {
>      return -ENOSYS;
>  }
> diff --git a/hw/vhost_net.h b/hw/vhost_net.h
> index 91e40b1..79a4f09 100644
> --- a/hw/vhost_net.h
> +++ b/hw/vhost_net.h
> @@ -9,7 +9,7 @@ typedef struct vhost_net VHostNetState;
>  VHostNetState *vhost_net_init(VLANClientState *backend, int devfd, bool force);
>  
>  bool vhost_net_query(VHostNetState *net, VirtIODevice *dev);
> -int vhost_net_start(VHostNetState *net, VirtIODevice *dev);
> +int vhost_net_start(VHostNetState *net, VirtIODevice *dev, int start_idx);
>  void vhost_net_stop(VHostNetState *net, VirtIODevice *dev);
>  
>  void vhost_net_cleanup(VHostNetState *net);
> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
> index 3f190d4..d42c4cc 100644
> --- a/hw/virtio-net.c
> +++ b/hw/virtio-net.c
> @@ -26,34 +26,43 @@
>  #define MAC_TABLE_ENTRIES    64
>  #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
>  
> -typedef struct VirtIONet
> +struct VirtIONet;
> +
> +typedef struct VirtIONetQueue
>  {
> -    VirtIODevice vdev;
> -    uint8_t mac[ETH_ALEN];
> -    uint16_t status;
>      VirtQueue *rx_vq;
>      VirtQueue *tx_vq;
> -    VirtQueue *ctrl_vq;
> -    NICState *nic;
>      QEMUTimer *tx_timer;
>      QEMUBH *tx_bh;
>      uint32_t tx_timeout;
> -    int32_t tx_burst;
>      int tx_waiting;
> -    uint32_t has_vnet_hdr;
> -    uint8_t has_ufo;
>      struct {
>          VirtQueueElement elem;
>          ssize_t len;
>      } async_tx;
> +    struct VirtIONet *n;
> +    uint8_t vhost_started;
> +} VirtIONetQueue;
> +
> +typedef struct VirtIONet
> +{
> +    VirtIODevice vdev;
> +    uint8_t mac[ETH_ALEN];
> +    uint16_t status;
> +    VirtIONetQueue vqs[MAX_QUEUE_NUM];
> +    VirtQueue *ctrl_vq;
> +    NICState *nic;
> +    int32_t tx_burst;
> +    uint32_t has_vnet_hdr;
> +    uint8_t has_ufo;
>      int mergeable_rx_bufs;
> +    int multiqueue;
>      uint8_t promisc;
>      uint8_t allmulti;
>      uint8_t alluni;
>      uint8_t nomulti;
>      uint8_t nouni;
>      uint8_t nobcast;
> -    uint8_t vhost_started;
>      struct {
>          int in_use;
>          int first_multi;
> @@ -63,6 +72,7 @@ typedef struct VirtIONet
>      } mac_table;
>      uint32_t *vlans;
>      DeviceState *qdev;
> +    uint32_t queues;
>  } VirtIONet;
>  
>  /* TODO
> @@ -74,12 +84,25 @@ static VirtIONet *to_virtio_net(VirtIODevice *vdev)
>      return (VirtIONet *)vdev;
>  }
>  
> +static int vq_get_pair_index(VirtIONet *n, VirtQueue *vq)
> +{
> +    int i;
> +    for (i = 0; i < n->queues; i++) {
> +        if (n->vqs[i].tx_vq == vq || n->vqs[i].rx_vq == vq) {
> +            return i;
> +        }
> +    }
> +    assert(1);
> +    return -1;
> +}
> +
>  static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
>      struct virtio_net_config netcfg;
>  
>      stw_p(&netcfg.status, n->status);
> +    netcfg.queues = n->queues * 2;
>      memcpy(netcfg.mac, n->mac, ETH_ALEN);
>      memcpy(config, &netcfg, sizeof(netcfg));
>  }
> @@ -103,78 +126,140 @@ static bool virtio_net_started(VirtIONet *n, uint8_t status)
>          (n->status & VIRTIO_NET_S_LINK_UP) && n->vdev.vm_running;
>  }
>  
> -static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
> +static void nc_vhost_status(VLANClientState *nc, VirtIONet *n,
> +                            uint8_t status)
>  {
> -    if (!n->nic->nc.peer) {
> +    int queue_index = nc->queue_index;
> +    VLANClientState *peer = nc->peer;
> +    VirtIONetQueue *netq = &n->vqs[nc->queue_index];
> +
> +    if (!peer) {
>          return;
>      }
> -    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
> +    if (peer->info->type != NET_CLIENT_TYPE_TAP) {
>          return;
>      }
>  
> -    if (!tap_get_vhost_net(n->nic->nc.peer)) {
> +    if (!tap_get_vhost_net(peer)) {
>          return;
>      }
> -    if (!!n->vhost_started == virtio_net_started(n, status) &&
> -                              !n->nic->nc.peer->link_down) {
> +    if (!!netq->vhost_started == virtio_net_started(n, status) &&
> +                                 !peer->link_down) {
>          return;
>      }
> -    if (!n->vhost_started) {
> -        int r;
> -        if (!vhost_net_query(tap_get_vhost_net(n->nic->nc.peer), &n->vdev)) {
> +    if (!netq->vhost_started) {
> +	/* skip ctrl vq */
> +	int r, start_idx = queue_index == 0 ? 0 : queue_index * 2 + 1;
> +        if (!vhost_net_query(tap_get_vhost_net(peer), &n->vdev)) {
>              return;
>          }
> -        r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), &n->vdev);
> +        r = vhost_net_start(tap_get_vhost_net(peer), &n->vdev, start_idx);
>          if (r < 0) {
>              error_report("unable to start vhost net: %d: "
>                           "falling back on userspace virtio", -r);
>          } else {
> -            n->vhost_started = 1;
> +            netq->vhost_started = 1;
>          }
>      } else {
> -        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), &n->vdev);
> -        n->vhost_started = 0;
> +        vhost_net_stop(tap_get_vhost_net(peer), &n->vdev);
> +        netq->vhost_started = 0;
> +    }
> +}
> +
> +static int peer_attach(VirtIONet *n, int index)
> +{
> +    if (!n->nic->ncs[index]->peer) {
> +	return -1;
> +    }
> +
> +    if (n->nic->ncs[index]->peer->info->type != NET_CLIENT_TYPE_TAP) {
> +	return -1;
> +    }
> +
> +    return tap_attach(n->nic->ncs[index]->peer);
> +}
> +
> +static int peer_detach(VirtIONet *n, int index)
> +{
> +    if (!n->nic->ncs[index]->peer) {
> +	return -1;
> +    }
> +
> +    if (n->nic->ncs[index]->peer->info->type != NET_CLIENT_TYPE_TAP) {
> +	return -1;
> +    }
> +
> +    return tap_detach(n->nic->ncs[index]->peer);
> +}
> +
> +static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
> +{
> +    int i;
> +    for (i = 0; i < n->queues; i++) {
> +	if (!n->multiqueue && i != 0)
> +	    status = 0;
> +        nc_vhost_status(n->nic->ncs[i], n, status);
>      }
>  }
>  
>  static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
> +    int i;
>  
>      virtio_net_vhost_status(n, status);
>  
> -    if (!n->tx_waiting) {
> -        return;
> -    }
> +    for (i = 0; i < n->queues; i++) {
> +        VirtIONetQueue *netq = &n->vqs[i];
> +        if (!netq->tx_waiting) {
> +            continue;
> +        }
> +
> +	if (!n->multiqueue && i != 0)
> +	    status = 0;
>  
> -    if (virtio_net_started(n, status) && !n->vhost_started) {
> -        if (n->tx_timer) {
> -            qemu_mod_timer(n->tx_timer,
> -                           qemu_get_clock_ns(vm_clock) + n->tx_timeout);
> +        if (virtio_net_started(n, status) && !netq->vhost_started) {
> +            if (netq->tx_timer) {
> +                qemu_mod_timer(netq->tx_timer,
> +                               qemu_get_clock_ns(vm_clock) + netq->tx_timeout);
> +            } else {
> +                qemu_bh_schedule(netq->tx_bh);
> +            }
>          } else {
> -            qemu_bh_schedule(n->tx_bh);
> +            if (netq->tx_timer) {
> +                qemu_del_timer(netq->tx_timer);
> +            } else {
> +                qemu_bh_cancel(netq->tx_bh);
> +            }
>          }
> -    } else {
> -        if (n->tx_timer) {
> -            qemu_del_timer(n->tx_timer);
> -        } else {
> -            qemu_bh_cancel(n->tx_bh);
> +    }
> +}
> +
> +static bool virtio_net_is_link_up(VirtIONet *n)
> +{
> +    int i;
> +    for (i = 0; i < n->queues; i++) {
> +        if (n->nic->ncs[i]->link_down) {
> +            return false;
>          }
>      }
> +    return true;
>  }
>  
>  static void virtio_net_set_link_status(VLANClientState *nc)
>  {
> -    VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
> +    VirtIONet *n = ((NICState *)(nc->opaque))->opaque;
>      uint16_t old_status = n->status;
>  
> -    if (nc->link_down)
> +    if (virtio_net_is_link_up(n)) {
>          n->status &= ~VIRTIO_NET_S_LINK_UP;
> -    else
> +    } else {
>          n->status |= VIRTIO_NET_S_LINK_UP;
> +    }
>  
> -    if (n->status != old_status)
> +    if (n->status != old_status) {
>          virtio_notify_config(&n->vdev);
> +    }
>  
>      virtio_net_set_status(&n->vdev, n->vdev.status);
>  }
> @@ -202,13 +287,15 @@ static void virtio_net_reset(VirtIODevice *vdev)
>  
>  static int peer_has_vnet_hdr(VirtIONet *n)
>  {
> -    if (!n->nic->nc.peer)
> +    if (!n->nic->ncs[0]->peer) {
>          return 0;
> +    }
>  
> -    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP)
> +    if (n->nic->ncs[0]->peer->info->type != NET_CLIENT_TYPE_TAP) {
>          return 0;
> +    }
>  
> -    n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
> +    n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->ncs[0]->peer);
>  
>      return n->has_vnet_hdr;
>  }
> @@ -218,7 +305,7 @@ static int peer_has_ufo(VirtIONet *n)
>      if (!peer_has_vnet_hdr(n))
>          return 0;
>  
> -    n->has_ufo = tap_has_ufo(n->nic->nc.peer);
> +    n->has_ufo = tap_has_ufo(n->nic->ncs[0]->peer);
>  
>      return n->has_ufo;
>  }
> @@ -228,9 +315,13 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features)
>      VirtIONet *n = to_virtio_net(vdev);
>  
>      features |= (1 << VIRTIO_NET_F_MAC);
> +    features |= (1 << VIRTIO_NET_F_MULTIQUEUE);
>  
>      if (peer_has_vnet_hdr(n)) {
> -        tap_using_vnet_hdr(n->nic->nc.peer, 1);
> +        int i;
> +        for (i = 0; i < n->queues; i++) {
> +            tap_using_vnet_hdr(n->nic->ncs[i]->peer, 1);
> +        }
>      } else {
>          features &= ~(0x1 << VIRTIO_NET_F_CSUM);
>          features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4);
> @@ -248,14 +339,15 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features)
>          features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO);
>      }
>  
> -    if (!n->nic->nc.peer ||
> -        n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
> +    if (!n->nic->ncs[0]->peer ||
> +        n->nic->ncs[0]->peer->info->type != NET_CLIENT_TYPE_TAP) {
>          return features;
>      }
> -    if (!tap_get_vhost_net(n->nic->nc.peer)) {
> +    if (!tap_get_vhost_net(n->nic->ncs[0]->peer)) {
>          return features;
>      }
> -    return vhost_net_get_features(tap_get_vhost_net(n->nic->nc.peer), features);
> +    return vhost_net_get_features(tap_get_vhost_net(n->nic->ncs[0]->peer),
> +                                  features);
>  }
>  
>  static uint32_t virtio_net_bad_features(VirtIODevice *vdev)
> @@ -276,25 +368,38 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev)
>  static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
> +    int i, r;
>  
>      n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF));
> +    n->multiqueue = !!(features & (1 << VIRTIO_NET_F_MULTIQUEUE));
>  
> -    if (n->has_vnet_hdr) {
> -        tap_set_offload(n->nic->nc.peer,
> -                        (features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
> -                        (features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
> -                        (features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
> -                        (features >> VIRTIO_NET_F_GUEST_ECN)  & 1,
> -                        (features >> VIRTIO_NET_F_GUEST_UFO)  & 1);
> -    }
> -    if (!n->nic->nc.peer ||
> -        n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
> -        return;
> -    }
> -    if (!tap_get_vhost_net(n->nic->nc.peer)) {
> -        return;
> +    for (i = 0; i < n->queues; i++) {
> +        if (!n->multiqueue && i != 0) {
> +            r = peer_detach(n, i);
> +            assert(r == 0);
> +        } else {
> +            r = peer_attach(n, i);
> +            assert(r == 0);
> +
> +            if (n->has_vnet_hdr) {
> +                tap_set_offload(n->nic->ncs[i]->peer,
> +                                (features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
> +                                (features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
> +                                (features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
> +                                (features >> VIRTIO_NET_F_GUEST_ECN)  & 1,
> +                                (features >> VIRTIO_NET_F_GUEST_UFO)  & 1);
> +            }
> +            if (!n->nic->ncs[i]->peer ||
> +                n->nic->ncs[i]->peer->info->type != NET_CLIENT_TYPE_TAP) {
> +                continue;
> +            }
> +            if (!tap_get_vhost_net(n->nic->ncs[i]->peer)) {
> +                continue;
> +            }
> +            vhost_net_ack_features(tap_get_vhost_net(n->nic->ncs[i]->peer),
> +                                   features);
> +        }
>      }
> -    vhost_net_ack_features(tap_get_vhost_net(n->nic->nc.peer), features);
>  }
>  
>  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> @@ -446,7 +551,7 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
>  
> -    qemu_flush_queued_packets(&n->nic->nc);
> +    qemu_flush_queued_packets(n->nic->ncs[vq_get_pair_index(n, vq)]);
>  
>      /* We now have RX buffers, signal to the IO thread to break out of the
>       * select to re-poll the tap file descriptor */
> @@ -455,36 +560,37 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
>  
>  static int virtio_net_can_receive(VLANClientState *nc)
>  {
> -    VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
> +    int queue_index = nc->queue_index;
> +    VirtIONet *n = ((NICState *)nc->opaque)->opaque;
> +
>      if (!n->vdev.vm_running) {
>          return 0;
>      }
>  
> -    if (!virtio_queue_ready(n->rx_vq) ||
> +    if (!virtio_queue_ready(n->vqs[queue_index].rx_vq) ||
>          !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
>          return 0;
>  
>      return 1;
>  }
>  
> -static int virtio_net_has_buffers(VirtIONet *n, int bufsize)
> +static int virtio_net_has_buffers(VirtIONet *n, int bufsize, VirtQueue *vq)
>  {
> -    if (virtio_queue_empty(n->rx_vq) ||
> -        (n->mergeable_rx_bufs &&
> -         !virtqueue_avail_bytes(n->rx_vq, bufsize, 0))) {
> -        virtio_queue_set_notification(n->rx_vq, 1);
> +    if (virtio_queue_empty(vq) || (n->mergeable_rx_bufs &&
> +        !virtqueue_avail_bytes(vq, bufsize, 0))) {
> +        virtio_queue_set_notification(vq, 1);
>  
>          /* To avoid a race condition where the guest has made some buffers
>           * available after the above check but before notification was
>           * enabled, check for available buffers again.
>           */
> -        if (virtio_queue_empty(n->rx_vq) ||
> -            (n->mergeable_rx_bufs &&
> -             !virtqueue_avail_bytes(n->rx_vq, bufsize, 0)))
> +        if (virtio_queue_empty(vq) || (n->mergeable_rx_bufs &&
> +            !virtqueue_avail_bytes(vq, bufsize, 0))) {
>              return 0;
> +        }
>      }
>  
> -    virtio_queue_set_notification(n->rx_vq, 0);
> +    virtio_queue_set_notification(vq, 0);
>      return 1;
>  }
>  
> @@ -595,12 +701,15 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
>  
>  static ssize_t virtio_net_receive(VLANClientState *nc, const uint8_t *buf, size_t size)
>  {
> -    VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
> +    int queue_index = nc->queue_index;
> +    VirtIONet *n = ((NICState *)(nc->opaque))->opaque;
> +    VirtQueue *vq = n->vqs[queue_index].rx_vq;
>      struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL;
>      size_t guest_hdr_len, offset, i, host_hdr_len;
>  
> -    if (!virtio_net_can_receive(&n->nic->nc))
> +    if (!virtio_net_can_receive(n->nic->ncs[queue_index])) {
>          return -1;
> +    }
>  
>      /* hdr_len refers to the header we supply to the guest */
>      guest_hdr_len = n->mergeable_rx_bufs ?
> @@ -608,7 +717,7 @@ static ssize_t virtio_net_receive(VLANClientState *nc, const uint8_t *buf, size_
>  
>  
>      host_hdr_len = n->has_vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
> -    if (!virtio_net_has_buffers(n, size + guest_hdr_len - host_hdr_len))
> +    if (!virtio_net_has_buffers(n, size + guest_hdr_len - host_hdr_len, vq))
>          return 0;
>  
>      if (!receive_filter(n, buf, size))
> @@ -623,7 +732,7 @@ static ssize_t virtio_net_receive(VLANClientState *nc, const uint8_t *buf, size_
>  
>          total = 0;
>  
> -        if (virtqueue_pop(n->rx_vq, &elem) == 0) {
> +        if (virtqueue_pop(vq, &elem) == 0) {
>              if (i == 0)
>                  return -1;
>              error_report("virtio-net unexpected empty queue: "
> @@ -675,47 +784,50 @@ static ssize_t virtio_net_receive(VLANClientState *nc, const uint8_t *buf, size_
>          }
>  
>          /* signal other side */
> -        virtqueue_fill(n->rx_vq, &elem, total, i++);
> +        virtqueue_fill(vq, &elem, total, i++);
>      }
>  
>      if (mhdr) {
>          stw_p(&mhdr->num_buffers, i);
>      }
>  
> -    virtqueue_flush(n->rx_vq, i);
> -    virtio_notify(&n->vdev, n->rx_vq);
> +    virtqueue_flush(vq, i);
> +    virtio_notify(&n->vdev, vq);
>  
>      return size;
>  }
>  
> -static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq);
> +static int32_t virtio_net_flush_tx(VirtIONet *n, VirtIONetQueue *tvq);
>  
>  static void virtio_net_tx_complete(VLANClientState *nc, ssize_t len)
>  {
> -    VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
> +    VirtIONet *n = ((NICState *)nc->opaque)->opaque;
> +    VirtIONetQueue *netq = &n->vqs[nc->queue_index];
>  
> -    virtqueue_push(n->tx_vq, &n->async_tx.elem, n->async_tx.len);
> -    virtio_notify(&n->vdev, n->tx_vq);
> +    virtqueue_push(netq->tx_vq, &netq->async_tx.elem, netq->async_tx.len);
> +    virtio_notify(&n->vdev, netq->tx_vq);
>  
> -    n->async_tx.elem.out_num = n->async_tx.len = 0;
> +    netq->async_tx.elem.out_num = netq->async_tx.len;
>  
> -    virtio_queue_set_notification(n->tx_vq, 1);
> -    virtio_net_flush_tx(n, n->tx_vq);
> +    virtio_queue_set_notification(netq->tx_vq, 1);
> +    virtio_net_flush_tx(n, netq);
>  }
>  
>  /* TX */
> -static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
> +static int32_t virtio_net_flush_tx(VirtIONet *n, VirtIONetQueue *netq)
>  {
>      VirtQueueElement elem;
>      int32_t num_packets = 0;
> +    VirtQueue *vq = netq->tx_vq;
> +
>      if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
>          return num_packets;
>      }
>  
>      assert(n->vdev.vm_running);
>  
> -    if (n->async_tx.elem.out_num) {
> -        virtio_queue_set_notification(n->tx_vq, 0);
> +    if (netq->async_tx.elem.out_num) {
> +        virtio_queue_set_notification(vq, 0);
>          return num_packets;
>      }
>  
> @@ -747,12 +859,12 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
>              len += hdr_len;
>          }
>  
> -        ret = qemu_sendv_packet_async(&n->nic->nc, out_sg, out_num,
> -                                      virtio_net_tx_complete);
> +        ret = qemu_sendv_packet_async(n->nic->ncs[vq_get_pair_index(n, vq)],
> +                                      out_sg, out_num, virtio_net_tx_complete);
>          if (ret == 0) {
> -            virtio_queue_set_notification(n->tx_vq, 0);
> -            n->async_tx.elem = elem;
> -            n->async_tx.len  = len;
> +            virtio_queue_set_notification(vq, 0);
> +            netq->async_tx.elem = elem;
> +            netq->async_tx.len  = len;
>              return -EBUSY;
>          }
>  
> @@ -771,22 +883,23 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
>  static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
> +    VirtIONetQueue *netq = &n->vqs[vq_get_pair_index(n, vq)];
>  
>      /* This happens when device was stopped but VCPU wasn't. */
>      if (!n->vdev.vm_running) {
> -        n->tx_waiting = 1;
> +        netq->tx_waiting = 1;
>          return;
>      }
>  
> -    if (n->tx_waiting) {
> +    if (netq->tx_waiting) {
>          virtio_queue_set_notification(vq, 1);
> -        qemu_del_timer(n->tx_timer);
> -        n->tx_waiting = 0;
> -        virtio_net_flush_tx(n, vq);
> +        qemu_del_timer(netq->tx_timer);
> +        netq->tx_waiting = 0;
> +        virtio_net_flush_tx(n, netq);
>      } else {
> -        qemu_mod_timer(n->tx_timer,
> -                       qemu_get_clock_ns(vm_clock) + n->tx_timeout);
> -        n->tx_waiting = 1;
> +        qemu_mod_timer(netq->tx_timer,
> +                       qemu_get_clock_ns(vm_clock) + netq->tx_timeout);
> +        netq->tx_waiting = 1;
>          virtio_queue_set_notification(vq, 0);
>      }
>  }
> @@ -794,48 +907,53 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
>  static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
> +    VirtIONetQueue *netq = &n->vqs[vq_get_pair_index(n, vq)];
>  
> -    if (unlikely(n->tx_waiting)) {
> +    if (unlikely(netq->tx_waiting)) {
>          return;
>      }
> -    n->tx_waiting = 1;
> +    netq->tx_waiting = 1;
>      /* This happens when device was stopped but VCPU wasn't. */
>      if (!n->vdev.vm_running) {
>          return;
>      }
>      virtio_queue_set_notification(vq, 0);
> -    qemu_bh_schedule(n->tx_bh);
> +    qemu_bh_schedule(netq->tx_bh);
>  }
>  
>  static void virtio_net_tx_timer(void *opaque)
>  {
> -    VirtIONet *n = opaque;
> +    VirtIONetQueue *netq = opaque;
> +    VirtIONet *n = netq->n;
> +
>      assert(n->vdev.vm_running);
>  
> -    n->tx_waiting = 0;
> +    netq->tx_waiting = 0;
>  
>      /* Just in case the driver is not ready on more */
>      if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
>          return;
>  
> -    virtio_queue_set_notification(n->tx_vq, 1);
> -    virtio_net_flush_tx(n, n->tx_vq);
> +    virtio_queue_set_notification(netq->tx_vq, 1);
> +    virtio_net_flush_tx(n, netq);
>  }
>  
>  static void virtio_net_tx_bh(void *opaque)
>  {
> -    VirtIONet *n = opaque;
> +    VirtIONetQueue *netq = opaque;
> +    VirtQueue *vq = netq->tx_vq;
> +    VirtIONet *n = netq->n;
>      int32_t ret;
>  
>      assert(n->vdev.vm_running);
>  
> -    n->tx_waiting = 0;
> +    netq->tx_waiting = 0;
>  
>      /* Just in case the driver is not ready on more */
>      if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)))
>          return;
>  
> -    ret = virtio_net_flush_tx(n, n->tx_vq);
> +    ret = virtio_net_flush_tx(n, netq);
>      if (ret == -EBUSY) {
>          return; /* Notification re-enable handled by tx_complete */
>      }
> @@ -843,33 +961,39 @@ static void virtio_net_tx_bh(void *opaque)
>      /* If we flush a full burst of packets, assume there are
>       * more coming and immediately reschedule */
>      if (ret >= n->tx_burst) {
> -        qemu_bh_schedule(n->tx_bh);
> -        n->tx_waiting = 1;
> +        qemu_bh_schedule(netq->tx_bh);
> +        netq->tx_waiting = 1;
>          return;
>      }
>  
>      /* If less than a full burst, re-enable notification and flush
>       * anything that may have come in while we weren't looking.  If
>       * we find something, assume the guest is still active and reschedule */
> -    virtio_queue_set_notification(n->tx_vq, 1);
> -    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
> -        virtio_queue_set_notification(n->tx_vq, 0);
> -        qemu_bh_schedule(n->tx_bh);
> -        n->tx_waiting = 1;
> +    virtio_queue_set_notification(vq, 1);
> +    if (virtio_net_flush_tx(n, netq) > 0) {
> +        virtio_queue_set_notification(vq, 0);
> +        qemu_bh_schedule(netq->tx_bh);
> +        netq->tx_waiting = 1;
>      }
>  }
>  
>  static void virtio_net_save(QEMUFile *f, void *opaque)
>  {
>      VirtIONet *n = opaque;
> +    int i;
>  
>      /* At this point, backend must be stopped, otherwise
>       * it might keep writing to memory. */
> -    assert(!n->vhost_started);
> +    for (i = 0; i < n->queues; i++) {
> +        assert(!n->vqs[i].vhost_started);
> +    }
>      virtio_save(&n->vdev, f);
>  
>      qemu_put_buffer(f, n->mac, ETH_ALEN);
> -    qemu_put_be32(f, n->tx_waiting);
> +    qemu_put_be32(f, n->queues);
> +    for (i = 0; i < n->queues; i++) {
> +        qemu_put_be32(f, n->vqs[i].tx_waiting);
> +    }
>      qemu_put_be32(f, n->mergeable_rx_bufs);
>      qemu_put_be16(f, n->status);
>      qemu_put_byte(f, n->promisc);
> @@ -902,7 +1026,10 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
>      }
>  
>      qemu_get_buffer(f, n->mac, ETH_ALEN);
> -    n->tx_waiting = qemu_get_be32(f);
> +    n->queues = qemu_get_be32(f);
> +    for (i = 0; i < n->queues; i++) {
> +        n->vqs[i].tx_waiting = qemu_get_be32(f);
> +    }
>      n->mergeable_rx_bufs = qemu_get_be32(f);
>  
>      if (version_id >= 3)
> @@ -930,7 +1057,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
>              n->mac_table.in_use = 0;
>          }
>      }
> - 
> +
>      if (version_id >= 6)
>          qemu_get_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3);
>  
> @@ -941,13 +1068,16 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
>          }
>  
>          if (n->has_vnet_hdr) {
> -            tap_using_vnet_hdr(n->nic->nc.peer, 1);
> -            tap_set_offload(n->nic->nc.peer,
> -                    (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
> -                    (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
> -                    (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
> -                    (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_ECN)  & 1,
> -                    (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_UFO)  & 1);
> +            for(i = 0; i < n->queues; i++) {
> +                tap_using_vnet_hdr(n->nic->ncs[i]->peer, 1);
> +                tap_set_offload(n->nic->ncs[i]->peer,
> +                        (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
> +                        (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
> +                        (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
> +                        (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_ECN)  & 1,
> +                        (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_UFO)  &
> +                        1);
> +           }
>          }
>      }
>  
> @@ -982,7 +1112,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
>  
>  static void virtio_net_cleanup(VLANClientState *nc)
>  {
> -    VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
> +    VirtIONet *n = ((NICState *)nc->opaque)->opaque;
>  
>      n->nic = NULL;
>  }
> @@ -1000,6 +1130,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
>                                virtio_net_conf *net)
>  {
>      VirtIONet *n;
> +    int i;
>  
>      n = (VirtIONet *)virtio_common_init("virtio-net", VIRTIO_ID_NET,
>                                          sizeof(struct virtio_net_config),
> @@ -1012,7 +1143,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
>      n->vdev.bad_features = virtio_net_bad_features;
>      n->vdev.reset = virtio_net_reset;
>      n->vdev.set_status = virtio_net_set_status;
> -    n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
>  
>      if (net->tx && strcmp(net->tx, "timer") && strcmp(net->tx, "bh")) {
>          error_report("virtio-net: "
> @@ -1021,15 +1151,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
>          error_report("Defaulting to \"bh\"");
>      }
>  
> -    if (net->tx && !strcmp(net->tx, "timer")) {
> -        n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_timer);
> -        n->tx_timer = qemu_new_timer_ns(vm_clock, virtio_net_tx_timer, n);
> -        n->tx_timeout = net->txtimer;
> -    } else {
> -        n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh);
> -        n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
> -    }
> -    n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
>      qemu_macaddr_default_if_unset(&conf->macaddr);
>      memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac));
>      n->status = VIRTIO_NET_S_LINK_UP;
> @@ -1038,7 +1159,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
>  
>      qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a);
>  
> -    n->tx_waiting = 0;
>      n->tx_burst = net->txburst;
>      n->mergeable_rx_bufs = 0;
>      n->promisc = 1; /* for compatibility */
> @@ -1046,6 +1166,32 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
>      n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
>  
>      n->vlans = g_malloc0(MAX_VLAN >> 3);
> +    n->queues = conf->queues;
> +
> +    /* Allocate per rx/tx vq's */
> +    for (i = 0; i < n->queues; i++) {
> +        n->vqs[i].rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
> +        if (net->tx && !strcmp(net->tx, "timer")) {
> +            n->vqs[i].tx_vq = virtio_add_queue(&n->vdev, 256,
> +                                               virtio_net_handle_tx_timer);
> +            n->vqs[i].tx_timer = qemu_new_timer_ns(vm_clock,
> +                                                   virtio_net_tx_timer,
> +                                                   &n->vqs[i]);
> +            n->vqs[i].tx_timeout = net->txtimer;
> +        } else {
> +            n->vqs[i].tx_vq = virtio_add_queue(&n->vdev, 256,
> +                                               virtio_net_handle_tx_bh);
> +            n->vqs[i].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[i]);
> +        }
> +
> +        n->vqs[i].tx_waiting = 0;
> +        n->vqs[i].n = n;
> +
> +        if (i == 0) {
> +            /* keep compatiable with spec and old guest */
> +            n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
> +        }
> +    }
>  
>      n->qdev = dev;
>      register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION,
> @@ -1059,24 +1205,33 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
>  void virtio_net_exit(VirtIODevice *vdev)
>  {
>      VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
> +    int i;
>  
>      /* This will stop vhost backend if appropriate. */
>      virtio_net_set_status(vdev, 0);
>  
> -    qemu_purge_queued_packets(&n->nic->nc);
> +    for (i = 0; i < n->queues; i++) {
> +        qemu_purge_queued_packets(n->nic->ncs[i]);
> +    }
>  
>      unregister_savevm(n->qdev, "virtio-net", n);
>  
>      g_free(n->mac_table.macs);
>      g_free(n->vlans);
>  
> -    if (n->tx_timer) {
> -        qemu_del_timer(n->tx_timer);
> -        qemu_free_timer(n->tx_timer);
> -    } else {
> -        qemu_bh_delete(n->tx_bh);
> +    for (i = 0; i < n->queues; i++) {
> +        VirtIONetQueue *netq = &n->vqs[i];
> +        if (netq->tx_timer) {
> +            qemu_del_timer(netq->tx_timer);
> +            qemu_free_timer(netq->tx_timer);
> +        } else {
> +            qemu_bh_delete(netq->tx_bh);
> +        }
>      }
>  
> -    qemu_del_vlan_client(&n->nic->nc);
>      virtio_cleanup(&n->vdev);
> +
> +    for (i = 0; i < n->queues; i++) {
> +        qemu_del_vlan_client(n->nic->ncs[i]);
> +    }
>  }
> diff --git a/hw/virtio-net.h b/hw/virtio-net.h
> index 36aa463..b35ba5d 100644
> --- a/hw/virtio-net.h
> +++ b/hw/virtio-net.h
> @@ -44,6 +44,7 @@
>  #define VIRTIO_NET_F_CTRL_RX    18      /* Control channel RX mode support */
>  #define VIRTIO_NET_F_CTRL_VLAN  19      /* Control channel VLAN filtering */
>  #define VIRTIO_NET_F_CTRL_RX_EXTRA 20   /* Extra RX mode control support */
> +#define VIRTIO_NET_F_MULTIQUEUE   22
>  
>  #define VIRTIO_NET_S_LINK_UP    1       /* Link is up */
>  
> @@ -72,6 +73,8 @@ struct virtio_net_config
>      uint8_t mac[ETH_ALEN];
>      /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
>      uint16_t status;
> +
> +    uint16_t queues;
>  } QEMU_PACKED;
>  
>  /* This is the first element of the scatter-gather list.  If you don't

^ permalink raw reply

* Re: [PATCH] virtio-scsi: hotplug suppot for virtio-scsi
From: Paolo Bonzini @ 2012-07-01 13:24 UTC (permalink / raw)
  To: Cong Meng
  Cc: stefanha, linux-scsi, linux-kernel, zwanp, linuxram, senwang,
	virtualization, James E.J. Bottomley, Anthony Liguori
In-Reply-To: <1340175333-16803-1-git-send-email-mc@linux.vnet.ibm.com>

Il 20/06/2012 08:55, Cong Meng ha scritto:
> This patch implements the hotplug support for virtio-scsi.
> When there is a device attached/detached, the virtio-scsi driver will be
> signaled via event virtual queue and it will add/remove the scsi device 
> in question automatically.
> 
> Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
> Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>

Looks good (with one nit below), but please rebase over the four patches
at http://permalink.gmane.org/gmane.linux.kernel/1312496

> ---
>  drivers/scsi/virtio_scsi.c  |  109 ++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/virtio_scsi.h |    9 ++++
>  2 files changed, 116 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
> index 1b38431..6dad625 100644
> --- a/drivers/scsi/virtio_scsi.c
> +++ b/drivers/scsi/virtio_scsi.c
> @@ -25,6 +25,7 @@
>  #include <scsi/scsi_cmnd.h>
>  
>  #define VIRTIO_SCSI_MEMPOOL_SZ 64
> +#define VIRTIO_SCSI_EVENT_LEN 8
>  
>  /* Command queue element */
>  struct virtio_scsi_cmd {
> @@ -43,9 +44,15 @@ struct virtio_scsi_cmd {
>  	} resp;
>  } ____cacheline_aligned_in_smp;
>  
> +struct virtio_scsi_event_node {
> +	struct virtio_scsi *vscsi;
> +	struct virtio_scsi_event event;
> +	struct work_struct work;
> +};
> +
>  /* Driver instance state */
>  struct virtio_scsi {
> -	/* Protects ctrl_vq, req_vq and sg[] */
> +	/* Protects ctrl_vq, event_vq, req_vq and sg[] */
>  	spinlock_t vq_lock;
>  
>  	struct virtio_device *vdev;
> @@ -53,6 +60,9 @@ struct virtio_scsi {
>  	struct virtqueue *event_vq;
>  	struct virtqueue *req_vq;
>  
> +	/* Get some buffers reday for event vq */
> +	struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
> +
>  	/* For sglist construction when adding commands to the virtqueue.  */
>  	struct scatterlist sg[];
>  };
> @@ -184,9 +194,93 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
>  	virtscsi_vq_done(vq, virtscsi_complete_free);
>  };
>  
> +static int virtscsi_kick_event(struct virtio_scsi *vscsi,
> +			       struct virtio_scsi_event_node *event_node)
> +{
> +	int ret;
> +	struct scatterlist sg;
> +	unsigned long flags;
> +
> +	sg_set_buf(&sg, &event_node->event, sizeof(struct virtio_scsi_event));
> +
> +	spin_lock_irqsave(&vscsi->vq_lock, flags);
> +
> +	ret = virtqueue_add_buf(vscsi->event_vq, &sg, 0, 1, event_node, GFP_ATOMIC);
> +	if (ret >= 0)
> +		virtqueue_kick(vscsi->event_vq);
> +
> +	spin_unlock_irqrestore(&vscsi->vq_lock, flags);
> +
> +	return ret;
> +}
> +
> +static int virtscsi_kick_event_all(struct virtio_scsi *vscsi)
> +{
> +	int i;
> +
> +	for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) {
> +		vscsi->event_list[i].vscsi = vscsi;
> +		virtscsi_kick_event(vscsi, &vscsi->event_list[i]);
> +	}
> +
> +	return 0;
> +}
> +
> +static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
> +                                            struct virtio_scsi_event *event)
> +{
> +	struct scsi_device *sdev;
> +	struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
> +	unsigned int target = event->lun[1];
> +	unsigned int lun = (event->lun[2] << 8) | event->lun[3];
> +
> +	switch (event->reason) {
> +	case VIRTIO_SCSI_EVT_RESET_RESCAN:
> +		scsi_add_device(shost, 0, target, lun);
> +		break;
> +	case VIRTIO_SCSI_EVT_RESET_REMOVED:
> +		sdev = scsi_device_lookup(shost, 0, target, lun);
> +		if (sdev) {
> +			scsi_remove_device(sdev);
> +			scsi_device_put(sdev);
> +		} else {
> +			pr_err("SCSI device %d 0 %d %d not found\n",
> +				shost->host_no, target, lun);
> +		}
> +		break;
> +	default:
> +		pr_info("Unsupport virtio scsi event reason %x\n", event->reason);
> +	}
> +}
> +
> +static void virtscsi_handle_event(struct work_struct *work)
> +{
> +	struct virtio_scsi_event_node *event_node = 
> +		container_of(work, struct virtio_scsi_event_node, work);
> +	struct virtio_scsi *vscsi = event_node->vscsi;
> +	struct virtio_scsi_event *event = &event_node->event;
> +
> +	switch (event->event) {
> +	case VIRTIO_SCSI_T_TRANSPORT_RESET:
> +		virtscsi_handle_transport_reset(vscsi, event);
> +		break;

Please handle VIRTIO_SCSI_T_NO_EVENT too here, and mask out
VIRTIO_SCSI_T_EVENTS_MISSED even if you do not handle it by triggering a
full rescan.

Paolo

> +	default:
> +		pr_err("Unsupport virtio scsi event %x\n", event->event);


> +	}
> +	virtscsi_kick_event(vscsi, event_node);
> +}
> +
> +static void virtscsi_complete_event(void *buf)
> +{
> +	struct virtio_scsi_event_node *event_node = buf;
> +
> +	INIT_WORK(&event_node->work, virtscsi_handle_event);
> +	schedule_work(&event_node->work);
> +}
> +
>  static void virtscsi_event_done(struct virtqueue *vq)
>  {
> -	virtscsi_vq_done(vq, virtscsi_complete_free);
> +	virtscsi_vq_done(vq, virtscsi_complete_event);
>  };
>  
>  static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
> @@ -435,6 +529,11 @@ static int virtscsi_init(struct virtio_device *vdev,
>  
>  	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
>  	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
> +
> +	if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
> +		virtscsi_kick_event_all(vscsi);
> +	}
> +
>  	return 0;
>  }
>  
> @@ -532,7 +631,13 @@ static struct virtio_device_id id_table[] = {
>  	{ 0 },
>  };
>  
> +static unsigned int features[] = {
> +	VIRTIO_SCSI_F_HOTPLUG
> +};
> +
>  static struct virtio_driver virtio_scsi_driver = {
> +	.feature_table = features,
> +	.feature_table_size = ARRAY_SIZE(features),
>  	.driver.name = KBUILD_MODNAME,
>  	.driver.owner = THIS_MODULE,
>  	.id_table = id_table,
> diff --git a/include/linux/virtio_scsi.h b/include/linux/virtio_scsi.h
> index 8ddeafd..dc8d305 100644
> --- a/include/linux/virtio_scsi.h
> +++ b/include/linux/virtio_scsi.h
> @@ -69,6 +69,10 @@ struct virtio_scsi_config {
>  	u32 max_lun;
>  } __packed;
>  
> +/* Feature Bits */
> +#define VIRTIO_SCSI_F_INOUT                    0
> +#define VIRTIO_SCSI_F_HOTPLUG                  1
> +
>  /* Response codes */
>  #define VIRTIO_SCSI_S_OK                       0
>  #define VIRTIO_SCSI_S_OVERRUN                  1
> @@ -105,6 +109,11 @@ struct virtio_scsi_config {
>  #define VIRTIO_SCSI_T_TRANSPORT_RESET          1
>  #define VIRTIO_SCSI_T_ASYNC_NOTIFY             2
>  
> +/* Reasons of transport reset event */
> +#define VIRTIO_SCSI_EVT_RESET_HARD             0
> +#define VIRTIO_SCSI_EVT_RESET_RESCAN           1
> +#define VIRTIO_SCSI_EVT_RESET_REMOVED          2
> +
>  #define VIRTIO_SCSI_S_SIMPLE                   0
>  #define VIRTIO_SCSI_S_ORDERED                  1
>  #define VIRTIO_SCSI_S_HEAD                     2
> 

Thanks,

Paolo

^ permalink raw reply

* Re: [PATCH v2 1/4] mm: introduce compaction and migration for virtio ballooned pages
From: Minchan Kim @ 2012-07-01 23:36 UTC (permalink / raw)
  To: Rafael Aquini
  Cc: Rik van Riel, Michael S. Tsirkin, Konrad Rzeszutek Wilk,
	linux-kernel, virtualization, linux-mm, Andi Kleen, Andrew Morton
In-Reply-To: <20120630013447.GA1545@x61.redhat.com>

On 06/30/2012 10:34 AM, Rafael Aquini wrote:

>> void isolate_page_from_balloonlist(struct page* page)
>> > {
>> > 	page->mapping->a_ops->invalidatepage(page, 0);
>> > }
>> > 
>> > 	if (is_balloon_page(page) && (page_count(page) == 2)) {
>> > 		isolate_page_from_balloonlist(page);
>> > 	}
>> > 
> Humm, my feelings on your approach here: just an unecessary indirection that
> doesn't bring the desired code readability improvement.
> If the header comment statement on balloon_mapping->a_ops is not clear enough 
> on those methods usage for ballooned pages:
> 
> ..... 
> /*
>  * Balloon pages special page->mapping.
>  * users must properly allocate and initialize an instance of balloon_mapping,
>  * and set it as the page->mapping for balloon enlisted page instances.
>  *
>  * address_space_operations necessary methods for ballooned pages:
>  *   .migratepage    - used to perform balloon's page migration (as is)
>  *   .invalidatepage - used to isolate a page from balloon's page list
>  *   .freepage       - used to reinsert an isolated page to balloon's page list
>  */
> struct address_space *balloon_mapping;
> EXPORT_SYMBOL_GPL(balloon_mapping);
> .....
> 
> I can add an extra commentary, to recollect folks about that usage, next to the
> points where those callbacks are used at isolate_balloon_page() &
> putback_balloon_page(). What do you think?
> 
> 


I am not strongly against you.
It trivial nitpick must not prevent your great work. :)

Thanks!


-- 
Kind regards,
Minchan Kim

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Rusty Russell @ 2012-07-01 23:54 UTC (permalink / raw)
  To: Asias He, Sasha Levin
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <4FDFE926.7030309@redhat.com>

On Tue, 19 Jun 2012 10:51:18 +0800, Asias He <asias@redhat.com> wrote:
> On 06/18/2012 07:39 PM, Sasha Levin wrote:
> > On Mon, 2012-06-18 at 14:14 +0300, Dor Laor wrote:
> >> On 06/18/2012 01:05 PM, Rusty Russell wrote:
> >>> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com>  wrote:
> >>>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
> >>>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com>  wrote:
> >>>>>> This patch introduces bio-based IO path for virtio-blk.
> >>>>>
> >>>>> Why make it optional?
> >>>>
> >>>> request-based IO path is useful for users who do not want to bypass the
> >>>> IO scheduler in guest kernel, e.g. users using spinning disk. For users
> >>>> using fast disk device, e.g. SSD device, they can use bio-based IO path.
> >>>
> >>> Users using a spinning disk still get IO scheduling in the host though.
> >>> What benefit is there in doing it in the guest as well?
> >>
> >> The io scheduler waits for requests to merge and thus batch IOs
> >> together. It's not important w.r.t spinning disks since the host can do
> >> it but it causes much less vmexits which is the key issue for VMs.
> >
> > Is the amount of exits caused by virtio-blk significant at all with
> > EVENT_IDX?
> 
> Yes. EVENT_IDX saves the number of notify and interrupt. Let's take the 
> interrupt as an example, The guest fires 200K request to host, the 
> number of interrupt is about 6K thanks to EVENT_IDX. The ratio is 200K / 
> 6K = 33. The ratio of merging is 40000K / 200K = 20.

Confused.  So, without merging we get 6k exits (per second?)  How many
do we get when we use the request-based IO path?

If your device is slow, then you won't be able to make many requests per
second: why worry about exit costs?  If your device is fast (eg. ram),
you've already shown that your patch is a win, right?

Cheers,
Rusty.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox