Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH net-next V3 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: Jason Wang @ 2016-06-30  3:52 UTC (permalink / raw)
  To: mst, netdev, linux-kernel, davem
  Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>

This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, which
will be triggered when tx_queue_len is changed. It could be used by net
devices that want to do some processing at that time. An example is tun,
which may want to resize its tx array when tx_queue_len is changed.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/netdevice.h |  1 +
 net/core/net-sysfs.c      | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e84d9d2..7dc2ec7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
 #define NETDEV_PRECHANGEUPPER	0x001A
 #define NETDEV_CHANGELOWERSTATE	0x001B
 #define NETDEV_UDP_TUNNEL_PUSH_INFO	0x001C
+#define NETDEV_CHANGE_TX_QUEUE_LEN	0x001E
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7a0b616..6e4f347 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
 
 static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 {
-	dev->tx_queue_len = new_len;
+	int res, orig_len = dev->tx_queue_len;
+
+	if (new_len != orig_len) {
+		dev->tx_queue_len = new_len;
+		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+		res = notifier_to_errno(res);
+		if (res) {
+			netdev_err(dev,
+				   "refused to change device tx_queue_len\n");
+			dev->tx_queue_len = orig_len;
+			return -EFAULT;
+		}
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 4/6] skb_array: add wrappers for resizing
From: Jason Wang @ 2016-06-30  3:52 UTC (permalink / raw)
  To: mst, netdev, linux-kernel, davem
  Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/skb_array.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 2dd0d1e..f4dfade 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -161,6 +161,15 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 	return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
 }
 
+static inline int skb_array_resize_multiple(struct skb_array **rings,
+					    int nrings, int size, gfp_t gfp)
+{
+	BUILD_BUG_ON(offsetof(struct skb_array, ring));
+	return ptr_ring_resize_multiple((struct ptr_ring **)rings,
+					nrings, size, gfp,
+					__skb_array_destroy_skb);
+}
+
 static inline void skb_array_cleanup(struct skb_array *a)
 {
 	ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 3/6] ptr_ring: support resizing multiple queues
From: Jason Wang @ 2016-06-30  3:52 UTC (permalink / raw)
  To: mst, netdev, linux-kernel, davem
  Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>

Sometimes, we need to support resizing multiple queues at once. This is
because it is not easy to recover from a partial failure when resizing
multiple queues.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/ptr_ring.h         | 71 +++++++++++++++++++++++++++++++++++-----
 tools/virtio/ringtest/ptr_ring.c |  5 +++
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d78b8b8..2052011 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -349,20 +349,14 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
 	return 0;
 }
 
-static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
-				  void (*destroy)(void *))
+static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
+					   int size, gfp_t gfp,
+					   void (*destroy)(void *))
 {
-	unsigned long flags;
 	int producer = 0;
-	void **queue = __ptr_ring_init_queue_alloc(size, gfp);
 	void **old;
 	void *ptr;
 
-	if (!queue)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&(r)->producer_lock, flags);
-
 	while ((ptr = ptr_ring_consume(r)))
 		if (producer < size)
 			queue[producer++] = ptr;
@@ -375,6 +369,23 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 	old = r->queue;
 	r->queue = queue;
 
+	return old;
+}
+
+static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
+				  void (*destroy)(void *))
+{
+	unsigned long flags;
+	void **queue = __ptr_ring_init_queue_alloc(size, gfp);
+	void **old;
+
+	if (!queue)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&(r)->producer_lock, flags);
+
+	old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy);
+
 	spin_unlock_irqrestore(&(r)->producer_lock, flags);
 
 	kfree(old);
@@ -382,6 +393,48 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 	return 0;
 }
 
+static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
+					   int size,
+					   gfp_t gfp, void (*destroy)(void *))
+{
+	unsigned long flags;
+	void ***queues;
+	int i;
+
+	queues = kmalloc(nrings * sizeof *queues, gfp);
+	if (!queues)
+		goto noqueues;
+
+	for (i = 0; i < nrings; ++i) {
+		queues[i] = __ptr_ring_init_queue_alloc(size, gfp);
+		if (!queues[i])
+			goto nomem;
+	}
+
+	for (i = 0; i < nrings; ++i) {
+		spin_lock_irqsave(&(rings[i])->producer_lock, flags);
+		queues[i] = __ptr_ring_swap_queue(rings[i], queues[i],
+						  size, gfp, destroy);
+		spin_unlock_irqrestore(&(rings[i])->producer_lock, flags);
+	}
+
+	for (i = 0; i < nrings; ++i)
+		kfree(queues[i]);
+
+	kfree(queues);
+
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(queues[i]);
+
+	kfree(queues);
+
+noqueues:
+	return -ENOMEM;
+}
+
 static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
 {
 	void *ptr;
diff --git a/tools/virtio/ringtest/ptr_ring.c b/tools/virtio/ringtest/ptr_ring.c
index 74abd74..68e4f9f 100644
--- a/tools/virtio/ringtest/ptr_ring.c
+++ b/tools/virtio/ringtest/ptr_ring.c
@@ -17,6 +17,11 @@
 typedef pthread_spinlock_t  spinlock_t;
 
 typedef int gfp_t;
+static void *kmalloc(unsigned size, gfp_t gfp)
+{
+	return memalign(64, size);
+}
+
 static void *kzalloc(unsigned size, gfp_t gfp)
 {
 	void *p = memalign(64, size);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 2/6] skb_array: minor tweak
From: Jason Wang @ 2016-06-30  3:52 UTC (permalink / raw)
  To: mst, netdev, linux-kernel, davem
  Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/skb_array.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 678bfbf..2dd0d1e 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -151,12 +151,12 @@ static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
 	return ptr_ring_init(&a->ring, size, gfp);
 }
 
-void __skb_array_destroy_skb(void *ptr)
+static void __skb_array_destroy_skb(void *ptr)
 {
 	kfree_skb(ptr);
 }
 
-int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
+static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 {
 	return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 1/6] ptr_ring: support zero length ring
From: Jason Wang @ 2016-06-30  3:52 UTC (permalink / raw)
  To: mst, netdev, linux-kernel, davem
  Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>

Sometimes, we need a zero-length ring. But the current code will crash
since we don't do any check before accessing the ring. This patch fixes
this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/ptr_ring.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 562a65e..d78b8b8 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -102,7 +102,7 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
  */
 static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 {
-	if (r->queue[r->producer])
+	if (unlikely(!r->size) || r->queue[r->producer])
 		return -ENOSPC;
 
 	r->queue[r->producer++] = ptr;
@@ -164,7 +164,9 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
  */
 static inline void *__ptr_ring_peek(struct ptr_ring *r)
 {
-	return r->queue[r->consumer];
+	if (likely(r->size))
+		return r->queue[r->consumer];
+	return NULL;
 }
 
 /* Note: callers invoking this in a loop must use a compiler barrier,
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 0/6] switch to use tx skb array in tun
From: Jason Wang @ 2016-06-30  3:52 UTC (permalink / raw)
  To: mst, netdev, linux-kernel, davem
  Cc: brouer, eric.dumazet, kvm, virtualization

Hi all:

This series tries to switch to use skb array in tun. This is used to
eliminate the spinlock contention between producer and consumer. The
conversion was straightforward: just introduce a tx skb array and use
it instead of sk_receive_queue.

A minor issue is to keep the tx_queue_len behaviour, since tun used to
use it for the length of sk_receive_queue. This is done through:

- add the ability to resize multiple rings at once to avoid handling
  partial resize failure for multiple rings.
- add the support for zero length ring.
- introduce a notifier which was triggered when tx_queue_len was
  changed for a netdev.
- resize all queues during the tx_queue_len changing.

Tests show about 15% improvement on guest rx pps:

Before: ~1300000pps
After : ~1500000pps

Changes from V2:
- add multiple rings resizing support for ptr_ring/skb_array
- add zero length ring support
- introduce NETDEV_CHANGE_TX_QUEUE_LEN
- drop new flags

Changes from V1:
- switch to use skb array instead of a customized circular buffer
- add non-blocking support
- rename .peek to .peek_len
- drop lockless peeking since tests show very minor improvement

Jason Wang (5):
  ptr_ring: support zero length ring
  skb_array: minor tweak
  skb_array: add wrappers for resizing
  net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
  tun: switch to use skb array for tx

Michael S. Tsirkin (1):
  ptr_ring: support resizing multiple queues

 drivers/net/tun.c                | 138 ++++++++++++++++++++++++++++++++++++---
 drivers/vhost/net.c              |  16 ++++-
 include/linux/net.h              |   1 +
 include/linux/netdevice.h        |   1 +
 include/linux/ptr_ring.h         |  77 ++++++++++++++++++----
 include/linux/skb_array.h        |  13 +++-
 net/core/net-sysfs.c             |  15 ++++-
 tools/virtio/ringtest/ptr_ring.c |   5 ++
 8 files changed, 243 insertions(+), 23 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH v2 02/12] genhd: Honor gen_uevent and add disk_gen_uevents
From: kbuild test robot @ 2016-06-30  3:26 UTC (permalink / raw)
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, linux-kernel, Minchan Kim,
	kbuild-all, linux-mtd, Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-3-famz@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2046 bytes --]

Hi,

[auto build test WARNING on block/for-next]
[also build test WARNING on v4.7-rc5]
[cannot apply to next-20160629]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Fam-Zheng/gendisk-Generate-uevent-after-attribute-available/20160630-100720
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
reproduce: make htmldocs

All warnings (new ones prefixed by >>):

   lib/crc32.c:148: warning: No description found for parameter 'tab)[256]'
   lib/crc32.c:148: warning: Excess function parameter 'tab' description in 'crc32_le_generic'
   lib/crc32.c:293: warning: No description found for parameter 'tab)[256]'
   lib/crc32.c:293: warning: Excess function parameter 'tab' description in 'crc32_be_generic'
   lib/crc32.c:1: warning: no structured comments found
   mm/memory.c:2881: warning: No description found for parameter 'old'
>> block/genhd.c:575: warning: No description found for parameter 'disk'
>> block/genhd.c:575: warning: No description found for parameter 'disk'

vim +/disk +575 block/genhd.c

   559		blkdev_put(bdev, FMODE_READ);
   560	
   561	exit:
   562		/* announce disk after possible partitions are created */
   563		dev_set_uevent_suppress(ddev, 0);
   564		if (gen_uevent)
   565			disk_gen_uevents(disk);
   566	}
   567	
   568	/**
   569	 * disk_gen_uevents
   570	 * @disk - the disk to generate uevent
   571	 *
   572	 * Generate KOBJ_ADD uevents on the disk and partitions.
   573	 */
   574	void disk_gen_uevents(struct gendisk *disk)
 > 575	{
   576		struct device *ddev = disk_to_dev(disk);
   577		struct disk_part_iter piter;
   578		struct hd_struct *part;
   579	
   580		kobject_uevent(&ddev->kobj, KOBJ_ADD);
   581	
   582		/* announce possible partitions */
   583		disk_part_iter_init(&piter, disk, 0);

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6370 bytes --]

[-- Attachment #3: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH v2 12/12] nvme: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/nvme/host/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fd70894..2655521 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1462,11 +1462,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (ns->type == NVME_NS_LIGHTNVM)
 		return;
 
-	add_disk(ns->disk, true);
+	add_disk(ns->disk, false);
 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group))
 		pr_warn("%s: failed to create sysfs group for identification\n",
 			ns->disk->disk_name);
+	disk_gen_uevents(ns->disk);
 	return;
  out_free_disk:
 	kfree(disk);
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 11/12] mtd: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/mtd/mtd_blkdevs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index ab3bc22..6848141 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -436,13 +436,14 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
 	if (new->readonly)
 		set_disk_ro(gd, 1);
 
-	add_disk(gd, true);
+	add_disk(gd, false);
 
 	if (new->disk_attributes) {
 		ret = sysfs_create_group(&disk_to_dev(gd)->kobj,
 					new->disk_attributes);
 		WARN_ON(ret);
 	}
+	disk_gen_uevents(gd);
 	return 0;
 error4:
 	blk_cleanup_queue(new->rq);
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 10/12] mmc: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/mmc/card/block.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 94cf51e..4007106 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -2457,7 +2457,7 @@ static int mmc_add_disk(struct mmc_blk_data *md)
 	int ret;
 	struct mmc_card *card = md->queue.card;
 
-	add_disk(md->disk, true);
+	add_disk(md->disk, false);
 	md->force_ro.show = force_ro_show;
 	md->force_ro.store = force_ro_store;
 	sysfs_attr_init(&md->force_ro.attr);
@@ -2466,6 +2466,7 @@ static int mmc_add_disk(struct mmc_blk_data *md)
 	ret = device_create_file(disk_to_dev(md->disk), &md->force_ro);
 	if (ret)
 		goto force_ro_fail;
+	disk_gen_uevents(md->disk);
 
 	if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) &&
 	     card->ext_csd.boot_ro_lockable) {
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 09/12] md: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1391c72..dcd09ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5055,7 +5055,7 @@ static int md_alloc(dev_t dev, char *name)
 	 * through to md_open, so make sure it doesn't get too far
 	 */
 	mutex_lock(&mddev->open_mutex);
-	add_disk(disk, true);
+	add_disk(disk, false);
 
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
@@ -5070,6 +5070,7 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
+	disk_gen_uevents(disk);
 	mutex_unlock(&mddev->open_mutex);
  abort:
 	mutex_unlock(&disks_mutex);
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 08/12] zram: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/block/zram/zram_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d735513..83f10a0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1287,7 +1287,7 @@ static int zram_add(void)
 		zram->disk->queue->limits.discard_zeroes_data = 0;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
 
-	add_disk(zram->disk, true);
+	add_disk(zram->disk, false);
 
 	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
 				&zram_disk_attr_group);
@@ -1296,6 +1296,7 @@ static int zram_add(void)
 				device_id);
 		goto out_free_disk;
 	}
+	disk_gen_uevents(zram->disk);
 	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
 	zram->meta = NULL;
 
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 07/12] pktcdvd: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/block/pktcdvd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 00928406..a4e6bb7 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2785,11 +2785,13 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	disk->events = pd->bdev->bd_disk->events;
 	disk->async_events = pd->bdev->bd_disk->async_events;
 
-	add_disk(disk, true);
+	add_disk(disk, false);
 
 	pkt_sysfs_dev_new(pd);
 	pkt_debugfs_dev_new(pd);
 
+	disk_gen_uevents(disk);
+
 	pkt_devs[idx] = pd;
 	if (pkt_dev)
 		*pkt_dev = pd->pkt_dev;
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 06/12] mtip32xx: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 2d09fae..8c1cf03 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4042,7 +4042,7 @@ skip_create_disk:
 	set_capacity(dd->disk, capacity);
 
 	/* Enable the block device and add it to /dev */
-	add_disk(dd->disk, true);
+	add_disk(dd->disk, false);
 
 	dd->bdev = bdget_disk(dd->disk, 0);
 	/*
@@ -4054,6 +4054,7 @@ skip_create_disk:
 		mtip_hw_sysfs_init(dd, kobj);
 		kobject_put(kobj);
 	}
+	disk_gen_uevents(dd->disk);
 
 	if (dd->mtip_svc_handler) {
 		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 05/12] aoeblk: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/block/aoe/aoeblk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index e91c5f1..f0cf4d6 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -417,9 +417,10 @@ aoeblk_gdalloc(void *vp)
 
 	spin_unlock_irqrestore(&d->lock, flags);
 
-	add_disk(gd, true);
+	add_disk(gd, false);
 	aoedisk_add_sysfs(d);
 	aoedisk_add_debugfs(d);
+	disk_gen_uevents(gd);
 
 	spin_lock_irqsave(&d->lock, flags);
 	WARN_ON(!(d->flags & DEVFL_GD_NOW));
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 04/12] axonram: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

It is documented that KOBJ_ADD should be generated after the object's
attributes and children are ready.  We can achieve this with the new
disk_gen_uevents interface.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 arch/powerpc/sysdev/axonram.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 4efd69b..27e7175 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -238,7 +238,7 @@ static int axon_ram_probe(struct platform_device *device)
 	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
 	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
 	blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
-	add_disk(bank->disk, true);
+	add_disk(bank->disk, false);
 
 	bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0);
 	if (bank->irq_id == NO_IRQ) {
@@ -262,6 +262,7 @@ static int axon_ram_probe(struct platform_device *device)
 		rc = -EFAULT;
 		goto failed;
 	}
+	disk_gen_uevents(bank->disk);
 
 	azfs_minor += bank->disk->minors;
 
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 03/12] virtio-blk: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

Userspace listens to the KOBJ_ADD uevent generated in add_disk. At that
point we haven't created the serial attribute file, therefore depending
on how fast udev reacts, the /dev/disk/by-id/ entry doesn't always get
created.

This race condition can be easily reproduced by hot plugging a number of
virtio-blk disks.

Also in systemd, there used to be a related workaround in udev rules
called 'WAIT_FOR="serial"', but it is removed in later versions.

Now let's generate the KOBJ_ADD event after the attributes are ready.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 drivers/block/virtio_blk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f3a59f9..cd9a036 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -733,7 +733,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	virtio_device_ready(vdev);
 
-	add_disk(vblk->disk, true);
+	add_disk(vblk->disk, false);
 	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
 	if (err)
 		goto out_del_disk;
@@ -746,6 +746,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 					 &dev_attr_cache_type_ro);
 	if (err)
 		goto out_del_disk;
+	disk_gen_uevents(vblk->disk);
 	return 0;
 
 out_del_disk:
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 02/12] genhd: Honor gen_uevent and add disk_gen_uevents
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

In add_disk(), don't send uevent to userspace when gen_uevent is false;
also export the refactored function disk_gen_uevents for later use.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/genhd.c         | 23 +++++++++++++++++++----
 include/linux/genhd.h |  1 +
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 8e1bfa1..9b66953 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -506,12 +506,10 @@ static int exact_lock(dev_t devt, void *data)
 	return 0;
 }
 
-static void register_disk(struct gendisk *disk)
+static void register_disk(struct gendisk *disk, bool gen_uevent)
 {
 	struct device *ddev = disk_to_dev(disk);
 	struct block_device *bdev;
-	struct disk_part_iter piter;
-	struct hd_struct *part;
 	int err;
 
 	ddev->parent = disk->driverfs_dev;
@@ -563,6 +561,22 @@ static void register_disk(struct gendisk *disk)
 exit:
 	/* announce disk after possible partitions are created */
 	dev_set_uevent_suppress(ddev, 0);
+	if (gen_uevent)
+		disk_gen_uevents(disk);
+}
+
+/**
+ * disk_gen_uevents
+ * @disk: the disk to generate uevent
+ *
+ * Generate KOBJ_ADD uevents on the disk and partitions.
+ */
+void disk_gen_uevents(struct gendisk *disk)
+{
+	struct device *ddev = disk_to_dev(disk);
+	struct disk_part_iter piter;
+	struct hd_struct *part;
+
 	kobject_uevent(&ddev->kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
@@ -571,6 +585,7 @@ exit:
 		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
 	disk_part_iter_exit(&piter);
 }
+EXPORT_SYMBOL(disk_gen_uevents);
 
 /**
  * add_disk - add partitioning information to kernel list
@@ -618,7 +633,7 @@ void add_disk(struct gendisk *disk, bool gen_uevent)
 
 	blk_register_region(disk_devt(disk), disk->minors, NULL,
 			    exact_match, exact_lock, disk);
-	register_disk(disk);
+	register_disk(disk, gen_uevent);
 	blk_register_queue(disk);
 
 	/*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 038be80..87ad9e5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -416,6 +416,7 @@ extern void part_round_stats(int cpu, struct hd_struct *part);
 /* block/genhd.c */
 extern void add_disk(struct gendisk *disk, bool gen_uevent);
 extern void del_gendisk(struct gendisk *gp);
+extern void disk_gen_uevents(struct gendisk *disk);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
 
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 01/12] genhd: Add "gen_uevent" parameter to add_disk
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>

The parameter will be used to control whether add_disk() should generate
the KOBJ_ADD uevent itself.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 arch/m68k/emu/nfblock.c                     | 2 +-
 arch/powerpc/sysdev/axonram.c               | 2 +-
 arch/um/drivers/ubd_kern.c                  | 2 +-
 arch/xtensa/platforms/iss/simdisk.c         | 2 +-
 block/genhd.c                               | 3 ++-
 drivers/block/DAC960.c                      | 2 +-
 drivers/block/amiflop.c                     | 2 +-
 drivers/block/aoe/aoeblk.c                  | 2 +-
 drivers/block/ataflop.c                     | 2 +-
 drivers/block/brd.c                         | 4 ++--
 drivers/block/cciss.c                       | 2 +-
 drivers/block/drbd/drbd_main.c              | 2 +-
 drivers/block/floppy.c                      | 2 +-
 drivers/block/hd.c                          | 2 +-
 drivers/block/loop.c                        | 2 +-
 drivers/block/mg_disk.c                     | 2 +-
 drivers/block/mtip32xx/mtip32xx.c           | 2 +-
 drivers/block/nbd.c                         | 2 +-
 drivers/block/null_blk.c                    | 2 +-
 drivers/block/osdblk.c                      | 2 +-
 drivers/block/paride/pcd.c                  | 2 +-
 drivers/block/paride/pd.c                   | 2 +-
 drivers/block/paride/pf.c                   | 2 +-
 drivers/block/pktcdvd.c                     | 2 +-
 drivers/block/ps3disk.c                     | 2 +-
 drivers/block/ps3vram.c                     | 2 +-
 drivers/block/rbd.c                         | 2 +-
 drivers/block/rsxx/dev.c                    | 2 +-
 drivers/block/skd_main.c                    | 2 +-
 drivers/block/sunvdc.c                      | 2 +-
 drivers/block/swim.c                        | 2 +-
 drivers/block/swim3.c                       | 2 +-
 drivers/block/sx8.c                         | 2 +-
 drivers/block/umem.c                        | 2 +-
 drivers/block/virtio_blk.c                  | 2 +-
 drivers/block/xen-blkfront.c                | 2 +-
 drivers/block/xsysace.c                     | 2 +-
 drivers/block/z2ram.c                       | 2 +-
 drivers/block/zram/zram_drv.c               | 2 +-
 drivers/cdrom/gdrom.c                       | 2 +-
 drivers/ide/ide-cd.c                        | 2 +-
 drivers/ide/ide-gd.c                        | 2 +-
 drivers/lightnvm/core.c                     | 2 +-
 drivers/md/bcache/super.c                   | 4 ++--
 drivers/md/dm.c                             | 2 +-
 drivers/md/md.c                             | 2 +-
 drivers/memstick/core/ms_block.c            | 2 +-
 drivers/memstick/core/mspro_block.c         | 2 +-
 drivers/mmc/card/block.c                    | 2 +-
 drivers/mtd/mtd_blkdevs.c                   | 2 +-
 drivers/mtd/ubi/block.c                     | 2 +-
 drivers/nvdimm/blk.c                        | 2 +-
 drivers/nvdimm/btt.c                        | 2 +-
 drivers/nvdimm/pmem.c                       | 2 +-
 drivers/nvme/host/core.c                    | 2 +-
 drivers/s390/block/dasd_genhd.c             | 2 +-
 drivers/s390/block/dcssblk.c                | 2 +-
 drivers/s390/block/scm_blk.c                | 2 +-
 drivers/s390/block/xpram.c                  | 2 +-
 drivers/sbus/char/jsflash.c                 | 2 +-
 drivers/scsi/sd.c                           | 2 +-
 drivers/scsi/sr.c                           | 2 +-
 drivers/staging/lustre/lustre/llite/lloop.c | 2 +-
 include/linux/genhd.h                       | 2 +-
 64 files changed, 67 insertions(+), 66 deletions(-)

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index e9110b9..4252568 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -138,7 +138,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 	set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
 	dev->disk->queue = dev->queue;
 
-	add_disk(dev->disk);
+	add_disk(dev->disk, true);
 
 	list_add_tail(&dev->list, &nfhd_list);
 
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index ff75d70..4efd69b 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -238,7 +238,7 @@ static int axon_ram_probe(struct platform_device *device)
 	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
 	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
 	blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
-	add_disk(bank->disk);
+	add_disk(bank->disk, true);
 
 	bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0);
 	if (bank->irq_id == NO_IRQ) {
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 17e96dc..c2eea65 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -828,7 +828,7 @@ static int ubd_disk_register(int major, u64 size, int unit,
 
 	disk->private_data = &ubd_devs[unit];
 	disk->queue = ubd_devs[unit].queue;
-	add_disk(disk);
+	add_disk(disk, true);
 
 	*disk_out = disk;
 	return 0;
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index f58a4e6..59951a5 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -288,7 +288,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
 	dev->gd->private_data = dev;
 	snprintf(dev->gd->disk_name, 32, "simdisk%d", which);
 	set_capacity(dev->gd, 0);
-	add_disk(dev->gd);
+	add_disk(dev->gd, true);
 
 	dev->procfile = proc_create_data(tmp, 0644, procdir, &fops, dev);
 	return 0;
diff --git a/block/genhd.c b/block/genhd.c
index 9f42526..8e1bfa1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -575,13 +575,14 @@ exit:
 /**
  * add_disk - add partitioning information to kernel list
  * @disk: per-device partitioning information
+ * @gen_uevent: whether to generate the KOBJ_ADD uevent
  *
  * This function registers the partitioning information in @disk
  * with the kernel.
  *
  * FIXME: error handling
  */
-void add_disk(struct gendisk *disk)
+void add_disk(struct gendisk *disk, bool gen_uevent)
 {
 	struct backing_dev_info *bdi;
 	dev_t devt;
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 811e11c..c18fc2c 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3175,7 +3175,7 @@ DAC960_Probe(struct pci_dev *dev, const struct pci_device_id *entry)
 
   for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++) {
         set_capacity(Controller->disks[disk], disk_size(Controller, disk));
-        add_disk(Controller->disks[disk]);
+        add_disk(Controller->disks[disk], true);
   }
   DAC960_CreateProcEntries(Controller);
   return 0;
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 5fd50a2..c226b30 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1738,7 +1738,7 @@ static int __init fd_probe_drives(void)
 		sprintf(disk->disk_name, "fd%d", drive);
 		disk->private_data = &unit[drive];
 		set_capacity(disk, 880*2);
-		add_disk(disk);
+		add_disk(disk, true);
 	}
 	if ((drives > 0) || (nomem == 0)) {
 		if (drives == 0)
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index ec9d861..e91c5f1 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -417,7 +417,7 @@ aoeblk_gdalloc(void *vp)
 
 	spin_unlock_irqrestore(&d->lock, flags);
 
-	add_disk(gd);
+	add_disk(gd, true);
 	aoedisk_add_sysfs(d);
 	aoedisk_add_debugfs(d);
 
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 2104b1b..0feae71 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1969,7 +1969,7 @@ static int __init atari_floppy_init (void)
 		if (!unit[i].disk->queue)
 			goto Enomem;
 		set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
-		add_disk(unit[i].disk);
+		add_disk(unit[i].disk, true);
 	}
 
 	blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c04bd9b..7101343 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -552,7 +552,7 @@ static struct brd_device *brd_init_one(int i, bool *new)
 
 	brd = brd_alloc(i);
 	if (brd) {
-		add_disk(brd->brd_disk);
+		add_disk(brd->brd_disk, true);
 		list_add_tail(&brd->brd_list, &brd_devices);
 	}
 	*new = true;
@@ -620,7 +620,7 @@ static int __init brd_init(void)
 	/* point of no return */
 
 	list_for_each_entry(brd, &brd_devices, brd_list)
-		add_disk(brd->brd_disk);
+		add_disk(brd->brd_disk, true);
 
 	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
 				  THIS_MODULE, brd_probe, NULL, NULL);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 63c2064..ab93075 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1973,7 +1973,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	/* allows the interrupt handler to start the queue */
 	wmb();
 	h->drv[drv_index]->queue = disk->queue;
-	add_disk(disk);
+	add_disk(disk, true);
 	return 0;
 
 cleanup_queue:
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2ba1494..7c54597 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2820,7 +2820,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 		goto out_idr_remove_vol;
 	}
 
-	add_disk(disk);
+	add_disk(disk, true);
 
 	/* inherit the connection state */
 	device->state.conn = first_connection(resource)->cstate;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 84708a5..228e8f7 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4350,7 +4350,7 @@ static int __init do_floppy_init(void)
 		disks[drive]->private_data = (void *)(long)drive;
 		disks[drive]->flags |= GENHD_FL_REMOVABLE;
 		disks[drive]->driverfs_dev = &floppy_device[drive].dev;
-		add_disk(disks[drive]);
+		add_disk(disks[drive], true);
 	}
 
 	return 0;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 3abb121..6751c42 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -767,7 +767,7 @@ static int __init hd_init(void)
 
 	/* Let them fly */
 	for (drive = 0; drive < NR_HD; drive++)
-		add_disk(hd_gendisk[drive]);
+		add_disk(hd_gendisk[drive], true);
 
 	return 0;
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1fa8cc2..3223a23 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1800,7 +1800,7 @@ static int loop_add(struct loop_device **l, int i)
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
 	sprintf(disk->disk_name, "loop%d", i);
-	add_disk(disk);
+	add_disk(disk, true);
 	*l = lo;
 	return lo->lo_number;
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 145ce2a..9def46b 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -1009,7 +1009,7 @@ static int mg_probe(struct platform_device *plat_dev)
 
 	set_capacity(host->gd, host->n_sectors);
 
-	add_disk(host->gd);
+	add_disk(host->gd, true);
 
 	return err;
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 6053e46..2d09fae 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4042,7 +4042,7 @@ skip_create_disk:
 	set_capacity(dd->disk, capacity);
 
 	/* Enable the block device and add it to /dev */
-	add_disk(dd->disk);
+	add_disk(dd->disk, true);
 
 	dd->bdev = bdget_disk(dd->disk, 0);
 	/*
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6a48ed4..4d011c1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1089,7 +1089,7 @@ static int __init nbd_init(void)
 		disk->private_data = &nbd_dev[i];
 		sprintf(disk->disk_name, "nbd%d", i);
 		nbd_reset(&nbd_dev[i]);
-		add_disk(disk);
+		add_disk(disk, true);
 	}
 
 	return 0;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cab9759..bc4c495 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -740,7 +740,7 @@ static int null_add_dev(void)
 	disk->queue		= nullb->q;
 	strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
 
-	add_disk(disk);
+	add_disk(disk, true);
 
 done:
 	mutex_lock(&lock);
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index c2854a2..5a750e2 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -448,7 +448,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
 
 	/* finally, announce the disk to the world */
 	set_capacity(disk, obj_size / 512ULL);
-	add_disk(disk);
+	add_disk(disk, true);
 
 	printk(KERN_INFO "%s: Added of size 0x%llx\n",
 		disk->disk_name, (unsigned long long)obj_size);
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 9336236..9d03cbe 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -970,7 +970,7 @@ static int __init pcd_init(void)
 			register_cdrom(&cd->info);
 			cd->disk->private_data = cd;
 			cd->disk->queue = pcd_queue;
-			add_disk(cd->disk);
+			add_disk(cd->disk, true);
 		}
 	}
 
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 78a39f7..da45f07 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -906,7 +906,7 @@ static int pd_detect(void)
 	for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
 		if (disk->gd) {
 			set_capacity(disk->gd, disk->capacity);
-			add_disk(disk->gd);
+			add_disk(disk->gd, true);
 			found = 1;
 		}
 	}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 7a7d977..9b7f885 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -989,7 +989,7 @@ static int __init pf_init(void)
 			continue;
 		disk->private_data = pf;
 		disk->queue = pf_queue;
-		add_disk(disk);
+		add_disk(disk, true);
 	}
 	return 0;
 }
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index d06c62e..00928406 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2785,7 +2785,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	disk->events = pd->bdev->bd_disk->events;
 	disk->async_events = pd->bdev->bd_disk->async_events;
 
-	add_disk(disk);
+	add_disk(disk, true);
 
 	pkt_sysfs_dev_new(pd);
 	pkt_debugfs_dev_new(pd);
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 4b7e405..bd72d79 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -499,7 +499,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
 		 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
 		 get_capacity(gendisk) >> 11);
 
-	add_disk(gendisk);
+	add_disk(gendisk, true);
 	return 0;
 
 fail_cleanup_queue:
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 56847fc..ab6acc5 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -780,7 +780,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
 	dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n",
 		 gendisk->disk_name, get_capacity(gendisk) >> 11);
 
-	add_disk(gendisk);
+	add_disk(gendisk, true);
 	return 0;
 
 fail_cleanup_queue:
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 81666a5..bee01ae 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5158,7 +5158,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 	up_write(&rbd_dev->header_rwsem);
 
-	add_disk(rbd_dev->disk);
+	add_disk(rbd_dev->disk, true);
 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
 		(unsigned long long) rbd_dev->mapping.size);
 
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index e1b8b70..b4dc913 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -230,7 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
 			set_capacity(card->gendisk, card->size8 >> 9);
 		else
 			set_capacity(card->gendisk, 0);
-		add_disk(card->gendisk);
+		add_disk(card->gendisk, true);
 
 		card->bdev_attached = 1;
 	}
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 910e065..a7de9bb 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -4693,7 +4693,7 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 static int skd_bdev_attach(struct skd_device *skdev)
 {
 	pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
-	add_disk(skdev->disk);
+	add_disk(skdev->disk, true);
 	return 0;
 }
 
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 4b911ed..c762c31 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -835,7 +835,7 @@ static int probe_disk(struct vdc_port *port)
 	       port->vdisk_size, (port->vdisk_size >> (20 - 9)),
 	       port->vio.ver.major, port->vio.ver.minor);
 
-	add_disk(g);
+	add_disk(g, true);
 
 	return 0;
 }
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b5afd49..c1ff0ee 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -858,7 +858,7 @@ static int swim_floppy_init(struct swim_priv *swd)
 		swd->unit[drive].disk->private_data = &swd->unit[drive];
 		swd->unit[drive].disk->queue = swd->queue;
 		set_capacity(swd->unit[drive].disk, 2880);
-		add_disk(swd->unit[drive].disk);
+		add_disk(swd->unit[drive].disk, true);
 	}
 
 	blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c264f2d..6940593 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1240,7 +1240,7 @@ static int swim3_attach(struct macio_dev *mdev,
 	disk->flags |= GENHD_FL_REMOVABLE;
 	sprintf(disk->disk_name, "fd%d", index);
 	set_capacity(disk, 2880);
-	add_disk(disk);
+	add_disk(disk, true);
 
 	return 0;
 }
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index ba4bfe9..ed285f0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1333,7 +1333,7 @@ static void carm_fsm_task (struct work_struct *work)
 				struct gendisk *disk = port->disk;
 
 				set_capacity(disk, port->capacity);
-				add_disk(disk);
+				add_disk(disk, true);
 				activated++;
 			}
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 7939b9f..5f6ed17 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -1098,7 +1098,7 @@ static int __init mm_init(void)
 		disk->private_data = &cards[i];
 		disk->queue = cards[i].queue;
 		set_capacity(disk, cards[i].mm_size << 1);
-		add_disk(disk);
+		add_disk(disk, true);
 	}
 
 	init_battery_timer();
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42758b5..f3a59f9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -733,7 +733,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	virtio_device_ready(vdev);
 
-	add_disk(vblk->disk);
+	add_disk(vblk->disk, true);
 	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
 	if (err)
 		goto out_del_disk;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2e6d1e9..175983a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2452,7 +2452,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	for (i = 0; i < info->nr_rings; i++)
 		kick_pending_request_queues(&info->rinfo[i]);
 
-	add_disk(info->gd);
+	add_disk(info->gd, true);
 
 	info->is_ready = 1;
 }
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index c4328d9..7e060e0 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -1057,7 +1057,7 @@ static int ace_setup(struct ace_device *ace)
 	ace_revalidate_disk(ace->gd);
 
 	/* Make the sysace device 'live' */
-	add_disk(ace->gd);
+	add_disk(ace->gd, true);
 
 	return 0;
 
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 968f9e5..91ccbfb 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -364,7 +364,7 @@ z2_init(void)
     sprintf(z2ram_gendisk->disk_name, "z2ram");
 
     z2ram_gendisk->queue = z2_queue;
-    add_disk(z2ram_gendisk);
+    add_disk(z2ram_gendisk, true);
     blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE,
 				z2_find, NULL, NULL);
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8fcad8b..d735513 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1287,7 +1287,7 @@ static int zram_add(void)
 		zram->disk->queue->limits.discard_zeroes_data = 0;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
 
-	add_disk(zram->disk);
+	add_disk(zram->disk, true);
 
 	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
 				&zram_disk_attr_group);
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 584bc31..25403ee 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -817,7 +817,7 @@ static int probe_gdrom(struct platform_device *devptr)
 	gd.toc = kzalloc(sizeof(struct gdromtoc), GFP_KERNEL);
 	if (!gd.toc)
 		goto probe_fail_toc;
-	add_disk(gd.disk);
+	add_disk(gd.disk, true);
 	return 0;
 
 probe_fail_toc:
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index ef907fd..39125d0 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1780,7 +1780,7 @@ static int ide_cd_probe(ide_drive_t *drive)
 	ide_cd_read_toc(drive, &sense);
 	g->fops = &idecd_ops;
 	g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-	add_disk(g);
+	add_disk(g, true);
 	return 0;
 
 out_free_disk:
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 838996a..6a82457 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -417,7 +417,7 @@ static int ide_gd_probe(ide_drive_t *drive)
 	if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
 		g->flags = GENHD_FL_REMOVABLE;
 	g->fops = &ide_gd_ops;
-	add_disk(g);
+	add_disk(g, true);
 	return 0;
 
 out_free_disk:
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 160c1a6..392886c 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -843,7 +843,7 @@ static int nvm_create_target(struct nvm_dev *dev,
 	blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
 
 	set_capacity(tdisk, tt->capacity(targetdata));
-	add_disk(tdisk);
+	add_disk(tdisk, true);
 
 	t->type = tt;
 	t->disk = tdisk;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f5dbb4e..ecb401c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -865,7 +865,7 @@ void bch_cached_dev_run(struct cached_dev *dc)
 		closure_sync(&cl);
 	}
 
-	add_disk(d->disk);
+	add_disk(d->disk, true);
 	bd_link_disk_holder(dc->bdev, dc->disk.disk);
 	/* won't show up in the uevent file, use udevadm monitor -e instead
 	 * only class / kset properties are persistent */
@@ -1228,7 +1228,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 
 	bcache_device_attach(d, c, u - c->uuids);
 	bch_flash_dev_request_init(d);
-	add_disk(d->disk);
+	add_disk(d->disk, true);
 
 	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
 		goto err;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1b2f962..ad48d4e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2399,7 +2399,7 @@ static struct mapped_device *alloc_dev(int minor)
 	md->disk->queue = md->queue;
 	md->disk->private_data = md;
 	sprintf(md->disk->disk_name, "dm-%d", minor);
-	add_disk(md->disk);
+	add_disk(md->disk, true);
 	format_dev_t(md->name, MKDEV(_major, minor));
 
 	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866825f..1391c72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5055,7 +5055,7 @@ static int md_alloc(dev_t dev, char *name)
 	 * through to md_open, so make sure it doesn't get too far
 	 */
 	mutex_lock(&mddev->open_mutex);
-	add_disk(disk);
+	add_disk(disk, true);
 
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 3cd6815..9fe45ba 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2163,7 +2163,7 @@ static int msb_init_disk(struct memstick_dev *card)
 		set_disk_ro(msb->disk, 1);
 
 	msb_start(card);
-	add_disk(msb->disk);
+	add_disk(msb->disk, true);
 	dbg("Disk added");
 	return 0;
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 0fb27d3..cd4e16d 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1255,7 +1255,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 	set_capacity(msb->disk, capacity);
 	dev_dbg(&card->dev, "capacity set %ld\n", capacity);
 
-	add_disk(msb->disk);
+	add_disk(msb->disk, true);
 	msb->active = 1;
 	return 0;
 
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index e62fde3..94cf51e 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -2457,7 +2457,7 @@ static int mmc_add_disk(struct mmc_blk_data *md)
 	int ret;
 	struct mmc_card *card = md->queue.card;
 
-	add_disk(md->disk);
+	add_disk(md->disk, true);
 	md->force_ro.show = force_ro_show;
 	md->force_ro.store = force_ro_store;
 	sysfs_attr_init(&md->force_ro.attr);
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 74ae243..ab3bc22 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -436,7 +436,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
 	if (new->readonly)
 		set_disk_ro(gd, 1);
 
-	add_disk(gd);
+	add_disk(gd, true);
 
 	if (new->disk_attributes) {
 		ret = sysfs_create_group(&disk_to_dev(gd)->kobj,
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index ebf46ad..125716d 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -445,7 +445,7 @@ int ubiblock_create(struct ubi_volume_info *vi)
 	mutex_unlock(&devices_mutex);
 
 	/* Must be the last step: anyone can call file ops from now on */
-	add_disk(dev->gd);
+	add_disk(dev->gd, true);
 	dev_info(disk_to_dev(dev->gd), "created from ubi%d:%d(%s)",
 		 dev->ubi_num, dev->vol_id, vi->name);
 	return 0;
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 495e06d9..0175c6c 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -294,7 +294,7 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
 	set_capacity(disk, 0);
-	add_disk(disk);
+	add_disk(disk, true);
 
 	if (nsblk_meta_size(nsblk)) {
 		int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 68a7c3c..4a54a92 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1258,7 +1258,7 @@ static int btt_blk_init(struct btt *btt)
 	btt->btt_queue->queuedata = btt;
 
 	set_capacity(btt->btt_disk, 0);
-	add_disk(btt->btt_disk);
+	add_disk(btt->btt_disk, true);
 	if (btt_meta_size(btt)) {
 		int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 608fc44..b23a946 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -304,7 +304,7 @@ static int pmem_attach_disk(struct device *dev,
 		return -ENOMEM;
 	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
 	disk->bb = &pmem->bb;
-	add_disk(disk);
+	add_disk(disk, true);
 	revalidate_disk(disk);
 
 	return 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1a51584..fd70894 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1462,7 +1462,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (ns->type == NVME_NS_LIGHTNVM)
 		return;
 
-	add_disk(ns->disk);
+	add_disk(ns->disk, true);
 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group))
 		pr_warn("%s: failed to create sysfs group for identification\n",
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 31d544a..5db81d5 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -76,7 +76,7 @@ int dasd_gendisk_alloc(struct dasd_block *block)
 	gdp->queue = block->request_queue;
 	block->gdp = gdp;
 	set_capacity(block->gdp, 0);
-	add_disk(block->gdp);
+	add_disk(block->gdp, true);
 	return 0;
 }
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index bed53c4..63a6fba 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -655,7 +655,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 		goto put_dev;
 
 	get_device(&dev_info->dev);
-	add_disk(dev_info->gd);
+	add_disk(dev_info->gd, true);
 
 	switch (dev_info->segment_type) {
 		case SEG_TYPE_SR:
diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c
index e6f54d3..c3110e0 100644
--- a/drivers/s390/block/scm_blk.c
+++ b/drivers/s390/block/scm_blk.c
@@ -531,7 +531,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
 
 	/* 512 byte sectors */
 	set_capacity(bdev->gendisk, scmdev->size >> 9);
-	add_disk(bdev->gendisk);
+	add_disk(bdev->gendisk, true);
 	return 0;
 
 out_queue:
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 288f59a..e18c58c 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -377,7 +377,7 @@ static int __init xpram_setup_blkdev(void)
 		disk->queue = xpram_queues[i];
 		sprintf(disk->disk_name, "slram%d", i);
 		set_capacity(disk, xpram_sizes[i] << 1);
-		add_disk(disk);
+		add_disk(disk, true);
 	}
 
 	return 0;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index a40ee1e..d45e193 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -590,7 +590,7 @@ static int jsfd_init(void)
 		set_capacity(disk, jdp->dsize >> 9);
 		disk->private_data = jdp;
 		disk->queue = jsf_queue;
-		add_disk(disk);
+		add_disk(disk, true);
 		set_disk_ro(disk, 1);
 	}
 	return 0;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 60bff78..35515a2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2996,7 +2996,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 	}
 
 	blk_pm_runtime_init(sdp->request_queue, dev);
-	add_disk(gd);
+	add_disk(gd, true);
 	if (sdkp->capacity)
 		sd_dif_config_host(sdkp);
 
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 64c8674..57ea120 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -730,7 +730,7 @@ static int sr_probe(struct device *dev)
 
 	dev_set_drvdata(dev, cd);
 	disk->flags |= GENHD_FL_REMOVABLE;
-	add_disk(disk);
+	add_disk(disk, true);
 
 	sdev_printk(KERN_DEBUG, sdev,
 		    "Attached scsi CD-ROM %s\n", cd->cdi.name);
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
index 813a9a3..748f6fd 100644
--- a/drivers/staging/lustre/lustre/llite/lloop.c
+++ b/drivers/staging/lustre/lustre/llite/lloop.c
@@ -835,7 +835,7 @@ static int __init lloop_init(void)
 
 	/* We cannot fail after we call this, so another loop!*/
 	for (i = 0; i < max_loop; i++)
-		add_disk(disks[i]);
+		add_disk(disks[i], true);
 	return 0;
 
 out_mem4:
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 359a8e4..038be80 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -414,7 +414,7 @@ static inline void free_part_info(struct hd_struct *part)
 extern void part_round_stats(int cpu, struct hd_struct *part);
 
 /* block/genhd.c */
-extern void add_disk(struct gendisk *disk);
+extern void add_disk(struct gendisk *disk, bool gen_uevent);
 extern void del_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
-- 
2.9.0

^ permalink raw reply related

* [PATCH v2 00/12] gendisk: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30  1:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
	linux-nvme, virtualization, Keith Busch, Paul Mackerras,
	Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
	famz, Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe,
	linux-raid, David Woodhouse, linux-mmc, Minchan Kim, linux-mtd,
	Brian Norris, linuxppc-dev

A race condition was noticed between add_disk() and the creation of disk
attributes on virtio-blk hotplug.

Userspace listens to the KOBJ_ADD uevent generated in add_disk(). At that
point we haven't created the serial attribute file, therefore depending
on how fast udev reacts, the /dev/disk/by-id/ entry doesn't always get
created.

As pointed out by Christoph Hellwig in the specific fix [1], virtio-blk is not
the only driver that suffers from this, so we cannot count on every driver to
send events manually. Moreover as suggested in uevent documentation, it is
advised to defer the KOBJ_ADD event until all attributes are ready:

Documentation/kobject.txt:
> Use the KOBJ_ADD action for when the kobject is first added to the kernel.
> This should be done only after any attributes or children of the kobject
> have been initialized properly, as userspace will instantly start to look
> for them when this call happens.

Unfortunately it seems impossible to fix this generally without touching the
offending callers.  The approach I'm proposing here is adding a flag to
suppress uevent in add_disk(), which is patch 1, then in later patches, convert
any caller to only trigger the uevent when attributes are added.

[1] https://lkml.org/lkml/2016/6/28/550

Fam Zheng (12):
  genhd: Add "gen_uevent" parameter to add_disk
  genhd: Honor gen_uevent and add disk_gen_uevents
  virtio-blk: Generate uevent after attribute available
  axonrom: Generate uevent after attribute available
  aoeblk: Generate uevent after attribute available
  mtip32xx: Generate uevent after attribute available
  pktcdvd: Generate uevent after attribute available
  zram: Generate uevent after attribute available
  md: Generate uevent after attribute available
  mmc: Generate uevent after attribute available
  mtd: Generate uevent after attribute available
  nvme: Generate uevent after attribute available

 arch/m68k/emu/nfblock.c                     |  2 +-
 arch/powerpc/sysdev/axonram.c               |  3 ++-
 arch/um/drivers/ubd_kern.c                  |  2 +-
 arch/xtensa/platforms/iss/simdisk.c         |  2 +-
 block/genhd.c                               | 26 +++++++++++++++++++++-----
 drivers/block/DAC960.c                      |  2 +-
 drivers/block/amiflop.c                     |  2 +-
 drivers/block/aoe/aoeblk.c                  |  3 ++-
 drivers/block/ataflop.c                     |  2 +-
 drivers/block/brd.c                         |  4 ++--
 drivers/block/cciss.c                       |  2 +-
 drivers/block/drbd/drbd_main.c              |  2 +-
 drivers/block/floppy.c                      |  2 +-
 drivers/block/hd.c                          |  2 +-
 drivers/block/loop.c                        |  2 +-
 drivers/block/mg_disk.c                     |  2 +-
 drivers/block/mtip32xx/mtip32xx.c           |  3 ++-
 drivers/block/nbd.c                         |  2 +-
 drivers/block/null_blk.c                    |  2 +-
 drivers/block/osdblk.c                      |  2 +-
 drivers/block/paride/pcd.c                  |  2 +-
 drivers/block/paride/pd.c                   |  2 +-
 drivers/block/paride/pf.c                   |  2 +-
 drivers/block/pktcdvd.c                     |  4 +++-
 drivers/block/ps3disk.c                     |  2 +-
 drivers/block/ps3vram.c                     |  2 +-
 drivers/block/rbd.c                         |  2 +-
 drivers/block/rsxx/dev.c                    |  2 +-
 drivers/block/skd_main.c                    |  2 +-
 drivers/block/sunvdc.c                      |  2 +-
 drivers/block/swim.c                        |  2 +-
 drivers/block/swim3.c                       |  2 +-
 drivers/block/sx8.c                         |  2 +-
 drivers/block/umem.c                        |  2 +-
 drivers/block/virtio_blk.c                  |  3 ++-
 drivers/block/xen-blkfront.c                |  2 +-
 drivers/block/xsysace.c                     |  2 +-
 drivers/block/z2ram.c                       |  2 +-
 drivers/block/zram/zram_drv.c               |  3 ++-
 drivers/cdrom/gdrom.c                       |  2 +-
 drivers/ide/ide-cd.c                        |  2 +-
 drivers/ide/ide-gd.c                        |  2 +-
 drivers/lightnvm/core.c                     |  2 +-
 drivers/md/bcache/super.c                   |  4 ++--
 drivers/md/dm.c                             |  2 +-
 drivers/md/md.c                             |  3 ++-
 drivers/memstick/core/ms_block.c            |  2 +-
 drivers/memstick/core/mspro_block.c         |  2 +-
 drivers/mmc/card/block.c                    |  3 ++-
 drivers/mtd/mtd_blkdevs.c                   |  3 ++-
 drivers/mtd/ubi/block.c                     |  2 +-
 drivers/nvdimm/blk.c                        |  2 +-
 drivers/nvdimm/btt.c                        |  2 +-
 drivers/nvdimm/pmem.c                       |  2 +-
 drivers/nvme/host/core.c                    |  3 ++-
 drivers/s390/block/dasd_genhd.c             |  2 +-
 drivers/s390/block/dcssblk.c                |  2 +-
 drivers/s390/block/scm_blk.c                |  2 +-
 drivers/s390/block/xpram.c                  |  2 +-
 drivers/sbus/char/jsflash.c                 |  2 +-
 drivers/scsi/sd.c                           |  2 +-
 drivers/scsi/sr.c                           |  2 +-
 drivers/staging/lustre/lustre/llite/lloop.c |  2 +-
 include/linux/genhd.h                       |  3 ++-
 64 files changed, 98 insertions(+), 70 deletions(-)

-- 
2.9.0

^ permalink raw reply

* Re: [PATCH net-next V2] tun: introduce tx skb ring
From: Jason Wang @ 2016-06-30  1:50 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, eric.dumazet, netdev, linux-kernel, virtualization, brouer,
	davem
In-Reply-To: <20160628100907-mutt-send-email-mst@redhat.com>



On 2016年06月28日 15:09, Michael S. Tsirkin wrote:
> On Thu, Jun 23, 2016 at 01:14:07PM +0800, Jason Wang wrote:
>>
>> On 2016年06月23日 02:18, Michael S. Tsirkin wrote:
>>> On Fri, Jun 17, 2016 at 03:41:20AM +0300, Michael S. Tsirkin wrote:
>>>>> Would it help to have ptr_ring_resize that gets an array of
>>>>> rings and resizes them both to same length?
>>> OK, here it is. Untested so far, and no skb wrapper.
>>> Pls let me know whether this is what you had in mind.
>> Exactly what I want.
>>
>> Thanks
> Ok and this for skb_array
>
> -->
> skb_array: add wrappers for resizing
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> --
>
> diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
> index c900708..7e01c1f 100644
> --- a/include/linux/skb_array.h
> +++ b/include/linux/skb_array.h
> @@ -151,16 +151,24 @@ static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
>   	return ptr_ring_init(&a->ring, size, 0, gfp);
>   }
>   
> -void __skb_array_destroy_skb(void *ptr)
> +static void __skb_array_destroy_skb(void *ptr)
>   {
>   	kfree_skb(ptr);
>   }
>   
> -int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
> +static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
>   {
>   	return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
>   }

Will split up the above tweak into another patch when reposting.

>   
> +static inline int skb_raay_resize_multiple(struct skb_array **rings, int nrings,

I think you mean 'skb_array_resize_multiple' here.

> +					   int size, gfp_t gfp)
> +{
> +	BUILD_BUG_ON(offsetof(struct skb_array, ring));
> +	ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp,
> +				 __skb_array_destroy_skb);

This should be return ptr_ring_resize_multiple(...

> +}
> +
>   static inline void skb_array_cleanup(struct skb_array *a)
>   {
>   	ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);

With this, looks like there's no need for a new flag. Will repost the 
series with those two patches.

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH v2 kernel 7/7] virtio-balloon: tell host vm's free page info
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, virtualization
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

Support the request for the VM's free page information and respond with
a page bitmap. QEMU can make use of this free page bitmap to speed
up the live migration process by skipping the free pages.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 104 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 98 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 2d18ff6..5ca4ad3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -62,10 +62,13 @@ module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
 extern unsigned long get_max_pfn(void);
+extern int get_free_pages(unsigned long start_pfn, unsigned long end_pfn,
+		unsigned long *bitmap, unsigned long len);
+
 
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *misc_vq;
 
 	/* The balloon servicing is delegated to a freezable workqueue. */
 	struct work_struct update_balloon_stats_work;
@@ -89,6 +92,8 @@ struct virtio_balloon {
 	unsigned long pfn_limit;
 	/* Used to record the processed pfn range */
 	unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
+	/* Request header */
+	struct balloon_req_hdr req_hdr;
 	/*
 	 * The pages we've told the Host we're not using are enqueued
 	 * at vb_dev_info->pages list.
@@ -373,6 +378,49 @@ static void update_balloon_stats(struct virtio_balloon *vb)
 				pages_to_bytes(available));
 }
 
+static void update_free_pages_stats(struct virtio_balloon *vb,
+				unsigned long req_id)
+{
+	struct scatterlist sg_in, sg_out;
+	unsigned long pfn = 0, bmap_len, max_pfn;
+	struct virtqueue *vq = vb->misc_vq;
+	struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+	int ret = 1;
+
+	max_pfn = get_max_pfn();
+	mutex_lock(&vb->balloon_lock);
+	while (pfn < max_pfn) {
+		memset(vb->page_bitmap, 0, vb->bmap_len);
+		ret = get_free_pages(pfn, pfn + vb->pfn_limit,
+			vb->page_bitmap, vb->bmap_len * BITS_PER_BYTE);
+		hdr->cmd = cpu_to_virtio16(vb->vdev, BALLOON_GET_FREE_PAGES);
+		hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+		hdr->req_id = cpu_to_virtio64(vb->vdev, req_id);
+		hdr->start_pfn = cpu_to_virtio64(vb->vdev, pfn);
+		bmap_len = vb->pfn_limit / BITS_PER_BYTE;
+		if (!ret) {
+			hdr->flag = cpu_to_virtio16(vb->vdev,
+							BALLOON_FLAG_DONE);
+			if (pfn + vb->pfn_limit > max_pfn)
+				bmap_len = (max_pfn - pfn) / BITS_PER_BYTE;
+		} else
+			hdr->flag = cpu_to_virtio16(vb->vdev,
+							BALLOON_FLAG_CONT);
+		hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+		sg_init_one(&sg_out, hdr,
+			 sizeof(struct balloon_bmap_hdr) + bmap_len);
+
+		virtqueue_add_outbuf(vq, &sg_out, 1, vb, GFP_KERNEL);
+		virtqueue_kick(vq);
+		pfn += vb->pfn_limit;
+	}
+
+	sg_init_one(&sg_in, &vb->req_hdr, sizeof(vb->req_hdr));
+	virtqueue_add_inbuf(vq, &sg_in, 1, &vb->req_hdr, GFP_KERNEL);
+	virtqueue_kick(vq);
+	mutex_unlock(&vb->balloon_lock);
+}
+
 /*
  * While most virtqueues communicate guest-initiated requests to the hypervisor,
  * the stats queue operates in reverse.  The driver initializes the virtqueue
@@ -511,18 +559,49 @@ static void update_balloon_size_func(struct work_struct *work)
 		queue_work(system_freezable_wq, work);
 }
 
+static void misc_handle_rq(struct virtio_balloon *vb)
+{
+	struct balloon_req_hdr *ptr_hdr;
+	unsigned int len;
+
+	ptr_hdr = virtqueue_get_buf(vb->misc_vq, &len);
+	if (!ptr_hdr || len != sizeof(vb->req_hdr))
+		return;
+
+	switch (ptr_hdr->cmd) {
+	case BALLOON_GET_FREE_PAGES:
+		update_free_pages_stats(vb, ptr_hdr->param);
+		break;
+	default:
+		break;
+	}
+}
+
+static void misc_request(struct virtqueue *vq)
+{
+	struct virtio_balloon *vb = vq->vdev->priv;
+
+	misc_handle_rq(vb);
+}
+
 static int init_vqs(struct virtio_balloon *vb)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-	static const char * const names[] = { "inflate", "deflate", "stats" };
+	struct virtqueue *vqs[4];
+	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack,
+					 stats_request, misc_request };
+	static const char * const names[] = { "inflate", "deflate", "stats",
+						 "misc" };
 	int err, nvqs;
 
 	/*
 	 * We expect two virtqueues: inflate and deflate, and
 	 * optionally stat.
 	 */
-	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_MISC_VQ))
+		nvqs = 4;
+	else
+		nvqs = virtio_has_feature(vb->vdev,
+					  VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
 	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names);
 	if (err)
 		return err;
@@ -543,6 +622,16 @@ static int init_vqs(struct virtio_balloon *vb)
 			BUG();
 		virtqueue_kick(vb->stats_vq);
 	}
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_MISC_VQ)) {
+		struct scatterlist sg_in;
+
+		vb->misc_vq = vqs[3];
+		sg_init_one(&sg_in, &vb->req_hdr, sizeof(vb->req_hdr));
+		if (virtqueue_add_inbuf(vb->misc_vq, &sg_in, 1,
+		    &vb->req_hdr, GFP_KERNEL) < 0)
+			BUG();
+		virtqueue_kick(vb->misc_vq);
+	}
 	return 0;
 }
 
@@ -639,8 +728,10 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	vb->bmap_hdr = kzalloc(hdr_len + vb->bmap_len, GFP_KERNEL);
 
 	/* Clear the feature bit if memory allocation fails */
-	if (!vb->bmap_hdr)
+	if (!vb->bmap_hdr) {
 		__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+		__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_MISC_VQ);
+	}
 	else
 		vb->page_bitmap = vb->bmap_hdr + hdr_len;
 	mutex_init(&vb->balloon_lock);
@@ -743,6 +834,7 @@ static unsigned int features[] = {
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_PAGE_BITMAP,
+	VIRTIO_BALLOON_F_MISC_VQ,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 6/7] mm: add the related functions to get free page info
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, Andrew Morton, virtualization,
	Mel Gorman
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

Save the free page info into a page bitmap, which will be used in the
virtio balloon device driver.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2083b40..c2a6669 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4521,6 +4521,52 @@ unsigned long get_max_pfn(void)
 }
 EXPORT_SYMBOL(get_max_pfn);
 
+static void mark_free_pages_bitmap(struct zone *zone, unsigned long start_pfn,
+	unsigned long end_pfn, unsigned long *bitmap, unsigned long len)
+{
+	unsigned long pfn, flags, page_num;
+	unsigned int order, t;
+	struct list_head *curr;
+
+	if (zone_is_empty(zone))
+		return;
+	end_pfn = min(start_pfn + len, end_pfn);
+	spin_lock_irqsave(&zone->lock, flags);
+
+	for_each_migratetype_order(order, t) {
+		list_for_each(curr, &zone->free_area[order].free_list[t]) {
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			if (pfn >= start_pfn && pfn <= end_pfn) {
+				page_num = 1UL << order;
+				if (pfn + page_num > end_pfn)
+					page_num = end_pfn - pfn;
+				bitmap_set(bitmap, pfn - start_pfn, page_num);
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+int get_free_pages(unsigned long start_pfn, unsigned long end_pfn,
+		unsigned long *bitmap, unsigned long len)
+{
+	struct zone *zone;
+	int ret = 0;
+
+	if (bitmap == NULL || start_pfn > end_pfn || start_pfn >= max_pfn)
+		return 0;
+	if (end_pfn < max_pfn)
+		ret = 1;
+	if (end_pfn >= max_pfn)
+		ret = 0;
+
+	for_each_populated_zone(zone)
+		mark_free_pages_bitmap(zone, start_pfn, end_pfn, bitmap, len);
+	return ret;
+}
+EXPORT_SYMBOL(get_free_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
 	zoneref->zone = zone;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 5/7] virtio-balloon: define feature bit and head for misc virt queue
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, virtualization
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

Define a new feature bit which supports a new virtual queue. This
new virtual queue is for information exchange between hypervisor
and guest. The VMM hypervisor can make use of this virtual queue
to request the guest do some operations, e.g. drop page cache,
synchronize file system, etc. And the VMM hypervisor can get some
of guest's runtime information through this virtual queue, e.g. the
guest's free page information, which can be used for live migration
optimization.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 include/uapi/linux/virtio_balloon.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index d3b182a..be4880f 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,6 +35,7 @@
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_PAGE_BITMAP	3 /* Send page info with bitmap */
+#define VIRTIO_BALLOON_F_MISC_VQ	4 /* Misc info virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -101,4 +102,25 @@ struct balloon_bmap_hdr {
 	__virtio64 bmap_len;
 };
 
+enum balloon_req_id {
+	/* Get free pages information */
+	BALLOON_GET_FREE_PAGES,
+};
+
+enum balloon_flag {
+	/* Have more data for a request */
+	BALLOON_FLAG_CONT,
+	/* No more data for a request */
+	BALLOON_FLAG_DONE,
+};
+
+struct balloon_req_hdr {
+	/* Used to distinguish different request */
+	__virtio16 cmd;
+	/* Reserved */
+	__virtio16 reserved[3];
+	/* Request parameter */
+	__virtio64 param;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 kernel 4/7] virtio-balloon: speed up inflate/deflate process
From: Liang Li @ 2016-06-29 10:32 UTC (permalink / raw)
  To: mst
  Cc: virtio-dev, kvm, linux-kernel, Liang Li, qemu-devel, dgilbert,
	Amit Shah, Paolo Bonzini, virtualization
In-Reply-To: <1467196340-22079-1-git-send-email-liang.z.li@intel.com>

The current virtio-balloon implementation is not very
efficient. The time spent on the different stages of inflating
the balloon to 7GB of an 8GB idle guest is:

a. allocating pages (6.5%)
b. sending PFNs to host (68.3%)
c. address translation (6.1%)
d. madvise (19%)

It takes about 4126ms for the inflating process to complete.
Debugging shows that the bottlenecks are stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we
can reduce the overhead in stage b quite a lot. Furthermore, we
can do the address translation and call madvise() with a bulk of
RAM pages, instead of the current page per page way, the overhead
of stage c and stage d can also be reduced a lot.

This patch is the kernel side implementation which is intended to
speed up the inflating & deflating process by adding a new feature
to the virtio-balloon device. With this new feature, inflating the
balloon to 7GB of a 8GB idle guest only takes 590ms, the
performance improvement is about 85%.

TODO: optimize stage a by allocating/freeing a chunk of pages
instead of a single page at a time.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 184 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 162 insertions(+), 22 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8d649a2..2d18ff6 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -41,10 +41,28 @@
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+/*
+ * VIRTIO_BALLOON_PFNS_LIMIT is used to limit the size of page bitmap
+ * to prevent a very large page bitmap, there are two reasons for this:
+ * 1) to save memory.
+ * 2) allocate a large bitmap may fail.
+ *
+ * The actual limit of pfn is determined by:
+ * pfn_limit = min(max_pfn, VIRTIO_BALLOON_PFNS_LIMIT);
+ *
+ * If system has more pages than VIRTIO_BALLOON_PFNS_LIMIT, we will scan
+ * the page list and send the PFNs with several times. To reduce the
+ * overhead of scanning the page list. VIRTIO_BALLOON_PFNS_LIMIT should
+ * be set with a value which can cover most cases.
+ */
+#define VIRTIO_BALLOON_PFNS_LIMIT ((32 * (1ULL << 30)) >> PAGE_SHIFT) /* 32GB */
+
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+extern unsigned long get_max_pfn(void);
+
 struct virtio_balloon {
 	struct virtio_device *vdev;
 	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -62,6 +80,15 @@ struct virtio_balloon {
 
 	/* Number of balloon pages we've told the Host we're not using. */
 	unsigned int num_pages;
+	/* Pointer of the bitmap header. */
+	void *bmap_hdr;
+	/* Bitmap and length used to tell the host the pages */
+	unsigned long *page_bitmap;
+	unsigned long bmap_len;
+	/* Pfn limit */
+	unsigned long pfn_limit;
+	/* Used to record the processed pfn range */
+	unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
 	/*
 	 * The pages we've told the Host we're not using are enqueued
 	 * at vb_dev_info->pages list.
@@ -105,12 +132,45 @@ static void balloon_ack(struct virtqueue *vq)
 	wake_up(&vb->acked);
 }
 
+static inline void init_pfn_range(struct virtio_balloon *vb)
+{
+	vb->min_pfn = ULONG_MAX;
+	vb->max_pfn = 0;
+}
+
+static inline void update_pfn_range(struct virtio_balloon *vb,
+				 struct page *page)
+{
+	unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+	if (balloon_pfn < vb->min_pfn)
+		vb->min_pfn = balloon_pfn;
+	if (balloon_pfn > vb->max_pfn)
+		vb->max_pfn = balloon_pfn;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
 	struct scatterlist sg;
 	unsigned int len;
 
-	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+		struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+		unsigned long bmap_len;
+
+		/* cmd and req_id are not used here, set them to 0 */
+		hdr->cmd = cpu_to_virtio16(vb->vdev, 0);
+		hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+		hdr->reserved = cpu_to_virtio16(vb->vdev, 0);
+		hdr->req_id = cpu_to_virtio64(vb->vdev, 0);
+		hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
+		bmap_len = min(vb->bmap_len,
+			(vb->end_pfn - vb->start_pfn) / BITS_PER_BYTE);
+		hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+		sg_init_one(&sg, hdr,
+			 sizeof(struct balloon_bmap_hdr) + bmap_len);
+	} else
+		sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
 
 	/* We should always be able to add one buffer to an empty queue. */
 	virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
@@ -118,7 +178,6 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 	/* When host has read buffer, this completes via balloon_ack */
 	wait_event(vb->acked, virtqueue_get_buf(vq, &len));
-
 }
 
 static void set_page_pfns(struct virtio_balloon *vb,
@@ -133,13 +192,53 @@ static void set_page_pfns(struct virtio_balloon *vb,
 					  page_to_balloon_pfn(page) + i);
 }
 
-static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
+static void set_page_bitmap(struct virtio_balloon *vb,
+			 struct list_head *pages, struct virtqueue *vq)
+{
+	unsigned long pfn;
+	struct page *page;
+	bool found;
+
+	vb->min_pfn = rounddown(vb->min_pfn, BITS_PER_LONG);
+	vb->max_pfn = roundup(vb->max_pfn, BITS_PER_LONG);
+	for (pfn = vb->min_pfn; pfn < vb->max_pfn;
+			pfn += vb->pfn_limit) {
+		vb->start_pfn = pfn + vb->pfn_limit;
+		vb->end_pfn = pfn;
+		memset(vb->page_bitmap, 0, vb->bmap_len);
+		found = false;
+		list_for_each_entry(page, pages, lru) {
+			unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+			if (balloon_pfn < pfn ||
+				 balloon_pfn >= pfn + vb->pfn_limit)
+				continue;
+			set_bit(balloon_pfn - pfn, vb->page_bitmap);
+			if (balloon_pfn > vb->end_pfn)
+				vb->end_pfn = balloon_pfn;
+			if (balloon_pfn < vb->start_pfn)
+				vb->start_pfn = balloon_pfn;
+			found = true;
+		}
+		if (found) {
+			vb->start_pfn = rounddown(vb->start_pfn, BITS_PER_LONG);
+			vb->end_pfn = roundup(vb->end_pfn, BITS_PER_LONG);
+			tell_host(vb, vq);
+		}
+	}
+}
+
+static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num,
+				 bool use_bmap)
 {
 	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
-	unsigned num_allocated_pages;
+	unsigned int num_allocated_pages;
 
-	/* We can only do one array worth at a time. */
-	num = min(num, ARRAY_SIZE(vb->pfns));
+	if (use_bmap)
+		init_pfn_range(vb);
+	else
+		/* We can only do one array worth at a time. */
+		num = min(num, ARRAY_SIZE(vb->pfns));
 
 	mutex_lock(&vb->balloon_lock);
 	for (vb->num_pfns = 0; vb->num_pfns < num;
@@ -154,7 +253,10 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 			msleep(200);
 			break;
 		}
-		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		if (use_bmap)
+			update_pfn_range(vb, page);
+		else
+			set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
 		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
@@ -163,8 +265,13 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 
 	num_allocated_pages = vb->num_pfns;
 	/* Did we get any? */
-	if (vb->num_pfns != 0)
-		tell_host(vb, vb->inflate_vq);
+	if (vb->num_pfns != 0) {
+		if (use_bmap)
+			set_page_bitmap(vb, &vb_dev_info->pages,
+					vb->inflate_vq);
+		else
+			tell_host(vb, vb->inflate_vq);
+	}
 	mutex_unlock(&vb->balloon_lock);
 
 	return num_allocated_pages;
@@ -184,15 +291,19 @@ static void release_pages_balloon(struct virtio_balloon *vb,
 	}
 }
 
-static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
+static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num,
+				bool use_bmap)
 {
-	unsigned num_freed_pages;
+	unsigned int num_freed_pages;
 	struct page *page;
 	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
 	LIST_HEAD(pages);
 
-	/* We can only do one array worth at a time. */
-	num = min(num, ARRAY_SIZE(vb->pfns));
+	if (use_bmap)
+		init_pfn_range(vb);
+	else
+		/* We can only do one array worth at a time. */
+		num = min(num, ARRAY_SIZE(vb->pfns));
 
 	mutex_lock(&vb->balloon_lock);
 	for (vb->num_pfns = 0; vb->num_pfns < num;
@@ -200,7 +311,10 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 		page = balloon_page_dequeue(vb_dev_info);
 		if (!page)
 			break;
-		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		if (use_bmap)
+			update_pfn_range(vb, page);
+		else
+			set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
 		list_add(&page->lru, &pages);
 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
 	}
@@ -211,9 +325,14 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	 * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
 	 * is true, we *have* to do it in this order
 	 */
-	if (vb->num_pfns != 0)
-		tell_host(vb, vb->deflate_vq);
-	release_pages_balloon(vb, &pages);
+	if (vb->num_pfns != 0) {
+		if (use_bmap)
+			set_page_bitmap(vb, &pages, vb->deflate_vq);
+		else
+			tell_host(vb, vb->deflate_vq);
+
+		release_pages_balloon(vb, &pages);
+	}
 	mutex_unlock(&vb->balloon_lock);
 	return num_freed_pages;
 }
@@ -347,13 +466,15 @@ static int virtballoon_oom_notify(struct notifier_block *self,
 	struct virtio_balloon *vb;
 	unsigned long *freed;
 	unsigned num_freed_pages;
+	bool use_bmap;
 
 	vb = container_of(self, struct virtio_balloon, nb);
 	if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
 		return NOTIFY_OK;
 
 	freed = parm;
-	num_freed_pages = leak_balloon(vb, oom_pages);
+	use_bmap = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+	num_freed_pages = leak_balloon(vb, oom_pages, use_bmap);
 	update_balloon_size(vb);
 	*freed += num_freed_pages;
 
@@ -373,15 +494,17 @@ static void update_balloon_size_func(struct work_struct *work)
 {
 	struct virtio_balloon *vb;
 	s64 diff;
+	bool use_bmap;
 
 	vb = container_of(work, struct virtio_balloon,
 			  update_balloon_size_work);
+	use_bmap = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
 	diff = towards_target(vb);
 
 	if (diff > 0)
-		diff -= fill_balloon(vb, diff);
+		diff -= fill_balloon(vb, diff, use_bmap);
 	else if (diff < 0)
-		diff += leak_balloon(vb, -diff);
+		diff += leak_balloon(vb, -diff, use_bmap);
 	update_balloon_size(vb);
 
 	if (diff)
@@ -489,7 +612,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 static int virtballoon_probe(struct virtio_device *vdev)
 {
 	struct virtio_balloon *vb;
-	int err;
+	int err, hdr_len;
 
 	if (!vdev->config->get) {
 		dev_err(&vdev->dev, "%s failure: config access disabled\n",
@@ -508,6 +631,18 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	spin_lock_init(&vb->stop_update_lock);
 	vb->stop_update = false;
 	vb->num_pages = 0;
+	vb->pfn_limit = VIRTIO_BALLOON_PFNS_LIMIT;
+	vb->pfn_limit = min(vb->pfn_limit, get_max_pfn());
+	vb->bmap_len = ALIGN(vb->pfn_limit, BITS_PER_LONG) /
+		 BITS_PER_BYTE + 2 * sizeof(unsigned long);
+	hdr_len = sizeof(struct balloon_bmap_hdr);
+	vb->bmap_hdr = kzalloc(hdr_len + vb->bmap_len, GFP_KERNEL);
+
+	/* Clear the feature bit if memory allocation fails */
+	if (!vb->bmap_hdr)
+		__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+	else
+		vb->page_bitmap = vb->bmap_hdr + hdr_len;
 	mutex_init(&vb->balloon_lock);
 	init_waitqueue_head(&vb->acked);
 	vb->vdev = vdev;
@@ -541,9 +676,12 @@ out:
 
 static void remove_common(struct virtio_balloon *vb)
 {
+	bool use_bmap;
+
+	use_bmap = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
 	/* There might be pages left in the balloon: free them. */
 	while (vb->num_pages)
-		leak_balloon(vb, vb->num_pages);
+		leak_balloon(vb, vb->num_pages, use_bmap);
 	update_balloon_size(vb);
 
 	/* Now we reset the device so we can clean up the queues. */
@@ -565,6 +703,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
 	cancel_work_sync(&vb->update_balloon_stats_work);
 
 	remove_common(vb);
+	kfree(vb->page_bitmap);
 	kfree(vb);
 }
 
@@ -603,6 +742,7 @@ static unsigned int features[] = {
 	VIRTIO_BALLOON_F_MUST_TELL_HOST,
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+	VIRTIO_BALLOON_F_PAGE_BITMAP,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
-- 
1.8.3.1

^ permalink raw reply related


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.