* [PATCH v24 2/2] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
From: Wei Wang @ 2018-01-24 10:42 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
mhocko, akpm
Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
nilal
In-Reply-To: <1516790562-37889-1-git-send-email-wei.w.wang@intel.com>
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
support of reporting hints of guest free pages to host via virtio-balloon.
Host requests the guest to report free pages by sending a new cmd
id to the guest via the free_page_report_cmd_id configuration register.
When the guest starts to report, the first element added to the free page
vq is the cmd id given by host. When the guest finishes the reporting
of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
to the vq to tell host that the reporting is done. Host may also requests
the guest to stop the reporting in advance by sending the stop cmd id to
the guest via the configuration register.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
---
drivers/virtio/virtio_balloon.c | 265 +++++++++++++++++++++++++++++++-----
include/uapi/linux/virtio_balloon.h | 7 +
2 files changed, 236 insertions(+), 36 deletions(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index a1fb52c..4440873 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -51,9 +51,21 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
static struct vfsmount *balloon_mnt;
#endif
+/* The number of virtqueues supported by virtio-balloon */
+#define VIRTIO_BALLOON_VQ_NUM 4
+#define VIRTIO_BALLOON_VQ_ID_INFLATE 0
+#define VIRTIO_BALLOON_VQ_ID_DEFLATE 1
+#define VIRTIO_BALLOON_VQ_ID_STATS 2
+#define VIRTIO_BALLOON_VQ_ID_FREE_PAGE 3
+
struct virtio_balloon {
struct virtio_device *vdev;
- struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+ struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+ /* Balloon's own wq for cpu-intensive work items */
+ struct workqueue_struct *balloon_wq;
+ /* The free page reporting work item submitted to the balloon wq */
+ struct work_struct report_free_page_work;
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -63,6 +75,13 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
+ /* Start to report free pages */
+ bool report_free_page;
+ /* Stores the cmd id given by host to start the free page reporting */
+ __virtio32 start_cmd_id;
+ /* Stores STOP_ID as a sign to tell host that the reporting is done */
+ __virtio32 stop_cmd_id;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
@@ -281,6 +300,53 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb)
return idx;
}
+static int add_one_sg(struct virtqueue *vq, unsigned long pfn, uint32_t len)
+{
+ struct scatterlist sg;
+ unsigned int unused;
+ int ret = 0;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, pfn_to_page(pfn), len, 0);
+
+ /* Detach all the used buffers from the vq */
+ while (virtqueue_get_buf(vq, &unused))
+ ;
+
+ /*
+ * Since this is an optimization feature, losing a couple of free
+ * pages to report isn't important. We simply return without adding
+ * the page if the vq is full.
+ * We are adding one entry each time, which essentially results in no
+ * memory allocation, so the GFP_KERNEL flag below can be ignored.
+ * There is always one entry reserved for the cmd id to use.
+ */
+ if (vq->num_free > 1)
+ ret = virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
+
+ if (vq->num_free < virtqueue_get_vring_size(vq) / 2)
+ virtqueue_kick(vq);
+
+ return ret;
+}
+
+static void send_cmd_id(struct virtio_balloon *vb, __virtio32 *cmd_id)
+{
+ struct scatterlist sg;
+ struct virtqueue *vq = vb->free_page_vq;
+
+ if (unlikely(!virtio_has_feature(vb->vdev,
+ VIRTIO_BALLOON_F_FREE_PAGE_HINT)))
+ return;
+
+ sg_init_one(&sg, cmd_id, sizeof(*cmd_id));
+
+ if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL))
+ __virtio_clear_bit(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
+
+ virtqueue_kick(vq);
+}
+
/*
* While most virtqueues communicate guest-initiated requests to the hypervisor,
* the stats queue operates in reverse. The driver initializes the virtqueue
@@ -316,17 +382,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
}
-static void virtballoon_changed(struct virtio_device *vdev)
-{
- struct virtio_balloon *vb = vdev->priv;
- unsigned long flags;
-
- spin_lock_irqsave(&vb->stop_update_lock, flags);
- if (!vb->stop_update)
- queue_work(system_freezable_wq, &vb->update_balloon_size_work);
- spin_unlock_irqrestore(&vb->stop_update_lock, flags);
-}
-
static inline s64 towards_target(struct virtio_balloon *vb)
{
s64 target;
@@ -343,6 +398,49 @@ static inline s64 towards_target(struct virtio_balloon *vb)
return target - vb->num_pages;
}
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+ struct virtio_balloon *vb = vdev->priv;
+ unsigned long flags;
+ __u32 cmd_id;
+ s64 diff = towards_target(vb);
+
+ if (diff) {
+ spin_lock_irqsave(&vb->stop_update_lock, flags);
+ if (!vb->stop_update)
+ queue_work(system_freezable_wq,
+ &vb->update_balloon_size_work);
+ spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+ }
+
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ virtio_cread(vdev, struct virtio_balloon_config,
+ free_page_report_cmd_id, &cmd_id);
+ if (cmd_id == VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) {
+ vb->report_free_page = false;
+ } else {
+ /*
+ * The request is queued only when the ack of the
+ * previous request has been sent to host, which is
+ * indicated by start_cmd_id set to
+ * VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID. Otherwise,
+ * simply update the start_cmd_id, and when the
+ * previous queued work runs, the latest cmd id will
+ * be sent to host.
+ */
+ spin_lock_irqsave(&vb->stop_update_lock, flags);
+ if (!vb->stop_update &&
+ virtio32_to_cpu(vdev, vb->start_cmd_id) ==
+ VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID)
+ queue_work(vb->balloon_wq,
+ &vb->report_free_page_work);
+ vb->report_free_page = true;
+ vb->start_cmd_id = cpu_to_virtio32(vdev, cmd_id);
+ spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+ }
+ }
+}
+
static void update_balloon_size(struct virtio_balloon *vb)
{
u32 actual = vb->num_pages;
@@ -417,42 +515,108 @@ static void update_balloon_size_func(struct work_struct *work)
static int init_vqs(struct virtio_balloon *vb)
{
- struct virtqueue *vqs[3];
- vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
- static const char * const names[] = { "inflate", "deflate", "stats" };
- int err, nvqs;
+ struct virtqueue *vqs[VIRTIO_BALLOON_VQ_NUM];
+ vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_NUM];
+ const char *names[VIRTIO_BALLOON_VQ_NUM];
+ struct scatterlist sg;
+ int ret;
/*
- * We expect two virtqueues: inflate and deflate, and
- * optionally stat.
+ * Inflateq and deflateq are used unconditionally. The names[]
+ * will be NULL if the related feature is not enabled, which will
+ * cause no allocation for the corresponding virtqueue in find_vqs.
*/
- nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
- err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
- if (err)
- return err;
+ callbacks[VIRTIO_BALLOON_VQ_ID_INFLATE] = balloon_ack;
+ names[VIRTIO_BALLOON_VQ_ID_INFLATE] = "inflate";
+ callbacks[VIRTIO_BALLOON_VQ_ID_DEFLATE] = balloon_ack;
+ names[VIRTIO_BALLOON_VQ_ID_DEFLATE] = "deflate";
+ names[VIRTIO_BALLOON_VQ_ID_STATS] = NULL;
+ names[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = NULL;
- vb->inflate_vq = vqs[0];
- vb->deflate_vq = vqs[1];
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
- struct scatterlist sg;
- unsigned int num_stats;
- vb->stats_vq = vqs[2];
+ names[VIRTIO_BALLOON_VQ_ID_STATS] = "stats";
+ callbacks[VIRTIO_BALLOON_VQ_ID_STATS] = stats_request;
+ }
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ names[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = "free_page_vq";
+ callbacks[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = NULL;
+ }
+
+ ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_NUM,
+ vqs, callbacks, names, NULL, NULL);
+ if (ret)
+ return ret;
+ vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_ID_INFLATE];
+ vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_ID_DEFLATE];
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+ vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_ID_STATS];
/*
* Prime this virtqueue with one buffer so the hypervisor can
* use it to signal us later (it can't be broken yet!).
*/
- num_stats = update_balloon_stats(vb);
-
- sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
- if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
- < 0)
- BUG();
+ sg_init_one(&sg, vb->stats, sizeof(vb->stats));
+ ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
+ GFP_KERNEL);
+ if (ret) {
+ dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+ __func__);
+ return ret;
+ }
virtqueue_kick(vb->stats_vq);
}
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_ID_FREE_PAGE];
+
return 0;
}
+static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
+ uint32_t len = nr_pages << PAGE_SHIFT;
+ int ret;
+
+ if (!vb->report_free_page ||
+ unlikely(!virtio_has_feature(vb->vdev,
+ VIRTIO_BALLOON_F_FREE_PAGE_HINT)))
+ return false;
+
+ ret = add_one_sg(vb->free_page_vq, pfn, len);
+ if (unlikely(ret))
+ __virtio_clear_bit(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
+
+ return !ret;
+}
+
+static void report_free_page_func(struct work_struct *work)
+{
+ struct virtio_balloon *vb;
+ unsigned long flags;
+
+ vb = container_of(work, struct virtio_balloon, report_free_page_work);
+
+ /* Start by sending the obtained cmd id to the host with an outbuf */
+ send_cmd_id(vb, &vb->start_cmd_id);
+
+ /*
+ * Set start_cmd_id to VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID to
+ * indicate a new request can be queued.
+ */
+ spin_lock_irqsave(&vb->stop_update_lock, flags);
+ vb->start_cmd_id = cpu_to_virtio32(vb->vdev,
+ VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+ spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+
+ walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+
+ /* End by sending the stop id to the host with an outbuf */
+ send_cmd_id(vb, &vb->stop_cmd_id);
+}
+
#ifdef CONFIG_BALLOON_COMPACTION
/*
* virtballoon_migratepage - perform the balloon page migration on behalf of
@@ -537,6 +701,7 @@ static struct file_system_type balloon_fs = {
static int virtballoon_probe(struct virtio_device *vdev)
{
struct virtio_balloon *vb;
+ __u32 poison_val;
int err;
if (!vdev->config->get) {
@@ -566,18 +731,37 @@ static int virtballoon_probe(struct virtio_device *vdev)
if (err)
goto out_free_vb;
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ vb->balloon_wq = alloc_workqueue("balloon-wq",
+ WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+ if (!vb->balloon_wq) {
+ err = -ENOMEM;
+ goto out_del_vqs;
+ }
+ INIT_WORK(&vb->report_free_page_work, report_free_page_func);
+ vb->start_cmd_id = cpu_to_virtio32(vdev,
+ VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+ vb->stop_cmd_id = cpu_to_virtio32(vdev,
+ VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+ if(virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
+ poison_val = PAGE_POISON;
+ virtio_cwrite(vb->vdev, struct virtio_balloon_config,
+ poison_val, &poison_val);
+ }
+ }
+
vb->nb.notifier_call = virtballoon_oom_notify;
vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
err = register_oom_notifier(&vb->nb);
if (err < 0)
- goto out_del_vqs;
+ goto out_del_balloon_wq;
#ifdef CONFIG_BALLOON_COMPACTION
balloon_mnt = kern_mount(&balloon_fs);
if (IS_ERR(balloon_mnt)) {
err = PTR_ERR(balloon_mnt);
unregister_oom_notifier(&vb->nb);
- goto out_del_vqs;
+ goto out_del_balloon_wq;
}
vb->vb_dev_info.migratepage = virtballoon_migratepage;
@@ -587,7 +771,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
kern_unmount(balloon_mnt);
unregister_oom_notifier(&vb->nb);
vb->vb_dev_info.inode = NULL;
- goto out_del_vqs;
+ goto out_del_balloon_wq;
}
vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
#endif
@@ -598,6 +782,8 @@ static int virtballoon_probe(struct virtio_device *vdev)
virtballoon_changed(vdev);
return 0;
+out_del_balloon_wq:
+ destroy_workqueue(vb->balloon_wq);
out_del_vqs:
vdev->config->del_vqs(vdev);
out_free_vb:
@@ -630,6 +816,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
spin_unlock_irq(&vb->stop_update_lock);
cancel_work_sync(&vb->update_balloon_size_work);
cancel_work_sync(&vb->update_balloon_stats_work);
+ cancel_work_sync(&vb->report_free_page_work);
+ destroy_workqueue(vb->balloon_wq);
remove_common(vb);
#ifdef CONFIG_BALLOON_COMPACTION
@@ -674,6 +862,9 @@ static int virtballoon_restore(struct virtio_device *vdev)
static int virtballoon_validate(struct virtio_device *vdev)
{
+ if (!page_poisoning_enabled())
+ __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
+
__virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
return 0;
}
@@ -682,6 +873,8 @@ static unsigned int features[] = {
VIRTIO_BALLOON_F_MUST_TELL_HOST,
VIRTIO_BALLOON_F_STATS_VQ,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+ VIRTIO_BALLOON_F_FREE_PAGE_HINT,
+ VIRTIO_BALLOON_F_PAGE_POISON,
};
static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 343d7dd..3f97067 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -34,15 +34,22 @@
#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
#define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */
#define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
+#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
/* Size of a PFN in the balloon interface. */
#define VIRTIO_BALLOON_PFN_SHIFT 12
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
struct virtio_balloon_config {
/* Number of pages host wants Guest to give up. */
__u32 num_pages;
/* Number of pages we've actually got in balloon. */
__u32 actual;
+ /* Free page report command id, readonly by guest */
+ __u32 free_page_report_cmd_id;
+ /* Stores PAGE_POISON if page poisoning is in use */
+ __u32 poison_val;
};
#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
--
2.7.4
^ permalink raw reply related
* [PATCH v24 1/2] mm: support reporting free page blocks
From: Wei Wang @ 2018-01-24 10:42 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
mhocko, akpm
Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
nilal
In-Reply-To: <1516790562-37889-1-git-send-email-wei.w.wang@intel.com>
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.
One use example of this patch is to accelerate live migration by skipping
the transfer of free pages reported from the guest. A popular method used
by the hypervisor to track which part of memory is written during live
migration is to write-protect all the guest memory. So, those pages that
are reported as free pages but are written after the report function
returns will be captured by the hypervisor, and they will be added to the
next round of memory transfer.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michal Hocko <mhocko@kernel.org>
---
include/linux/mm.h | 6 ++++
mm/page_alloc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 97 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ea818ff..b3077dd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1938,6 +1938,12 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
extern void free_initmem(void);
+extern void walk_free_mem_block(void *opaque,
+ int min_order,
+ bool (*report_pfn_range)(void *opaque,
+ unsigned long pfn,
+ unsigned long num));
+
/*
* Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
* into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76c9688..705de22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4899,6 +4899,97 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
show_swap_cache_info();
}
+/*
+ * Walk through a free page list and report the found pfn range via the
+ * callback.
+ *
+ * Return false if the callback requests to stop reporting. Otherwise,
+ * return true.
+ */
+static bool walk_free_page_list(void *opaque,
+ struct zone *zone,
+ int order,
+ enum migratetype mt,
+ bool (*report_pfn_range)(void *,
+ unsigned long,
+ unsigned long))
+{
+ struct page *page;
+ struct list_head *list;
+ unsigned long pfn, flags;
+ bool ret;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ list = &zone->free_area[order].free_list[mt];
+ list_for_each_entry(page, list, lru) {
+ pfn = page_to_pfn(page);
+ ret = report_pfn_range(opaque, pfn, 1 << order);
+ if (!ret)
+ break;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return ret;
+}
+
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @report_pfn_range: the callback to report the pfn range of the free pages
+ *
+ * If the callback returns false, stop iterating the list of free page blocks.
+ * Otherwise, continue to report.
+ *
+ * Please note that there are no locking guarantees for the callback and
+ * that the reported pfn range might be freed or disappear after the
+ * callback returns so the caller has to be very careful how it is used.
+ *
+ * The callback itself must not sleep or perform any operations which would
+ * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
+ * or via any lock dependency. It is generally advisable to implement
+ * the callback as simple as possible and defer any heavy lifting to a
+ * different context.
+ *
+ * There is no guarantee that each free range will be reported only once
+ * during one walk_free_mem_block invocation.
+ *
+ * pfn_to_page on the given range is strongly discouraged and if there is
+ * an absolute need for that make sure to contact MM people to discuss
+ * potential problems.
+ *
+ * The function itself might sleep so it cannot be called from atomic
+ * contexts.
+ *
+ * In general low orders tend to be very volatile and so it makes more
+ * sense to query larger ones first for various optimizations which like
+ * ballooning etc... This will reduce the overhead as well.
+ */
+void walk_free_mem_block(void *opaque,
+ int min_order,
+ bool (*report_pfn_range)(void *opaque,
+ unsigned long pfn,
+ unsigned long num))
+{
+ struct zone *zone;
+ int order;
+ enum migratetype mt;
+ bool ret;
+
+ for_each_populated_zone(zone) {
+ for (order = MAX_ORDER - 1; order >= min_order; order--) {
+ for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+ ret = walk_free_page_list(opaque, zone,
+ order, mt,
+ report_pfn_range);
+ if (!ret)
+ return;
+ }
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(walk_free_mem_block);
+
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
--
2.7.4
^ permalink raw reply related
* [PATCH v24 0/2] Virtio-balloon: support free page reporting
From: Wei Wang @ 2018-01-24 10:42 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
mhocko, akpm
Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
nilal
This patch series is separated from the previous "Virtio-balloon
Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
implemented by this series enables the virtio-balloon driver to report
hints of guest free pages to the host. It can be used to accelerate live
migration of VMs. Here is an introduction of this usage:
Live migration needs to transfer the VM's memory from the source machine
to the destination round by round. For the 1st round, all the VM's memory
is transferred. From the 2nd round, only the pieces of memory that were
written by the guest (after the 1st round) are transferred. One method
that is popularly used by the hypervisor to track which part of memory is
written is to write-protect all the guest memory.
The second feature enables the optimization of the 1st round memory
transfer - the hypervisor can skip the transfer of guest free pages in the
1st round. It is not concerned that the memory pages are used after they
are given to the hypervisor as a hint of the free pages, because they will
be tracked by the hypervisor and transferred in the next round if they are
used and written.
ChangeLog:
v23->v24:
- change feature name VIRTIO_BALLOON_F_FREE_PAGE_VQ to
VIRTIO_BALLOON_F_FREE_PAGE_HINT
- kick when vq->num_free < half full, instead of "= half full"
- replace BUG_ON with bailing out
- check vb->balloon_wq in probe(), if null, bail out
- add a new feature bit for page poisoning
- solve the corner case that one cmd id being sent to host twice
v22->v23:
- change to kick the device when the vq is half-way full;
- open-code batch_free_page_sg into add_one_sg;
- change cmd_id from "uint32_t" to "__virtio32";
- reserver one entry in the vq for teh driver to send cmd_id, instead
of busywaiting for an available entry;
- add "stop_update" check before queue_work for prudence purpose for
now, will have a separate patch to discuss this flag check later;
- init_vqs: change to put some variables on stack to have simpler
implementation;
- add destroy_workqueue(vb->balloon_wq);
v21->v22:
- add_one_sg: some code and comment re-arrangement
- send_cmd_id: handle a cornercase
For previous ChangeLog, please reference
https://lwn.net/Articles/743660/
Wei Wang (2):
mm: support reporting free page blocks
virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
drivers/virtio/virtio_balloon.c | 265 +++++++++++++++++++++++++++++++-----
include/linux/mm.h | 6 +
include/uapi/linux/virtio_balloon.h | 7 +
mm/page_alloc.c | 91 +++++++++++++
4 files changed, 333 insertions(+), 36 deletions(-)
--
2.7.4
^ permalink raw reply
* Re: [PATCH v23 2/2] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
From: Wei Wang @ 2018-01-24 5:47 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization
In-Reply-To: <20180124064923-mutt-send-email-mst@kernel.org>
On 01/24/2018 01:01 PM, Michael S. Tsirkin wrote:
> On Wed, Jan 24, 2018 at 10:50:27AM +0800, Wei Wang wrote:
> This will not DTRT in all cases. It's quite possible
> that host does not need the kick when ring is half full but
> does need it later when ring is full.
> You can kick at ring half full as optimization but you absolutely
> still must kick on ring full. Something like:
>
> if (vq->num_free == virtqueue_get_vring_size(vq) / 2 ||
> vq->num_free <= 2)
Right. Would "if (vq->num_free < virtqueue_get_vring_size(vq) / 2" be
better?
Best,
Wei
^ permalink raw reply
* Re: [PATCH v23 2/2] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
From: Michael S. Tsirkin @ 2018-01-24 5:01 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization
In-Reply-To: <1516762227-36346-3-git-send-email-wei.w.wang@intel.com>
On Wed, Jan 24, 2018 at 10:50:27AM +0800, Wei Wang wrote:
> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
> support of reporting hints of guest free pages to host via virtio-balloon.
>
> Host requests the guest to report free pages by sending a new cmd
> id to the guest via the free_page_report_cmd_id configuration register.
>
> When the guest starts to report, the first element added to the free page
> vq is the cmd id given by host. When the guest finishes the reporting
> of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
> to the vq to tell host that the reporting is done. Host may also requests
> the guest to stop the reporting in advance by sending the stop cmd id to
> the guest via the configuration register.
>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> ---
> drivers/virtio/virtio_balloon.c | 228 ++++++++++++++++++++++++++++++------
> include/uapi/linux/virtio_balloon.h | 6 +
> 2 files changed, 201 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index a1fb52c..d038f4a 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -51,9 +51,21 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
> static struct vfsmount *balloon_mnt;
> #endif
>
> +/* The number of virtqueues supported by virtio-balloon */
> +#define VIRTIO_BALLOON_VQ_NUM 4
> +#define VIRTIO_BALLOON_VQ_ID_INFLATE 0
> +#define VIRTIO_BALLOON_VQ_ID_DEFLATE 1
> +#define VIRTIO_BALLOON_VQ_ID_STATS 2
> +#define VIRTIO_BALLOON_VQ_ID_FREE_PAGE 3
> +
> struct virtio_balloon {
> struct virtio_device *vdev;
> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +
> + /* Balloon's own wq for cpu-intensive work items */
> + struct workqueue_struct *balloon_wq;
> + /* The free page reporting work item submitted to the balloon wq */
> + struct work_struct report_free_page_work;
>
> /* The balloon servicing is delegated to a freezable workqueue. */
> struct work_struct update_balloon_stats_work;
> @@ -63,6 +75,13 @@ struct virtio_balloon {
> spinlock_t stop_update_lock;
> bool stop_update;
>
> + /* Start to report free pages */
> + bool report_free_page;
> + /* Stores the cmd id given by host to start the free page reporting */
> + __virtio32 start_cmd_id;
> + /* Stores STOP_ID as a sign to tell host that the reporting is done */
> + __virtio32 stop_cmd_id;
> +
> /* Waiting for host to ack the pages we released. */
> wait_queue_head_t acked;
>
> @@ -281,6 +300,56 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb)
> return idx;
> }
>
> +static void add_one_sg(struct virtqueue *vq, unsigned long pfn, uint32_t len)
> +{
> + struct scatterlist sg;
> + unsigned int unused;
> + int err;
> +
> + sg_init_table(&sg, 1);
> + sg_set_page(&sg, pfn_to_page(pfn), len, 0);
> +
> + /* Detach all the used buffers from the vq */
> + while (virtqueue_get_buf(vq, &unused))
> + ;
> +
> + /*
> + * Since this is an optimization feature, losing a couple of free
> + * pages to report isn't important. We simply return without adding
> + * the page if the vq is full.
> + * We are adding one entry each time, which essentially results in no
> + * memory allocation, so the GFP_KERNEL flag below can be ignored.
> + * There is always one entry reserved for the cmd id to use.
> + */
> + if (vq->num_free > 1) {
> + err = virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
> + /*
> + * This is expected to never fail, because there is always an
> + * entry available on the vq.
> + */
> + BUG_ON(err);
> + }
> +
> + if (vq->num_free == virtqueue_get_vring_size(vq) / 2)
> + virtqueue_kick(vq);
This will not DTRT in all cases. It's quite possible
that host does not need the kick when ring is half full but
does need it later when ring is full.
You can kick at ring half full as optimization but you absolutely
still must kick on ring full. Something like:
if (vq->num_free == virtqueue_get_vring_size(vq) / 2 ||
vq->num_free <= 2)
> +}
> +
> +static void send_cmd_id(struct virtqueue *vq, __virtio32 *cmd_id)
> +{
> + struct scatterlist sg;
> + int err;
> +
> + sg_init_one(&sg, cmd_id, sizeof(*cmd_id));
> +
> + err = virtqueue_add_outbuf(vq, &sg, 1, vq, GFP_KERNEL);
> + /*
> + * This is expected to never fail, because there is always an
> + * entry reserved for the cmd id.
> + */
> + BUG_ON(err);
> + virtqueue_kick(vq);
Actually add can fail if device becomes broken. I'd like to see us
bail out gracefully rather than BUG_ON.
> +}
> +
> /*
> * While most virtqueues communicate guest-initiated requests to the hypervisor,
> * the stats queue operates in reverse. The driver initializes the virtqueue
> @@ -316,17 +385,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
> virtqueue_kick(vq);
> }
>
> -static void virtballoon_changed(struct virtio_device *vdev)
> -{
> - struct virtio_balloon *vb = vdev->priv;
> - unsigned long flags;
> -
> - spin_lock_irqsave(&vb->stop_update_lock, flags);
> - if (!vb->stop_update)
> - queue_work(system_freezable_wq, &vb->update_balloon_size_work);
> - spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> -}
> -
> static inline s64 towards_target(struct virtio_balloon *vb)
> {
> s64 target;
> @@ -343,6 +401,42 @@ static inline s64 towards_target(struct virtio_balloon *vb)
> return target - vb->num_pages;
> }
>
> +static void virtballoon_changed(struct virtio_device *vdev)
> +{
> + struct virtio_balloon *vb = vdev->priv;
> + unsigned long flags;
> + __u32 cmd_id;
> + s64 diff = towards_target(vb);
> +
> + if (diff) {
> + spin_lock_irqsave(&vb->stop_update_lock, flags);
> + if (!vb->stop_update)
> + queue_work(system_freezable_wq,
> + &vb->update_balloon_size_work);
> + spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> + }
> +
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> + virtio_cread(vdev, struct virtio_balloon_config,
> + free_page_report_cmd_id, &cmd_id);
> + if (cmd_id == VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) {
> + vb->report_free_page = false;
> + } else if (cmd_id != virtio32_to_cpu(vdev, vb->start_cmd_id)) {
> + /*
> + * Host requests to start the reporting by sending a
> + * new cmd id.
> + */
> + vb->report_free_page = true;
> + vb->start_cmd_id = cpu_to_virtio32(vdev, cmd_id);
> + spin_lock_irqsave(&vb->stop_update_lock, flags);
> + if (!vb->stop_update)
> + queue_work(vb->balloon_wq,
> + &vb->report_free_page_work);
> + spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> + }
> + }
> +}
> +
> static void update_balloon_size(struct virtio_balloon *vb)
> {
> u32 actual = vb->num_pages;
> @@ -417,42 +511,91 @@ static void update_balloon_size_func(struct work_struct *work)
>
> static int init_vqs(struct virtio_balloon *vb)
> {
> - struct virtqueue *vqs[3];
> - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> - static const char * const names[] = { "inflate", "deflate", "stats" };
> - int err, nvqs;
> + struct virtqueue *vqs[VIRTIO_BALLOON_VQ_NUM];
> + vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_NUM];
> + const char *names[VIRTIO_BALLOON_VQ_NUM];
> + struct scatterlist sg;
> + int ret;
>
> /*
> - * We expect two virtqueues: inflate and deflate, and
> - * optionally stat.
> + * Inflateq and deflateq are used unconditionally. stats_vq and
> + * free_page_vq uses names[2] and names[3], respectively. The names[]
> + * will be NULL if the related feature is not enabled, which will
> + * cause no allocation for the corresponding virtqueue in find_vqs.
> */
> - nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> - err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> - if (err)
> - return err;
> + callbacks[VIRTIO_BALLOON_VQ_ID_INFLATE] = balloon_ack;
> + names[VIRTIO_BALLOON_VQ_ID_INFLATE] = "inflate";
> + callbacks[VIRTIO_BALLOON_VQ_ID_DEFLATE] = balloon_ack;
> + names[VIRTIO_BALLOON_VQ_ID_DEFLATE] = "deflate";
> + names[VIRTIO_BALLOON_VQ_ID_STATS] = NULL;
> + names[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = NULL;
>
> - vb->inflate_vq = vqs[0];
> - vb->deflate_vq = vqs[1];
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> - struct scatterlist sg;
> - unsigned int num_stats;
> - vb->stats_vq = vqs[2];
> + names[VIRTIO_BALLOON_VQ_ID_STATS] = "stats";
> + callbacks[VIRTIO_BALLOON_VQ_ID_STATS] = stats_request;
> + }
> +
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> + names[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = "free_page_vq";
> + callbacks[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = NULL;
> + }
> +
> + ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_NUM,
> + vqs, callbacks, names, NULL, NULL);
> + if (ret)
> + return ret;
>
> + vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_ID_INFLATE];
> + vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_ID_DEFLATE];
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> + vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_ID_STATS];
> /*
> * Prime this virtqueue with one buffer so the hypervisor can
> * use it to signal us later (it can't be broken yet!).
> */
> - num_stats = update_balloon_stats(vb);
> -
> - sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> - if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> - < 0)
> - BUG();
> + sg_init_one(&sg, vb->stats, sizeof(vb->stats));
> + ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
> + GFP_KERNEL);
> + if (ret) {
> + dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> + __func__);
> + return ret;
> + }
> virtqueue_kick(vb->stats_vq);
> }
> +
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> + vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_ID_FREE_PAGE];
> +
> return 0;
> }
>
> +static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
> + unsigned long nr_pages)
> +{
> + struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
> + uint32_t len = nr_pages << PAGE_SHIFT;
> +
> + if (!vb->report_free_page)
> + return false;
> +
> + add_one_sg(vb->free_page_vq, pfn, len);
> +
> + return true;
> +}
> +
> +static void report_free_page_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> +
> + vb = container_of(work, struct virtio_balloon, report_free_page_work);
> + /* Start by sending the obtained cmd id to the host with an outbuf */
> + send_cmd_id(vb->free_page_vq, &vb->start_cmd_id);
> + walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> + /* End by sending the stop id to the host with an outbuf */
> + send_cmd_id(vb->free_page_vq, &vb->stop_cmd_id);
> +}
> +
> #ifdef CONFIG_BALLOON_COMPACTION
> /*
> * virtballoon_migratepage - perform the balloon page migration on behalf of
> @@ -537,6 +680,7 @@ static struct file_system_type balloon_fs = {
> static int virtballoon_probe(struct virtio_device *vdev)
> {
> struct virtio_balloon *vb;
> + __u32 poison_val;
> int err;
>
> if (!vdev->config->get) {
> @@ -566,6 +710,21 @@ static int virtballoon_probe(struct virtio_device *vdev)
> if (err)
> goto out_free_vb;
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> + vb->balloon_wq = alloc_workqueue("balloon-wq",
> + WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
why don't you check the return value here?
> + INIT_WORK(&vb->report_free_page_work, report_free_page_func);
> + vb->stop_cmd_id = cpu_to_virtio32(vdev,
> + VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
> + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY) ||
> + !page_poisoning_enabled())
> + poison_val = 0;
> + else
> + poison_val = PAGE_POISON;
> + virtio_cwrite(vb->vdev, struct virtio_balloon_config,
> + poison_val, &poison_val);
I think we should differentiate between 0 and not poison.
How about we use a separate feature bit for the poison flag?
> + }
> +
> vb->nb.notifier_call = virtballoon_oom_notify;
> vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
> err = register_oom_notifier(&vb->nb);
> @@ -630,6 +789,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
> spin_unlock_irq(&vb->stop_update_lock);
> cancel_work_sync(&vb->update_balloon_size_work);
> cancel_work_sync(&vb->update_balloon_stats_work);
> + cancel_work_sync(&vb->report_free_page_work);
> + destroy_workqueue(vb->balloon_wq);
>
> remove_common(vb);
> #ifdef CONFIG_BALLOON_COMPACTION
> @@ -682,6 +843,7 @@ static unsigned int features[] = {
> VIRTIO_BALLOON_F_MUST_TELL_HOST,
> VIRTIO_BALLOON_F_STATS_VQ,
> VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> + VIRTIO_BALLOON_F_FREE_PAGE_VQ,
> };
>
> static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 343d7dd..5861876 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -34,15 +34,21 @@
> #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
> #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */
> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_VQ 3 /* VQ to report free pages */
I'd call it something like FREE_PAGE_HINT
>
> /* Size of a PFN in the balloon interface. */
> #define VIRTIO_BALLOON_PFN_SHIFT 12
>
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
> struct virtio_balloon_config {
> /* Number of pages host wants Guest to give up. */
> __u32 num_pages;
> /* Number of pages we've actually got in balloon. */
> __u32 actual;
> + /* Free page report command id, readonly by guest */
> + __u32 free_page_report_cmd_id;
> + /* Stores PAGE_POISON if page poisoning with sanity check is in use */
> + __u32 poison_val;
> };
>
> #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
> --
> 2.7.4
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v22 2/3] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
From: Michael S. Tsirkin @ 2018-01-24 4:31 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, nilal, virtualization
In-Reply-To: <5A67FB10.2050201@intel.com>
On Wed, Jan 24, 2018 at 11:18:40AM +0800, Wei Wang wrote:
> On 01/22/2018 07:25 PM, Wei Wang wrote:
> > On 01/19/2018 08:39 PM, Michael S. Tsirkin wrote:
> > > On Fri, Jan 19, 2018 at 11:44:21AM +0800, Wei Wang wrote:
> > > > On 01/18/2018 12:44 AM, Michael S. Tsirkin wrote:
> > > > > On Wed, Jan 17, 2018 at 01:10:11PM +0800, Wei Wang wrote:
> > > > >
> > > > > > + vb->start_cmd_id = cmd_id;
> > > > > > + queue_work(vb->balloon_wq, &vb->report_free_page_work);
> > > > > It seems that if a command was already queued (with a different id),
> > > > > this will result in new command id being sent to host twice,
> > > > > which will
> > > > > likely confuse the host.
> > > > I think that case won't happen, because
> > > > - the host sends a cmd id to the guest via the config, while the
> > > > guest acks
> > > > back the received cmd id via the virtqueue;
> > > > - the guest ack back a cmd id only when a new cmd id is received
> > > > from the
> > > > host, that is the above check:
> > > >
> > > > if (cmd_id != vb->start_cmd_id) { --> the driver only queues the
> > > > reporting work only when a new cmd id is received
> > > > /*
> > > > * Host requests to start the reporting
> > > > by sending a
> > > > * new cmd id.
> > > > */
> > > > WRITE_ONCE(vb->report_free_page, true);
> > > > vb->start_cmd_id = cmd_id;
> > > > queue_work(vb->balloon_wq,
> > > > &vb->report_free_page_work);
> > > > }
> > > >
> > > > So the same cmd id wouldn't queue the reporting work twice.
> > > >
> > > Like this:
> > >
> > > vb->start_cmd_id = cmd_id;
> > > queue_work(vb->balloon_wq, &vb->report_free_page_work);
> > >
> > > command id changes
> > >
> > > vb->start_cmd_id = cmd_id;
> > >
> > > work executes
> > >
> > > queue_work(vb->balloon_wq, &vb->report_free_page_work);
> > >
> > > work executes again
> > >
> >
> > If we think about the whole working flow, I think this case couldn't
> > happen:
> >
> > 1) device send cmd_id=1 to driver;
> > 2) driver receives cmd_id=1 in the config and acks cmd_id=1 to the
> > device via the vq;
> > 3) device revives cmd_id=1;
> > 4) device wants to stop the reporting by sending cmd_id=STOP;
> > 5) driver receives cmd_id=STOP from the config, and acks cmd_id=STOP to
> > the device via the vq;
> > 6) device sends cmd_id=2 to driver;
> > ...
> >
> > cmd_id=2 won't come after cmd_id=1, there will be a STOP cmd in between
> > them (STOP won't queue the work).
> >
> > How about defining the correct device behavior in the spec:
> > The device Should NOT send a second cmd id to the driver until a STOP
> > cmd ack for the previous cmd id has been received from the guest.
>
>
> Thanks for the comments, and I adopted most of them in the new posted v23
> patches. The above discussion is the one that I haven't included. If you
> could still see issues in the above analysis, please let me know. Thanks.
>
> Best,
> Wei
>
>
>
Yes, I think you should just fix the race in the driver.
--
MST
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v22 2/3] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
From: Michael S. Tsirkin @ 2018-01-24 4:29 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization
In-Reply-To: <5A65CA39.2070906@intel.com>
On Mon, Jan 22, 2018 at 07:25:45PM +0800, Wei Wang wrote:
> On 01/19/2018 08:39 PM, Michael S. Tsirkin wrote:
> > On Fri, Jan 19, 2018 at 11:44:21AM +0800, Wei Wang wrote:
> > > On 01/18/2018 12:44 AM, Michael S. Tsirkin wrote:
> > > > On Wed, Jan 17, 2018 at 01:10:11PM +0800, Wei Wang wrote:
> > > >
> > > > > + vb->start_cmd_id = cmd_id;
> > > > > + queue_work(vb->balloon_wq, &vb->report_free_page_work);
> > > > It seems that if a command was already queued (with a different id),
> > > > this will result in new command id being sent to host twice, which will
> > > > likely confuse the host.
> > > I think that case won't happen, because
> > > - the host sends a cmd id to the guest via the config, while the guest acks
> > > back the received cmd id via the virtqueue;
> > > - the guest ack back a cmd id only when a new cmd id is received from the
> > > host, that is the above check:
> > >
> > > if (cmd_id != vb->start_cmd_id) { --> the driver only queues the
> > > reporting work only when a new cmd id is received
> > > /*
> > > * Host requests to start the reporting by sending a
> > > * new cmd id.
> > > */
> > > WRITE_ONCE(vb->report_free_page, true);
> > > vb->start_cmd_id = cmd_id;
> > > queue_work(vb->balloon_wq,
> > > &vb->report_free_page_work);
> > > }
> > >
> > > So the same cmd id wouldn't queue the reporting work twice.
> > >
> > Like this:
> >
> > vb->start_cmd_id = cmd_id;
> > queue_work(vb->balloon_wq, &vb->report_free_page_work);
> >
> > command id changes
> >
> > vb->start_cmd_id = cmd_id;
> >
> > work executes
> >
> > queue_work(vb->balloon_wq, &vb->report_free_page_work);
> >
> > work executes again
> >
>
> If we think about the whole working flow, I think this case couldn't happen:
>
> 1) device send cmd_id=1 to driver;
> 2) driver receives cmd_id=1 in the config and acks cmd_id=1 to the device
> via the vq;
> 3) device revives cmd_id=1;
> 4) device wants to stop the reporting by sending cmd_id=STOP;
> 5) driver receives cmd_id=STOP from the config, and acks cmd_id=STOP to the
> device via the vq;
> 6) device sends cmd_id=2 to driver;
> ...
>
> cmd_id=2 won't come after cmd_id=1, there will be a STOP cmd in between them
> (STOP won't queue the work).
>
> How about defining the correct device behavior in the spec:
> The device Should NOT send a second cmd id to the driver until a STOP cmd
> ack for the previous cmd id has been received from the guest.
>
>
> Best,
> Wei
I think we should just fix races in the driver rather than introduce
random restrictions in the device.
If device wants to start a new sequence, it should be able to
do just that without a complicated back and forth with several
roundtrips through the driver.
--
MST
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v22 2/3] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
From: Wei Wang @ 2018-01-24 3:18 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, nilal, virtualization
In-Reply-To: <5A65CA39.2070906@intel.com>
On 01/22/2018 07:25 PM, Wei Wang wrote:
> On 01/19/2018 08:39 PM, Michael S. Tsirkin wrote:
>> On Fri, Jan 19, 2018 at 11:44:21AM +0800, Wei Wang wrote:
>>> On 01/18/2018 12:44 AM, Michael S. Tsirkin wrote:
>>>> On Wed, Jan 17, 2018 at 01:10:11PM +0800, Wei Wang wrote:
>>>>
>>>>> + vb->start_cmd_id = cmd_id;
>>>>> + queue_work(vb->balloon_wq, &vb->report_free_page_work);
>>>> It seems that if a command was already queued (with a different id),
>>>> this will result in new command id being sent to host twice, which
>>>> will
>>>> likely confuse the host.
>>> I think that case won't happen, because
>>> - the host sends a cmd id to the guest via the config, while the
>>> guest acks
>>> back the received cmd id via the virtqueue;
>>> - the guest ack back a cmd id only when a new cmd id is received
>>> from the
>>> host, that is the above check:
>>>
>>> if (cmd_id != vb->start_cmd_id) { --> the driver only queues the
>>> reporting work only when a new cmd id is received
>>> /*
>>> * Host requests to start the reporting by
>>> sending a
>>> * new cmd id.
>>> */
>>> WRITE_ONCE(vb->report_free_page, true);
>>> vb->start_cmd_id = cmd_id;
>>> queue_work(vb->balloon_wq,
>>> &vb->report_free_page_work);
>>> }
>>>
>>> So the same cmd id wouldn't queue the reporting work twice.
>>>
>> Like this:
>>
>> vb->start_cmd_id = cmd_id;
>> queue_work(vb->balloon_wq, &vb->report_free_page_work);
>>
>> command id changes
>>
>> vb->start_cmd_id = cmd_id;
>>
>> work executes
>>
>> queue_work(vb->balloon_wq, &vb->report_free_page_work);
>>
>> work executes again
>>
>
> If we think about the whole working flow, I think this case couldn't
> happen:
>
> 1) device send cmd_id=1 to driver;
> 2) driver receives cmd_id=1 in the config and acks cmd_id=1 to the
> device via the vq;
> 3) device revives cmd_id=1;
> 4) device wants to stop the reporting by sending cmd_id=STOP;
> 5) driver receives cmd_id=STOP from the config, and acks cmd_id=STOP
> to the device via the vq;
> 6) device sends cmd_id=2 to driver;
> ...
>
> cmd_id=2 won't come after cmd_id=1, there will be a STOP cmd in
> between them (STOP won't queue the work).
>
> How about defining the correct device behavior in the spec:
> The device Should NOT send a second cmd id to the driver until a STOP
> cmd ack for the previous cmd id has been received from the guest.
Thanks for the comments, and I adopted most of them in the new posted
v23 patches. The above discussion is the one that I haven't included. If
you could still see issues in the above analysis, please let me know.
Thanks.
Best,
Wei
^ permalink raw reply
* [PATCH v23 2/2] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
From: Wei Wang @ 2018-01-24 2:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
mhocko, akpm
Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
nilal
In-Reply-To: <1516762227-36346-1-git-send-email-wei.w.wang@intel.com>
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to host via virtio-balloon.
Host requests the guest to report free pages by sending a new cmd
id to the guest via the free_page_report_cmd_id configuration register.
When the guest starts to report, the first element added to the free page
vq is the cmd id given by host. When the guest finishes the reporting
of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
to the vq to tell host that the reporting is done. Host may also requests
the guest to stop the reporting in advance by sending the stop cmd id to
the guest via the configuration register.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
---
drivers/virtio/virtio_balloon.c | 228 ++++++++++++++++++++++++++++++------
include/uapi/linux/virtio_balloon.h | 6 +
2 files changed, 201 insertions(+), 33 deletions(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index a1fb52c..d038f4a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -51,9 +51,21 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
static struct vfsmount *balloon_mnt;
#endif
+/* The number of virtqueues supported by virtio-balloon */
+#define VIRTIO_BALLOON_VQ_NUM 4
+#define VIRTIO_BALLOON_VQ_ID_INFLATE 0
+#define VIRTIO_BALLOON_VQ_ID_DEFLATE 1
+#define VIRTIO_BALLOON_VQ_ID_STATS 2
+#define VIRTIO_BALLOON_VQ_ID_FREE_PAGE 3
+
struct virtio_balloon {
struct virtio_device *vdev;
- struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+ struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+ /* Balloon's own wq for cpu-intensive work items */
+ struct workqueue_struct *balloon_wq;
+ /* The free page reporting work item submitted to the balloon wq */
+ struct work_struct report_free_page_work;
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -63,6 +75,13 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
+ /* Start to report free pages */
+ bool report_free_page;
+ /* Stores the cmd id given by host to start the free page reporting */
+ __virtio32 start_cmd_id;
+ /* Stores STOP_ID as a sign to tell host that the reporting is done */
+ __virtio32 stop_cmd_id;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
@@ -281,6 +300,56 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb)
return idx;
}
+static void add_one_sg(struct virtqueue *vq, unsigned long pfn, uint32_t len)
+{
+ struct scatterlist sg;
+ unsigned int unused;
+ int err;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, pfn_to_page(pfn), len, 0);
+
+ /* Detach all the used buffers from the vq */
+ while (virtqueue_get_buf(vq, &unused))
+ ;
+
+ /*
+ * Since this is an optimization feature, losing a couple of free
+ * pages to report isn't important. We simply return without adding
+ * the page if the vq is full.
+ * We are adding one entry each time, which essentially results in no
+ * memory allocation, so the GFP_KERNEL flag below can be ignored.
+ * There is always one entry reserved for the cmd id to use.
+ */
+ if (vq->num_free > 1) {
+ err = virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
+ /*
+ * This is expected to never fail, because there is always an
+ * entry available on the vq.
+ */
+ BUG_ON(err);
+ }
+
+ if (vq->num_free == virtqueue_get_vring_size(vq) / 2)
+ virtqueue_kick(vq);
+}
+
+static void send_cmd_id(struct virtqueue *vq, __virtio32 *cmd_id)
+{
+ struct scatterlist sg;
+ int err;
+
+ sg_init_one(&sg, cmd_id, sizeof(*cmd_id));
+
+ err = virtqueue_add_outbuf(vq, &sg, 1, vq, GFP_KERNEL);
+ /*
+ * This is expected to never fail, because there is always an
+ * entry reserved for the cmd id.
+ */
+ BUG_ON(err);
+ virtqueue_kick(vq);
+}
+
/*
* While most virtqueues communicate guest-initiated requests to the hypervisor,
* the stats queue operates in reverse. The driver initializes the virtqueue
@@ -316,17 +385,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
}
-static void virtballoon_changed(struct virtio_device *vdev)
-{
- struct virtio_balloon *vb = vdev->priv;
- unsigned long flags;
-
- spin_lock_irqsave(&vb->stop_update_lock, flags);
- if (!vb->stop_update)
- queue_work(system_freezable_wq, &vb->update_balloon_size_work);
- spin_unlock_irqrestore(&vb->stop_update_lock, flags);
-}
-
static inline s64 towards_target(struct virtio_balloon *vb)
{
s64 target;
@@ -343,6 +401,42 @@ static inline s64 towards_target(struct virtio_balloon *vb)
return target - vb->num_pages;
}
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+ struct virtio_balloon *vb = vdev->priv;
+ unsigned long flags;
+ __u32 cmd_id;
+ s64 diff = towards_target(vb);
+
+ if (diff) {
+ spin_lock_irqsave(&vb->stop_update_lock, flags);
+ if (!vb->stop_update)
+ queue_work(system_freezable_wq,
+ &vb->update_balloon_size_work);
+ spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+ }
+
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+ virtio_cread(vdev, struct virtio_balloon_config,
+ free_page_report_cmd_id, &cmd_id);
+ if (cmd_id == VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) {
+ vb->report_free_page = false;
+ } else if (cmd_id != virtio32_to_cpu(vdev, vb->start_cmd_id)) {
+ /*
+ * Host requests to start the reporting by sending a
+ * new cmd id.
+ */
+ vb->report_free_page = true;
+ vb->start_cmd_id = cpu_to_virtio32(vdev, cmd_id);
+ spin_lock_irqsave(&vb->stop_update_lock, flags);
+ if (!vb->stop_update)
+ queue_work(vb->balloon_wq,
+ &vb->report_free_page_work);
+ spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+ }
+ }
+}
+
static void update_balloon_size(struct virtio_balloon *vb)
{
u32 actual = vb->num_pages;
@@ -417,42 +511,91 @@ static void update_balloon_size_func(struct work_struct *work)
static int init_vqs(struct virtio_balloon *vb)
{
- struct virtqueue *vqs[3];
- vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
- static const char * const names[] = { "inflate", "deflate", "stats" };
- int err, nvqs;
+ struct virtqueue *vqs[VIRTIO_BALLOON_VQ_NUM];
+ vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_NUM];
+ const char *names[VIRTIO_BALLOON_VQ_NUM];
+ struct scatterlist sg;
+ int ret;
/*
- * We expect two virtqueues: inflate and deflate, and
- * optionally stat.
+ * Inflateq and deflateq are used unconditionally. stats_vq and
+ * free_page_vq uses names[2] and names[3], respectively. The names[]
+ * will be NULL if the related feature is not enabled, which will
+ * cause no allocation for the corresponding virtqueue in find_vqs.
*/
- nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
- err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
- if (err)
- return err;
+ callbacks[VIRTIO_BALLOON_VQ_ID_INFLATE] = balloon_ack;
+ names[VIRTIO_BALLOON_VQ_ID_INFLATE] = "inflate";
+ callbacks[VIRTIO_BALLOON_VQ_ID_DEFLATE] = balloon_ack;
+ names[VIRTIO_BALLOON_VQ_ID_DEFLATE] = "deflate";
+ names[VIRTIO_BALLOON_VQ_ID_STATS] = NULL;
+ names[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = NULL;
- vb->inflate_vq = vqs[0];
- vb->deflate_vq = vqs[1];
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
- struct scatterlist sg;
- unsigned int num_stats;
- vb->stats_vq = vqs[2];
+ names[VIRTIO_BALLOON_VQ_ID_STATS] = "stats";
+ callbacks[VIRTIO_BALLOON_VQ_ID_STATS] = stats_request;
+ }
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+ names[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = "free_page_vq";
+ callbacks[VIRTIO_BALLOON_VQ_ID_FREE_PAGE] = NULL;
+ }
+
+ ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_NUM,
+ vqs, callbacks, names, NULL, NULL);
+ if (ret)
+ return ret;
+ vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_ID_INFLATE];
+ vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_ID_DEFLATE];
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+ vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_ID_STATS];
/*
* Prime this virtqueue with one buffer so the hypervisor can
* use it to signal us later (it can't be broken yet!).
*/
- num_stats = update_balloon_stats(vb);
-
- sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
- if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
- < 0)
- BUG();
+ sg_init_one(&sg, vb->stats, sizeof(vb->stats));
+ ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
+ GFP_KERNEL);
+ if (ret) {
+ dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+ __func__);
+ return ret;
+ }
virtqueue_kick(vb->stats_vq);
}
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
+ vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_ID_FREE_PAGE];
+
return 0;
}
+static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
+ uint32_t len = nr_pages << PAGE_SHIFT;
+
+ if (!vb->report_free_page)
+ return false;
+
+ add_one_sg(vb->free_page_vq, pfn, len);
+
+ return true;
+}
+
+static void report_free_page_func(struct work_struct *work)
+{
+ struct virtio_balloon *vb;
+
+ vb = container_of(work, struct virtio_balloon, report_free_page_work);
+ /* Start by sending the obtained cmd id to the host with an outbuf */
+ send_cmd_id(vb->free_page_vq, &vb->start_cmd_id);
+ walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+ /* End by sending the stop id to the host with an outbuf */
+ send_cmd_id(vb->free_page_vq, &vb->stop_cmd_id);
+}
+
#ifdef CONFIG_BALLOON_COMPACTION
/*
* virtballoon_migratepage - perform the balloon page migration on behalf of
@@ -537,6 +680,7 @@ static struct file_system_type balloon_fs = {
static int virtballoon_probe(struct virtio_device *vdev)
{
struct virtio_balloon *vb;
+ __u32 poison_val;
int err;
if (!vdev->config->get) {
@@ -566,6 +710,21 @@ static int virtballoon_probe(struct virtio_device *vdev)
if (err)
goto out_free_vb;
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+ vb->balloon_wq = alloc_workqueue("balloon-wq",
+ WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+ INIT_WORK(&vb->report_free_page_work, report_free_page_func);
+ vb->stop_cmd_id = cpu_to_virtio32(vdev,
+ VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY) ||
+ !page_poisoning_enabled())
+ poison_val = 0;
+ else
+ poison_val = PAGE_POISON;
+ virtio_cwrite(vb->vdev, struct virtio_balloon_config,
+ poison_val, &poison_val);
+ }
+
vb->nb.notifier_call = virtballoon_oom_notify;
vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
err = register_oom_notifier(&vb->nb);
@@ -630,6 +789,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
spin_unlock_irq(&vb->stop_update_lock);
cancel_work_sync(&vb->update_balloon_size_work);
cancel_work_sync(&vb->update_balloon_stats_work);
+ cancel_work_sync(&vb->report_free_page_work);
+ destroy_workqueue(vb->balloon_wq);
remove_common(vb);
#ifdef CONFIG_BALLOON_COMPACTION
@@ -682,6 +843,7 @@ static unsigned int features[] = {
VIRTIO_BALLOON_F_MUST_TELL_HOST,
VIRTIO_BALLOON_F_STATS_VQ,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+ VIRTIO_BALLOON_F_FREE_PAGE_VQ,
};
static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 343d7dd..5861876 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -34,15 +34,21 @@
#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
#define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */
#define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_FREE_PAGE_VQ 3 /* VQ to report free pages */
/* Size of a PFN in the balloon interface. */
#define VIRTIO_BALLOON_PFN_SHIFT 12
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
struct virtio_balloon_config {
/* Number of pages host wants Guest to give up. */
__u32 num_pages;
/* Number of pages we've actually got in balloon. */
__u32 actual;
+ /* Free page report command id, readonly by guest */
+ __u32 free_page_report_cmd_id;
+ /* Stores PAGE_POISON if page poisoning with sanity check is in use */
+ __u32 poison_val;
};
#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
--
2.7.4
^ permalink raw reply related
* [PATCH v23 1/2] mm: support reporting free page blocks
From: Wei Wang @ 2018-01-24 2:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
mhocko, akpm
Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
nilal
In-Reply-To: <1516762227-36346-1-git-send-email-wei.w.wang@intel.com>
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.
One use example of this patch is to accelerate live migration by skipping
the transfer of free pages reported from the guest. A popular method used
by the hypervisor to track which part of memory is written during live
migration is to write-protect all the guest memory. So, those pages that
are reported as free pages but are written after the report function
returns will be captured by the hypervisor, and they will be added to the
next round of memory transfer.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michal Hocko <mhocko@kernel.org>
---
include/linux/mm.h | 6 ++++
mm/page_alloc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 97 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ea818ff..b3077dd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1938,6 +1938,12 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
extern void free_initmem(void);
+extern void walk_free_mem_block(void *opaque,
+ int min_order,
+ bool (*report_pfn_range)(void *opaque,
+ unsigned long pfn,
+ unsigned long num));
+
/*
* Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
* into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76c9688..705de22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4899,6 +4899,97 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
show_swap_cache_info();
}
+/*
+ * Walk through a free page list and report the found pfn range via the
+ * callback.
+ *
+ * Return false if the callback requests to stop reporting. Otherwise,
+ * return true.
+ */
+static bool walk_free_page_list(void *opaque,
+ struct zone *zone,
+ int order,
+ enum migratetype mt,
+ bool (*report_pfn_range)(void *,
+ unsigned long,
+ unsigned long))
+{
+ struct page *page;
+ struct list_head *list;
+ unsigned long pfn, flags;
+ bool ret;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ list = &zone->free_area[order].free_list[mt];
+ list_for_each_entry(page, list, lru) {
+ pfn = page_to_pfn(page);
+ ret = report_pfn_range(opaque, pfn, 1 << order);
+ if (!ret)
+ break;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return ret;
+}
+
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @report_pfn_range: the callback to report the pfn range of the free pages
+ *
+ * If the callback returns false, stop iterating the list of free page blocks.
+ * Otherwise, continue to report.
+ *
+ * Please note that there are no locking guarantees for the callback and
+ * that the reported pfn range might be freed or disappear after the
+ * callback returns so the caller has to be very careful how it is used.
+ *
+ * The callback itself must not sleep or perform any operations which would
+ * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
+ * or via any lock dependency. It is generally advisable to implement
+ * the callback as simple as possible and defer any heavy lifting to a
+ * different context.
+ *
+ * There is no guarantee that each free range will be reported only once
+ * during one walk_free_mem_block invocation.
+ *
+ * pfn_to_page on the given range is strongly discouraged and if there is
+ * an absolute need for that make sure to contact MM people to discuss
+ * potential problems.
+ *
+ * The function itself might sleep so it cannot be called from atomic
+ * contexts.
+ *
+ * In general low orders tend to be very volatile and so it makes more
+ * sense to query larger ones first for various optimizations which like
+ * ballooning etc... This will reduce the overhead as well.
+ */
+void walk_free_mem_block(void *opaque,
+ int min_order,
+ bool (*report_pfn_range)(void *opaque,
+ unsigned long pfn,
+ unsigned long num))
+{
+ struct zone *zone;
+ int order;
+ enum migratetype mt;
+ bool ret;
+
+ for_each_populated_zone(zone) {
+ for (order = MAX_ORDER - 1; order >= min_order; order--) {
+ for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+ ret = walk_free_page_list(opaque, zone,
+ order, mt,
+ report_pfn_range);
+ if (!ret)
+ return;
+ }
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(walk_free_mem_block);
+
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
--
2.7.4
^ permalink raw reply related
* [PATCH v23 0/2] Virtio-balloon: support free page reporting
From: Wei Wang @ 2018-01-24 2:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
mhocko, akpm
Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
nilal
This patch series is separated from the previous "Virtio-balloon
Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_VQ,
implemented by this series enables the virtio-balloon driver to report
hints of guest free pages to the host. It can be used to accelerate live
migration of VMs. Here is an introduction of this usage:
Live migration needs to transfer the VM's memory from the source machine
to the destination round by round. For the 1st round, all the VM's memory
is transferred. From the 2nd round, only the pieces of memory that were
written by the guest (after the 1st round) are transferred. One method
that is popularly used by the hypervisor to track which part of memory is
written is to write-protect all the guest memory.
The second feature enables the optimization of the 1st round memory
transfer - the hypervisor can skip the transfer of guest free pages in the
1st round. It is not concerned that the memory pages are used after they
are given to the hypervisor as a hint of the free pages, because they will
be tracked by the hypervisor and transferred in the next round if they are
used and written.
ChangeLog:
v22->v23:
- change to kick the device when the vq is half-way full;
- open-code batch_free_page_sg into add_one_sg;
- change cmd_id from "uint32_t" to "__virtio32";
- reserver one entry in the vq for teh driver to send cmd_id, instead
of busywaiting for an available entry;
- add "stop_update" check before queue_work for prudence purpose for
now, will have a separate patch to discuss this flag check later;
- init_vqs: change to put some variables on stack to have simpler
implementation;
- add destroy_workqueue(vb->balloon_wq);
v21->v22:
- add_one_sg: some code and comment re-arrangement
- send_cmd_id: handle a cornercase
For previous ChangeLog, please reference
https://lwn.net/Articles/743660/
Wei Wang (2):
mm: support reporting free page blocks
virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ
drivers/virtio/virtio_balloon.c | 228 ++++++++++++++++++++++++++++++------
include/linux/mm.h | 6 +
include/uapi/linux/virtio_balloon.h | 6 +
mm/page_alloc.c | 91 ++++++++++++++
4 files changed, 298 insertions(+), 33 deletions(-)
--
2.7.4
^ permalink raw reply
* Re: [virtio-dev] [RFC PATCH net-next v2 1/2] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
From: Michael S. Tsirkin @ 2018-01-23 23:01 UTC (permalink / raw)
To: Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski,
Samudrala, Sridhar, virtualization, achiad shochat,
Achiad Shochat, Netdev, David Miller
In-Reply-To: <CAKgT0UfaKG7=E5ciarE3fSUp-Ke-oDNodGe_baEAGbZqWV4Qug@mail.gmail.com>
On Mon, Jan 22, 2018 at 07:36:32PM -0800, Alexander Duyck wrote:
> On Mon, Jan 22, 2018 at 6:04 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jan 22, 2018 at 05:34:37PM -0800, Samudrala, Sridhar wrote:
> >> On 1/22/2018 4:05 PM, Michael S. Tsirkin wrote:
> >> > On Mon, Jan 22, 2018 at 03:27:40PM -0800, Samudrala, Sridhar wrote:
> >> > > > > You could probably
> >> > > > > even handle the Tx queue selection via a simple eBPF program and map
> >> > > > > since the input for whatever is used to select Tx should be pretty
> >> > > > > simple, destination MAC, source NUMA node, etc, and the data-set
> >> > > > > shouldn't be too large.
> >> > > > That sounds interesting. A separate device might make this kind of setup
> >> > > > a bit easier. Sridhar, did you look into creating a separate device for
> >> > > > the virtual bond device at all? It does not have to be in a separate
> >> > > > module, that kind of refactoring can come later, but once we commit to
> >> > > > using the same single device as virtio, we can't change that.
> >> > > No. I haven't looked into creating a separate device. If we are going to
> >> > > create a new
> >> > > device, i guess it has to be of a new device type with its own driver.
> >> > Well not necessarily - just a separate netdev ops.
> >> > Kind of like e.g. vlans share a driver with the main driver.
> >>
> >> Not sure what you meant by vlans sharing a driver with the main driver.
> >> IIUC, vlans are supported via 802.1q driver and creates a netdev of type
> >> 'vlan'
> >> with vlan_netdev_ops
> >
> > But nothing prevents a single module from registering
> > multiple ops.
>
> Just to clarify, it seems like what you are suggesting is just adding
> the "master" as a separate set of netdev ops or netdevice and to have
> virtio spawn two network devices, one slave and one master, if the
> BACKUP bit is set. Do I have that right?
Yes, that was my idea.
> I am good with the code still living in the virtio driver and
> consolidation with other similar implementations and further
> improvement could probably happen later as part of some refactor.
>
> Thanks.
>
> - Alex
^ permalink raw reply
* Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Michael S. Tsirkin @ 2018-01-23 22:58 UTC (permalink / raw)
To: Siwei Liu
Cc: Alexander Duyck, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
virtualization, Netdev, David Miller
In-Reply-To: <CADGSJ209vrdjnEAweLg84FwN3sf+1W+APmvB=NjFkMNAhsOyrg@mail.gmail.com>
On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:
> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
> >> First off, as mentioned in another thread, the model of stacking up
> >> virt-bond functionality over virtio seems a wrong direction to me.
> >> Essentially the migration process would need to carry over all guest
> >> side configurations previously done on the VF/PT and get them moved to
> >> the new device being it virtio or VF/PT.
> >
> > I might be wrong but I don't see why we should worry about this usecase.
> > Whoever has a bond configured already has working config for migration.
> > We are trying to help people who don't, not convert existig users.
>
> That has been placed in the view of cloud providers that the imported
> images from the store must be able to run unmodified thus no
> additional setup script is allowed (just as Stephen mentioned in
> another mail). Cloud users don't care about live migration themselves
> but the providers are required to implement such automation mechanism
> to make this process transparent if at all possible. The user does not
> care about the device underneath being VF or not, but they do care
> about consistency all across and the resulting performance
> acceleration in making VF the prefered datapath. It is not quite
> peculiar user cases but IMHO *any* approach proposed for live
> migration should be able to persist the state including network config
> e.g. as simple as MTU. Actually this requirement has nothing to do
> with virtio but our target users are live migration agnostic, being it
> tracking DMA through dirty pages, using virtio as the helper, or
> whatsoever, the goal of persisting configs across remains same.
So the patching being discussed here will mostly do exactly that if your
original config was simply a single virtio net device.
What kind of configs do your users have right now?
> >
> >> Without the help of a new
> >> upper layer bond driver that enslaves virtio and VF/PT devices
> >> underneath, virtio will be overloaded with too much specifics being a
> >> VF/PT backup in the future.
> >
> > So this paragraph already includes at least two conflicting
> > proposals. On the one hand you want a separate device for
> > the virtual bond, on the other you are saying a separate
> > driver.
>
> Just to be crystal clear: separate virtual bond device (netdev ops,
> not necessarily bus device) for VM migration specifically with a
> separate driver.
Okay, but note that any config someone had on a virtio device won't
propagate to that bond.
> >
> > Further, the reason to have a separate *driver* was that
> > some people wanted to share code with netvsc - and that
> > one does not create a separate device, which you can't
> > change without breaking existing configs.
>
> I'm not sure I understand this statement. netvsc is already another
> netdev being created than the enslaved VF netdev, why it bothers?
Because it shipped, so userspace ABI is frozen. You can't really add a
netdevice and enslave an existing one without a risk of breaking some
userspace configs.
> In
> the Azure case, the stock image to be imported does not bind to a
> specific driver but only MAC address.
I'll let netvsc developers decide this, on the surface I don't think
it's reasonable to assume everyone only binds to a MAC.
> And people just deal with the
> new virt-bond netdev rather than the underlying virtio and VF. And
> both these two underlying netdevs should be made invisible to prevent
> userspace script from getting them misconfigured IMHO.
>
> A separate driver was for code sharing for sure, only just netvsc but
> could be other para-virtual devices floating around: any PV can serve
> as the side channel and the backup path for VF/PT. Once we get the new
> driver working atop virtio we may define ops and/or protocol needed to
> talk to various other PV frontend that may implement the side channel
> of its own for datapath switching (e.g. virtio is one of them, Xen PV
> frontend can be another). I just don't like to limit the function to
> virtio only and we have to duplicate code then it starts to scatter
> around all over the places.
>
> I understand right now we start it as simple so it may just be fine
> that the initial development activities center around virtio. However,
> from cloud provider/vendor perspective I don't see the proposed scheme
> limits to virtio only. Any other PV driver which has the plan to
> support the same scheme can benefit. The point is that we shouldn't be
> limiting the scheme to virtio specifics so early which is hard to have
> it promoted to a common driver once we get there.
The whole idea has been floating around for years. It would always
get being drowned in this kind of "lets try to cover all use-cases"
discussions, and never make progress.
So let's see some working code merged. If it works fine for virtio
and turns out to be a good fit for netvsc, we can share code.
> >
> > So some people want a fully userspace-configurable switchdev, and that
> > already exists at some level, and maybe it makes sense to add more
> > features for performance.
> >
> > But the point was that some host configurations are very simple,
> > and it probably makes sense to pass this information to the guest
> > and have guest act on it directly. Let's not conflate the two.
>
> It may be fine to push some of the configurations from host but that
> perhaps doesn't cover all the cases: how is it possible for the host
> to save all network states and configs done by the guest before
> migration. Some of the configs might come from future guest which is
> unknown to host. Anyhow the bottom line is that the guest must be able
> to act on those configuration request changes automatically without
> involving users intervention.
>
> Regards,
> -Siwei
All use-cases are *already* covered by existing kernel APIs. Just use a
bond, or a bridge, or whatever. It's just that they are so generic and
hard to use, that userspace to do it never surfaced.
So I am interested in some code that handles some simple use-cases
in the kernel, with a simple kernel API.
> >
> > --
> > MST
^ permalink raw reply
* Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Siwei Liu @ 2018-01-23 20:24 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Alexander Duyck, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
virtualization, Netdev, David Miller
In-Reply-To: <20180122233205-mutt-send-email-mst@kernel.org>
On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>> First off, as mentioned in another thread, the model of stacking up
>> virt-bond functionality over virtio seems a wrong direction to me.
>> Essentially the migration process would need to carry over all guest
>> side configurations previously done on the VF/PT and get them moved to
>> the new device being it virtio or VF/PT.
>
> I might be wrong but I don't see why we should worry about this usecase.
> Whoever has a bond configured already has working config for migration.
> We are trying to help people who don't, not convert existig users.
That has been placed in the view of cloud providers that the imported
images from the store must be able to run unmodified thus no
additional setup script is allowed (just as Stephen mentioned in
another mail). Cloud users don't care about live migration themselves
but the providers are required to implement such automation mechanism
to make this process transparent if at all possible. The user does not
care about the device underneath being VF or not, but they do care
about consistency all across and the resulting performance
acceleration in making VF the prefered datapath. It is not quite
peculiar user cases but IMHO *any* approach proposed for live
migration should be able to persist the state including network config
e.g. as simple as MTU. Actually this requirement has nothing to do
with virtio but our target users are live migration agnostic, being it
tracking DMA through dirty pages, using virtio as the helper, or
whatsoever, the goal of persisting configs across remains same.
>
>> Without the help of a new
>> upper layer bond driver that enslaves virtio and VF/PT devices
>> underneath, virtio will be overloaded with too much specifics being a
>> VF/PT backup in the future.
>
> So this paragraph already includes at least two conflicting
> proposals. On the one hand you want a separate device for
> the virtual bond, on the other you are saying a separate
> driver.
Just to be crystal clear: separate virtual bond device (netdev ops,
not necessarily bus device) for VM migration specifically with a
separate driver.
>
> Further, the reason to have a separate *driver* was that
> some people wanted to share code with netvsc - and that
> one does not create a separate device, which you can't
> change without breaking existing configs.
I'm not sure I understand this statement. netvsc is already another
netdev being created than the enslaved VF netdev, why it bothers? In
the Azure case, the stock image to be imported does not bind to a
specific driver but only MAC address. And people just deal with the
new virt-bond netdev rather than the underlying virtio and VF. And
both these two underlying netdevs should be made invisible to prevent
userspace script from getting them misconfigured IMHO.
A separate driver was for code sharing for sure, only just netvsc but
could be other para-virtual devices floating around: any PV can serve
as the side channel and the backup path for VF/PT. Once we get the new
driver working atop virtio we may define ops and/or protocol needed to
talk to various other PV frontend that may implement the side channel
of its own for datapath switching (e.g. virtio is one of them, Xen PV
frontend can be another). I just don't like to limit the function to
virtio only and we have to duplicate code then it starts to scatter
around all over the places.
I understand right now we start it as simple so it may just be fine
that the initial development activities center around virtio. However,
from cloud provider/vendor perspective I don't see the proposed scheme
limits to virtio only. Any other PV driver which has the plan to
support the same scheme can benefit. The point is that we shouldn't be
limiting the scheme to virtio specifics so early which is hard to have
it promoted to a common driver once we get there.
>
> So some people want a fully userspace-configurable switchdev, and that
> already exists at some level, and maybe it makes sense to add more
> features for performance.
>
> But the point was that some host configurations are very simple,
> and it probably makes sense to pass this information to the guest
> and have guest act on it directly. Let's not conflate the two.
It may be fine to push some of the configurations from host but that
perhaps doesn't cover all the cases: how is it possible for the host
to save all network states and configs done by the guest before
migration. Some of the configs might come from future guest which is
unknown to host. Anyhow the bottom line is that the guest must be able
to act on those configuration request changes automatically without
involving users intervention.
Regards,
-Siwei
>
> --
> MST
^ permalink raw reply
* Re: [virtio-dev] [RFC PATCH net-next v2 1/2] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
From: Jakub Kicinski @ 2018-01-23 19:21 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Duyck, Alexander H, virtio-dev, achiad shochat,
Samudrala, Sridhar, Alexander Duyck, virtualization,
Achiad Shochat, Netdev, David Miller
In-Reply-To: <20180123031348-mutt-send-email-mst@kernel.org>
On Tue, 23 Jan 2018 03:23:57 +0200, Michael S. Tsirkin wrote:
> > > > b. next-gen silicon may be able to disguise as virtio device, and the
> > > > loop check in virtio driver will prevent the legitimate bond it such
> > > > case. AFAIU that's one of the goals of next-gen virtio spec as well.
> > >
> > > In fact we have a virtio feature bit for the fallback.
> > > So this part does not depend on how software in guest works
> > > and does not need software solutions.
> >
> > You mean in the new spec? Nice. Still I think people will try to
> > implement the old one too given sufficiently capable HW.
>
> Existing HW won't have the BACKUP feature so the new functionality
> won't be activated. So no problem I think.
I meant code that compares of netdev_ops, e.g.:
+ /* Skip our own events */
+ if (event_dev->netdev_ops == &virtnet_netdev)
+ return NOTIFY_DONE;
Would be an obstacle to bonding virtio_nets. But that's just one of
the thoughts, perhaps of disputable value. Anyway, separate netdev and
netdev_ops will solve this, and I think we agree that's preferable :)
^ permalink raw reply
* Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-01-23 16:03 UTC (permalink / raw)
To: Jason Wang, mst, stephen, davem, netdev, virtualization,
virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici
In-Reply-To: <afd11f19-6906-2a7e-1b82-e3138c4d9530@redhat.com>
On 1/23/2018 2:33 AM, Jason Wang wrote:
>
>
> On 2018年01月12日 13:58, Sridhar Samudrala wrote:
>> static netdev_tx_t start_xmit(struct sk_buff *skb, struct
>> net_device *dev)
>> {
>> struct virtnet_info *vi = netdev_priv(dev);
>> int qnum = skb_get_queue_mapping(skb);
>> struct send_queue *sq = &vi->sq[qnum];
>> + struct net_device *vf_netdev;
>> int err;
>> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> bool kick = !skb->xmit_more;
>> bool use_napi = sq->napi.weight;
>> + /* If VF is present and up then redirect packets
>> + * called with rcu_read_lock_bh
>> + */
>> + vf_netdev = rcu_dereference_bh(vi->vf_netdev);
>> + if (vf_netdev && netif_running(vf_netdev) &&
>> + !netpoll_tx_running(dev) &&
>> + is_unicast_ether_addr(eth_hdr(skb)->h_dest))
>> + return virtnet_vf_xmit(dev, vf_netdev, skb);
>> +
>
> A question here.
>
> If I read the code correctly, all features were validated against
> virtio instead VF before transmitting. This assumes VF's feature is a
> superset of virtio, does this really work? Do we need to sanitize the
> feature before joining? (e.g at last NETIF_R_GSO_ROBUST needs to be
> removed).
Actually, virtnet_vf_xmit() calls dev_queue_xmit() after updating
skb->dev to vf netdev.
So the features get validated against VF features and the right tx queue
is selected
before the real transmit.
Thanks
Sridhar
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net 2/2] vhost: do not try to access device IOTLB when not initialized
From: Michael S. Tsirkin @ 2018-01-23 15:58 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1516699646-7321-2-git-send-email-jasowang@redhat.com>
On Tue, Jan 23, 2018 at 05:27:26PM +0800, Jason Wang wrote:
> The code will try to access dev->iotlb when processing
> VHOST_IOTLB_INVALIDATE even if it was not initialized which may lead
> to NULL pointer dereference. Fixes this by check dev->iotlb before.
>
> Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/vhost.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 549771a..5727b18 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1015,6 +1015,10 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> vhost_iotlb_notify_vq(dev, msg);
> break;
> case VHOST_IOTLB_INVALIDATE:
> + if (!dev->iotlb) {
> + ret = -EFAULT;
> + break;
> + }
> vhost_vq_meta_reset(dev);
> vhost_del_umem_range(dev->iotlb, msg->iova,
> msg->iova + msg->size - 1);
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net 1/2] vhost: use mutex_lock_nested() in vhost_dev_lock_vqs()
From: Michael S. Tsirkin @ 2018-01-23 15:57 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1516699646-7321-1-git-send-email-jasowang@redhat.com>
On Tue, Jan 23, 2018 at 05:27:25PM +0800, Jason Wang wrote:
> We used to call mutex_lock() in vhost_dev_lock_vqs() which tries to
> hold mutexes of all virtqueues. This may confuse lockdep to report a
> possible deadlock because of trying to hold locks belong to same
> class. Switch to use mutex_lock_nested() to avoid false positive.
>
> Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
> Reported-by: syzbot+dbb7c1161485e61b0241@syzkaller.appspotmail.com
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/vhost.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 33ac2b1..549771a 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -904,7 +904,7 @@ static void vhost_dev_lock_vqs(struct vhost_dev *d)
> {
> int i = 0;
> for (i = 0; i < d->nvqs; ++i)
> - mutex_lock(&d->vqs[i]->mutex);
> + mutex_lock_nested(&d->vqs[i]->mutex, i);
> }
>
> static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> --
> 2.7.4
^ permalink raw reply
* Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Jason Wang @ 2018-01-23 10:33 UTC (permalink / raw)
To: Sridhar Samudrala, mst, stephen, davem, netdev, virtualization,
virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici
In-Reply-To: <1515736720-39368-3-git-send-email-sridhar.samudrala@intel.com>
On 2018年01月12日 13:58, Sridhar Samudrala wrote:
> static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> int qnum = skb_get_queue_mapping(skb);
> struct send_queue *sq = &vi->sq[qnum];
> + struct net_device *vf_netdev;
> int err;
> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> bool kick = !skb->xmit_more;
> bool use_napi = sq->napi.weight;
>
> + /* If VF is present and up then redirect packets
> + * called with rcu_read_lock_bh
> + */
> + vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> + if (vf_netdev && netif_running(vf_netdev) &&
> + !netpoll_tx_running(dev) &&
> + is_unicast_ether_addr(eth_hdr(skb)->h_dest))
> + return virtnet_vf_xmit(dev, vf_netdev, skb);
> +
A question here.
If I read the code correctly, all features were validated against virtio
instead VF before transmitting. This assumes VF's feature is a superset
of virtio, does this really work? Do we need to sanitize the feature
before joining? (e.g at last NETIF_R_GSO_ROBUST needs to be removed).
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net 2/2] vhost: do not try to access device IOTLB when not initialized
From: Jason Wang @ 2018-01-23 9:27 UTC (permalink / raw)
To: mst, jasowang, linux-kernel; +Cc: netdev, virtualization
In-Reply-To: <1516699646-7321-1-git-send-email-jasowang@redhat.com>
The code will try to access dev->iotlb when processing
VHOST_IOTLB_INVALIDATE even if it was not initialized which may lead
to NULL pointer dereference. Fixes this by check dev->iotlb before.
Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 549771a..5727b18 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1015,6 +1015,10 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
vhost_iotlb_notify_vq(dev, msg);
break;
case VHOST_IOTLB_INVALIDATE:
+ if (!dev->iotlb) {
+ ret = -EFAULT;
+ break;
+ }
vhost_vq_meta_reset(dev);
vhost_del_umem_range(dev->iotlb, msg->iova,
msg->iova + msg->size - 1);
--
2.7.4
^ permalink raw reply related
* [PATCH net 1/2] vhost: use mutex_lock_nested() in vhost_dev_lock_vqs()
From: Jason Wang @ 2018-01-23 9:27 UTC (permalink / raw)
To: mst, jasowang, linux-kernel; +Cc: netdev, virtualization
We used to call mutex_lock() in vhost_dev_lock_vqs() which tries to
hold mutexes of all virtqueues. This may confuse lockdep to report a
possible deadlock because of trying to hold locks belong to same
class. Switch to use mutex_lock_nested() to avoid false positive.
Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Reported-by: syzbot+dbb7c1161485e61b0241@syzkaller.appspotmail.com
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 33ac2b1..549771a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -904,7 +904,7 @@ static void vhost_dev_lock_vqs(struct vhost_dev *d)
{
int i = 0;
for (i = 0; i < d->nvqs; ++i)
- mutex_lock(&d->vqs[i]->mutex);
+ mutex_lock_nested(&d->vqs[i]->mutex, i);
}
static void vhost_dev_unlock_vqs(struct vhost_dev *d)
--
2.7.4
^ permalink raw reply related
* Re: Ping Re: [PATCH] virtio: make VIRTIO a menuconfig to ease disabling it all
From: Vincent Legoll @ 2018-01-23 8:08 UTC (permalink / raw)
To: Michael Ellerman, sfr
Cc: virtualization, Randy Dunlap, linux-kernel, Michael S. Tsirkin
In-Reply-To: <87h8rdp55n.fsf@concordia.ellerman.id.au>
On 1/23/18, Michael Ellerman <mpe@ellerman.id.au> wrote:
> This has been broken in linux-next for ~6 weeks now, can we please merge
> this and get it fixed.
Added Stephen Rothwell to cc
--
Vincent Legoll
^ permalink raw reply
* Re: [virtio-dev] [RFC PATCH net-next v2 1/2] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
From: Samudrala, Sridhar @ 2018-01-23 5:54 UTC (permalink / raw)
To: Alexander Duyck, Michael S. Tsirkin
Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski, Netdev,
virtualization, achiad shochat, Achiad Shochat, David Miller
In-Reply-To: <CAKgT0UfaKG7=E5ciarE3fSUp-Ke-oDNodGe_baEAGbZqWV4Qug@mail.gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 4174 bytes --]
On 1/22/2018 7:36 PM, Alexander Duyck wrote:
> On Mon, Jan 22, 2018 at 6:04 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Mon, Jan 22, 2018 at 05:34:37PM -0800, Samudrala, Sridhar wrote:
>>> On 1/22/2018 4:05 PM, Michael S. Tsirkin wrote:
>>>> On Mon, Jan 22, 2018 at 03:27:40PM -0800, Samudrala, Sridhar wrote:
>>>>>>> You could probably
>>>>>>> even handle the Tx queue selection via a simple eBPF program and map
>>>>>>> since the input for whatever is used to select Tx should be pretty
>>>>>>> simple, destination MAC, source NUMA node, etc, and the data-set
>>>>>>> shouldn't be too large.
>>>>>> That sounds interesting. A separate device might make this kind of setup
>>>>>> a bit easier. Sridhar, did you look into creating a separate device for
>>>>>> the virtual bond device at all? It does not have to be in a separate
>>>>>> module, that kind of refactoring can come later, but once we commit to
>>>>>> using the same single device as virtio, we can't change that.
>>>>> No. I haven't looked into creating a separate device. If we are going to
>>>>> create a new
>>>>> device, i guess it has to be of a new device type with its own driver.
>>>> Well not necessarily - just a separate netdev ops.
>>>> Kind of like e.g. vlans share a driver with the main driver.
>>> Not sure what you meant by vlans sharing a driver with the main driver.
>>> IIUC, vlans are supported via 802.1q driver and creates a netdev of type
>>> 'vlan'
>>> with vlan_netdev_ops
>> But nothing prevents a single module from registering
>> multiple ops.
> Just to clarify, it seems like what you are suggesting is just adding
> the "master" as a separate set of netdev ops or netdevice and to have
> virtio spawn two network devices, one slave and one master, if the
> BACKUP bit is set. Do I have that right?
>
> I am good with the code still living in the virtio driver and
> consolidation with other similar implementations and further
> improvement could probably happen later as part of some refactor.
>
Here are the netdev_ops that are currently supported by virtio_net
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
.ndo_start_xmit = start_xmit,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = virtnet_set_mac_address,
.ndo_set_rx_mode = virtnet_set_rx_mode,
.ndo_get_stats64 = virtnet_stats,
.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = virtnet_netpoll,
#endif
.ndo_bpf = virtnet_xdp,
.ndo_xdp_xmit = virtnet_xdp_xmit,
.ndo_xdp_flush = virtnet_xdp_flush,
.ndo_features_check = passthru_features_check,
};
Now if we want to create 2 netdevs associated with a single
virtio_device, do we want these ops
supported by master or slave netdev? I guess these should be supported
by the slave netdev.
So do we want the netdev_ops of master simply call the slave netdev_ops
for most the the cases
except for the ndo_start_xmit() which will decide between virtio or vf
datapath?
what about ethtool_ops? we need to sync up link state between master and
slave netdevs and
the userspace needs to be aware of 2 type of virtio_net devices.
Is this complexity really required to support a feature that can only be
supported/useful for
simple guest network configurations that can be controlled by
hypervisor. virtio_net device
that could be accelerated via a passthru device and support live migration.
If a guest admin needs to configure any complex network configurations
in the guest, i think those
scenarios can be supported via bond/bridge/team setups and live
migration may not be a requirement
for such usecases.
Thanks
Sridhar
[-- Attachment #1.2: Type: text/html, Size: 5503 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Ping Re: [PATCH] virtio: make VIRTIO a menuconfig to ease disabling it all
From: Michael Ellerman @ 2018-01-23 4:31 UTC (permalink / raw)
To: Michael S. Tsirkin, Jason Wang, virtualization, linux-kernel
Cc: Vincent Legoll, Randy Dunlap
In-Reply-To: <20180107113356.2806-1-vincent.legoll@gmail.com>
Vincent Legoll <vincent.legoll@gmail.com> writes:
> No need to get into the submenu to disable all VIRTIO-related
> config entries.
>
> This makes it easier to disable all VIRTIO config options
> without entering the submenu. It will also enable one
> to see that en/dis-abled state from the outside menu.
>
> This is only intended to change menuconfig UI, not change
> the config dependencies.
>
> v2: Added "default y" to avoid breaking existing configs
> v3: Fixed wrong indentation, added *-by from Randy
>
> Signed-off-by: Vincent Legoll <vincent.legoll@gmail.com>
> Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
> Tested-by: Randy Dunlap <rdunlap@infradead.org> # works for me
> ---
> drivers/virtio/Kconfig | 8 ++++++--
> 1 file changed, 6 insertions(+), 2 deletions(-)
This has been broken in linux-next for ~6 weeks now, can we please merge
this and get it fixed.
cheers
^ permalink raw reply
* Re: [virtio-dev] [RFC PATCH net-next v2 1/2] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
From: Alexander Duyck @ 2018-01-23 3:36 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski,
Samudrala, Sridhar, virtualization, achiad shochat,
Achiad Shochat, Netdev, David Miller
In-Reply-To: <20180123035619-mutt-send-email-mst@kernel.org>
On Mon, Jan 22, 2018 at 6:04 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jan 22, 2018 at 05:34:37PM -0800, Samudrala, Sridhar wrote:
>> On 1/22/2018 4:05 PM, Michael S. Tsirkin wrote:
>> > On Mon, Jan 22, 2018 at 03:27:40PM -0800, Samudrala, Sridhar wrote:
>> > > > > You could probably
>> > > > > even handle the Tx queue selection via a simple eBPF program and map
>> > > > > since the input for whatever is used to select Tx should be pretty
>> > > > > simple, destination MAC, source NUMA node, etc, and the data-set
>> > > > > shouldn't be too large.
>> > > > That sounds interesting. A separate device might make this kind of setup
>> > > > a bit easier. Sridhar, did you look into creating a separate device for
>> > > > the virtual bond device at all? It does not have to be in a separate
>> > > > module, that kind of refactoring can come later, but once we commit to
>> > > > using the same single device as virtio, we can't change that.
>> > > No. I haven't looked into creating a separate device. If we are going to
>> > > create a new
>> > > device, i guess it has to be of a new device type with its own driver.
>> > Well not necessarily - just a separate netdev ops.
>> > Kind of like e.g. vlans share a driver with the main driver.
>>
>> Not sure what you meant by vlans sharing a driver with the main driver.
>> IIUC, vlans are supported via 802.1q driver and creates a netdev of type
>> 'vlan'
>> with vlan_netdev_ops
>
> But nothing prevents a single module from registering
> multiple ops.
Just to clarify, it seems like what you are suggesting is just adding
the "master" as a separate set of netdev ops or netdevice and to have
virtio spawn two network devices, one slave and one master, if the
BACKUP bit is set. Do I have that right?
I am good with the code still living in the virtio driver and
consolidation with other similar implementations and further
improvement could probably happen later as part of some refactor.
Thanks.
- Alex
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox