Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Factor out generic busy polling logic and will be
used for tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..458f81d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
+static int sk_has_rx_data(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (sock->ops->peek_len)
+		return sock->ops->peek_len(sock);
+
+	return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+				struct vhost_virtqueue *rvq,
+				struct vhost_virtqueue *tvq,
+				bool rx)
+{
+	unsigned long uninitialized_var(endtime);
+	struct socket *sock = rvq->private_data;
+	struct vhost_virtqueue *vq = rx ? tvq : rvq;
+	unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
+					      tvq->busyloop_timeout;
+
+	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+	vhost_disable_notify(&net->dev, vq);
+
+	preempt_disable();
+	endtime = busy_clock() + busyloop_timeout;
+	while (vhost_can_busy_poll(tvq->dev, endtime) &&
+	       !(sock && sk_has_rx_data(sock->sk)) &&
+	       vhost_vq_avail_empty(tvq->dev, tvq))
+		cpu_relax();
+	preempt_enable();
+
+	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+		vhost_poll_queue(&vq->poll);
+	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+		vhost_disable_notify(&net->dev, vq);
+		vhost_poll_queue(&vq->poll);
+	}
+
+	mutex_unlock(&vq->mutex);
+}
+
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
 				    struct iovec iov[], unsigned int iov_size,
@@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	return len;
 }
 
-static int sk_has_rx_data(struct sock *sk)
-{
-	struct socket *sock = sk->sk_socket;
-
-	if (sock->ops->peek_len)
-		return sock->ops->peek_len(sock);
-
-	return skb_queue_empty(&sk->sk_receive_queue);
-}
-
 static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
-	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
-	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
-	struct vhost_virtqueue *vq = &nvq->vq;
-	unsigned long uninitialized_var(endtime);
-	int len = peek_head_len(rvq, sk);
+	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
 
-	if (!len && vq->busyloop_timeout) {
-		/* Flush batched heads first */
-		vhost_rx_signal_used(rvq);
-		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
-		vhost_disable_notify(&net->dev, vq);
+	int len = peek_head_len(nvq_rx, sk);
 
-		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-
-		while (vhost_can_busy_poll(&net->dev, endtime) &&
-		       !sk_has_rx_data(sk) &&
-		       vhost_vq_avail_empty(&net->dev, vq))
-			cpu_relax();
-
-		preempt_enable();
-
-		if (!vhost_vq_avail_empty(&net->dev, vq))
-			vhost_poll_queue(&vq->poll);
-		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-			vhost_disable_notify(&net->dev, vq);
-			vhost_poll_queue(&vq->poll);
-		}
+	if (!len && nvq_rx->vq.busyloop_timeout) {
+		/* Flush batched heads first */
+		vhost_rx_signal_used(nvq_rx);
 
-		mutex_unlock(&vq->mutex);
+		/* Both tx vq and rx socket were polled here */
+		vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
 
-		len = peek_head_len(rvq, sk);
+		len = peek_head_len(nvq_rx, sk);
 	}
 
 	return len;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 2/4] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/net.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..62bb8e8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
-	mutex_lock(&vq->mutex);
+	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
@@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 		/* Flush batched heads first */
 		vhost_rx_signal_used(rvq);
 		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, 1);
+		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
 		vhost_disable_notify(&net->dev, vq);
 
 		preempt_disable();
@@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
 	__virtio16 num_buffers;
 	int recv_pkts = 0;
 
-	mutex_lock_nested(&vq->mutex, 0);
+	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 1/4] net: vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, virtualization, Tonghao Zhang
In-Reply-To: <1530340438-3039-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patch changes the way that lock all vqs
at the same, to lock them one by one. It will
be used for next patch to avoid the deadlock.

Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 drivers/vhost/vhost.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 895eaa2..4ca9383 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 {
 	int i;
 
-	for (i = 0; i < d->nvqs; ++i)
+	for (i = 0; i < d->nvqs; ++i) {
+		mutex_lock(&d->vqs[i]->mutex);
 		__vhost_vq_meta_reset(d->vqs[i]);
+		mutex_unlock(&d->vqs[i]->mutex);
+	}
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 #define vhost_get_used(vq, x, ptr) \
 	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_unlock(&d->vqs[i]->mutex);
-}
-
 static int vhost_new_umem_range(struct vhost_umem *umem,
 				u64 start, u64 size, u64 end,
 				u64 userspace_addr, int perm)
@@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
 		if (msg->iova <= vq_msg->iova &&
 		    msg->iova + msg->size - 1 > vq_msg->iova &&
 		    vq_msg->type == VHOST_IOTLB_MISS) {
+			mutex_lock(&node->vq->mutex);
 			vhost_poll_queue(&node->vq->poll);
+			mutex_unlock(&node->vq->mutex);
+
 			list_del(&node->node);
 			kfree(node);
 		}
@@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 	int ret = 0;
 
 	mutex_lock(&dev->mutex);
-	vhost_dev_lock_vqs(dev);
 	switch (msg->type) {
 	case VHOST_IOTLB_UPDATE:
 		if (!dev->iotlb) {
@@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 		break;
 	}
 
-	vhost_dev_unlock_vqs(dev);
 	mutex_unlock(&dev->mutex);
 
 	return ret;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v3 0/4] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-06-30  6:33 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, virtualization, mst

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patches improve the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the same time.
handle_rx do that in the same way.

This patches are splited from previous big patch:
http://patchwork.ozlabs.org/patch/934673/

For more performance report, see patch 4.

Tonghao Zhang (4):
  net: vhost: lock the vqs one by one
  net: vhost: replace magic number of lock annotation
  net: vhost: factor out busy polling logic to vhost_net_busy_poll()
  net: vhost: add rx busy polling in tx path

 drivers/vhost/net.c   | 106 +++++++++++++++++++++++++++-----------------------
 drivers/vhost/vhost.c |  24 ++++--------
 2 files changed, 65 insertions(+), 65 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Wei Wang @ 2018-06-30  4:31 UTC (permalink / raw)
  To: virtio-dev, linux-kernel, virtualization, kvm, linux-mm, mst,
	mhocko, akpm
  Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
	nilal, torvalds
In-Reply-To: <1529928312-30500-1-git-send-email-wei.w.wang@intel.com>

On 06/25/2018 08:05 PM, Wei Wang wrote:
> This patch series is separated from the previous "Virtio-balloon
> Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> implemented by this series enables the virtio-balloon driver to report
> hints of guest free pages to the host. It can be used to accelerate live
> migration of VMs. Here is an introduction of this usage:
>
> Live migration needs to transfer the VM's memory from the source machine
> to the destination round by round. For the 1st round, all the VM's memory
> is transferred. From the 2nd round, only the pieces of memory that were
> written by the guest (after the 1st round) are transferred. One method
> that is popularly used by the hypervisor to track which part of memory is
> written is to write-protect all the guest memory.
>
> This feature enables the optimization by skipping the transfer of guest
> free pages during VM live migration. It is not concerned that the memory
> pages are used after they are given to the hypervisor as a hint of the
> free pages, because they will be tracked by the hypervisor and transferred
> in the subsequent round if they are used and written.
>
> * Tests
> - Test Environment
>      Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
>      Guest: 8G RAM, 4 vCPU
>      Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
>
> - Test Results
>      - Idle Guest Live Migration Time (results are averaged over 10 runs):
>          - Optimization v.s. Legacy = 284ms vs 1757ms --> ~84% reduction

According to Michael's comments, add one more set of data here:

Enabling page poison with value=0, and enable KSM.

The legacy live migration time is 1806ms (averaged across 10 runs), 
compared to the case with this optimization feature in use (i.e. 284ms), 
there is still around ~84% reduction.


Best,
Wei

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Michael S. Tsirkin @ 2018-06-29 16:38 UTC (permalink / raw)
  To: Toshiaki Makita; +Cc: netdev, kvm, virtualization
In-Reply-To: <1530259790-2414-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>

On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
> Under heavy load vhost busypoll may run without suppressing
> notification. For example tx zerocopy callback can push tx work while
> handle_tx() is running, then busyloop exits due to vhost_has_work()
> condition and enables notification but immediately reenters handle_tx()
> because the pushed work was tx. In this case handle_tx() tries to
> disable notification again, but when using event_idx it by design
> cannot. Then busyloop will run without suppressing notification.
> Another example is the case where handle_tx() tries to enable
> notification but avail idx is advanced so disables it again. This case
> also lead to the same situation with event_idx.
> 
> The problem is that once we enter this situation busyloop does not work
> under heavy load for considerable amount of time, because notification
> is likely to happen during busyloop and handle_tx() immediately enables
> notification after notification happens. Specifically busyloop detects
> notification by vhost_has_work() and then handle_tx() calls
> vhost_enable_notify().

I'd like to understand the problem a bit better.
Why does this happen?
Doesn't this only happen if ring is empty?

> Because the detected work was the tx work, it
> enters handle_tx(), and enters busyloop without suppression again.
> This is likely to be repeated, so with event_idx we are almost not able
> to suppress notification in this case.
> 
> To fix this, poll the work instead of enabling notification when
> busypoll is interrupted by something. IMHO signal_pending() and
> vhost_has_work() are kind of interruptions rather than signals to
> completely cancel the busypoll, so let's run busypoll after the
> necessary work is done. In order to avoid too long busyloop due to
> interruption, save the endtime in vq field and use it when reentering
> the work function.
> 
> I observed this problem makes tx performance poor but does not rx. I
> guess it is because rx notification from socket is not that heavy so
> did not impact the performance even when we failed to mask the
> notification. Anyway for consistency I fixed rx routine as well as tx.
> 
> Performance numbers:
> 
> - Bulk transfer from guest to external physical server.
>     [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
> - Set 10us busypoll.
> - Guest disables checksum and TSO because of host XDP.
> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>   (EPT_MISCONFIG event).
> 
>                             Before              After
>                           Mbps  kicks/s      Mbps  kicks/s
> UDP_STREAM 1472byte              247758                 27
>                 Send   3645.37            6958.10
>                 Recv   3588.56            6958.10
>               1byte                9865                 37
>                 Send      4.34               5.43
>                 Recv      4.17               5.26
> TCP_STREAM             8801.03    45794   9592.77     2884
> 
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

Is this with busy poll enabled?
Are there CPU utilization #s?

> ---
>  drivers/vhost/net.c   | 94 +++++++++++++++++++++++++++++++++++----------------
>  drivers/vhost/vhost.c |  1 +
>  drivers/vhost/vhost.h |  1 +
>  3 files changed, 66 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index eeaf6739215f..0e85f628b965 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -391,13 +391,14 @@ static inline unsigned long busy_clock(void)
>  	return local_clock() >> 10;
>  }
>  
> -static bool vhost_can_busy_poll(struct vhost_dev *dev,
> -				unsigned long endtime)
> +static bool vhost_can_busy_poll(unsigned long endtime)
>  {
> -	return likely(!need_resched()) &&
> -	       likely(!time_after(busy_clock(), endtime)) &&
> -	       likely(!signal_pending(current)) &&
> -	       !vhost_has_work(dev);
> +	return likely(!need_resched() && !time_after(busy_clock(), endtime));
> +}
> +
> +static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
> +{
> +	return unlikely(signal_pending(current)) || vhost_has_work(dev);
>  }
>  
>  static void vhost_net_disable_vq(struct vhost_net *n,
> @@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>  
>  	if (r == vq->num && vq->busyloop_timeout) {
>  		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> +		if (vq->busyloop_endtime) {
> +			endtime = vq->busyloop_endtime;
> +			vq->busyloop_endtime = 0;
> +		} else {
> +			endtime = busy_clock() + vq->busyloop_timeout;
> +		}
> +		while (vhost_can_busy_poll(endtime)) {
> +			if (vhost_busy_poll_interrupted(vq->dev)) {
> +				vq->busyloop_endtime = endtime;
> +				break;
> +			}
> +			if (!vhost_vq_avail_empty(vq->dev, vq))
> +				break;
>  			cpu_relax();
> +		}
>  		preempt_enable();
>  		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>  				      out_num, in_num, NULL, NULL);
> @@ -509,12 +521,16 @@ static void handle_tx(struct vhost_net *net)
>  			break;
>  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
>  		if (head == vq->num) {
> -			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> +			if (unlikely(vq->busyloop_endtime)) {
> +				/* Busy poll is interrupted. */
> +				vhost_poll_queue(&vq->poll);
> +			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>  				vhost_disable_notify(&net->dev, vq);
>  				continue;
>  			}
>  			break;
>  		}
> +		vq->busyloop_endtime = 0;
>  		if (in) {
>  			vq_err(vq, "Unexpected descriptor format for TX: "
>  			       "out %d, int %d\n", out, in);
> @@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>  
>  static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>  {
> -	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> -	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> -	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> +	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> +	struct vhost_virtqueue *rvq = &rnvq->vq;
> +	struct vhost_virtqueue *tvq = &tnvq->vq;
>  	unsigned long uninitialized_var(endtime);
> -	int len = peek_head_len(rvq, sk);
> +	int len = peek_head_len(rnvq, sk);
>  
> -	if (!len && vq->busyloop_timeout) {
> +	if (!len && tvq->busyloop_timeout) {
>  		/* Flush batched heads first */
> -		vhost_rx_signal_used(rvq);
> +		vhost_rx_signal_used(rnvq);
>  		/* Both tx vq and rx socket were polled here */
> -		mutex_lock_nested(&vq->mutex, 1);
> -		vhost_disable_notify(&net->dev, vq);
> +		mutex_lock_nested(&tvq->mutex, 1);
> +		vhost_disable_notify(&net->dev, tvq);
>  
>  		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> +		if (rvq->busyloop_endtime) {
> +			endtime = rvq->busyloop_endtime;
> +			rvq->busyloop_endtime = 0;
> +		} else {
> +			endtime = busy_clock() + tvq->busyloop_timeout;
> +		}
>  
> -		while (vhost_can_busy_poll(&net->dev, endtime) &&
> -		       !sk_has_rx_data(sk) &&
> -		       vhost_vq_avail_empty(&net->dev, vq))
> +		while (vhost_can_busy_poll(endtime)) {
> +			if (vhost_busy_poll_interrupted(&net->dev)) {
> +				rvq->busyloop_endtime = endtime;
> +				break;
> +			}
> +			if (sk_has_rx_data(sk) ||
> +			    !vhost_vq_avail_empty(&net->dev, tvq))
> +				break;
>  			cpu_relax();
> +		}
>  
>  		preempt_enable();
>  
> -		if (!vhost_vq_avail_empty(&net->dev, vq))
> -			vhost_poll_queue(&vq->poll);
> -		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> -			vhost_disable_notify(&net->dev, vq);
> -			vhost_poll_queue(&vq->poll);
> +		if (!vhost_vq_avail_empty(&net->dev, tvq)) {
> +			vhost_poll_queue(&tvq->poll);
> +		} else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
> +			vhost_disable_notify(&net->dev, tvq);
> +			vhost_poll_queue(&tvq->poll);
>  		}
>  
> -		mutex_unlock(&vq->mutex);
> +		mutex_unlock(&tvq->mutex);
>  
> -		len = peek_head_len(rvq, sk);
> +		len = peek_head_len(rnvq, sk);
>  	}
>  
>  	return len;
> @@ -804,6 +832,7 @@ static void handle_rx(struct vhost_net *net)
>  	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
>  
>  	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
> +		vq->busyloop_endtime = 0;
>  		sock_len += sock_hlen;
>  		vhost_len = sock_len + vhost_hlen;
>  		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
> @@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
>  			goto out;
>  		}
>  	}
> -	vhost_net_enable_vq(net, vq);
> +	if (unlikely(vq->busyloop_endtime)) {
> +		/* Busy poll is interrupted. */
> +		vhost_poll_queue(&vq->poll);
> +	} else {
> +		vhost_net_enable_vq(net, vq);
> +	}
>  out:
>  	vhost_rx_signal_used(nvq);
>  	mutex_unlock(&vq->mutex);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 9beefa6ed1ce..fe83578fe336 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -323,6 +323,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vhost_reset_is_le(vq);
>  	vhost_disable_cross_endian(vq);
>  	vq->busyloop_timeout = 0;
> +	vq->busyloop_endtime = 0;
>  	vq->umem = NULL;
>  	vq->iotlb = NULL;
>  	__vhost_vq_meta_reset(vq);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 6c844b90a168..7e9cf80ccae9 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -144,6 +144,7 @@ struct vhost_virtqueue {
>  	bool user_be;
>  #endif
>  	u32 busyloop_timeout;
> +	unsigned long busyloop_endtime;
>  };
>  
>  struct vhost_msg_node {
> -- 
> 2.14.2
> 

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Michael S. Tsirkin @ 2018-06-29 16:32 UTC (permalink / raw)
  To: Wang, Wei W
  Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
	riel@redhat.com, quan.xu0@gmail.com, kvm@vger.kernel.org,
	nilal@redhat.com, liliang.opensource@gmail.com,
	linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, linux-mm@kvack.org,
	pbonzini@redhat.com, akpm@linux-foundation.org, mhocko@kernel.org,
	torvalds@linux-foundation.org
In-Reply-To: <286AC319A985734F985F78AFA26841F7396C251E@shsmsx102.ccr.corp.intel.com>

On Fri, Jun 29, 2018 at 03:52:40PM +0000, Wang, Wei W wrote:
> On Friday, June 29, 2018 10:46 PM, Michael S. Tsirkin wrote:
> > To: David Hildenbrand <david@redhat.com>
> > Cc: Wang, Wei W <wei.w.wang@intel.com>; virtio-dev@lists.oasis-open.org;
> > linux-kernel@vger.kernel.org; virtualization@lists.linux-foundation.org;
> > kvm@vger.kernel.org; linux-mm@kvack.org; mhocko@kernel.org;
> > akpm@linux-foundation.org; torvalds@linux-foundation.org;
> > pbonzini@redhat.com; liliang.opensource@gmail.com;
> > yang.zhang.wz@gmail.com; quan.xu0@gmail.com; nilal@redhat.com;
> > riel@redhat.com; peterx@redhat.com
> > Subject: Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
> > 
> > On Wed, Jun 27, 2018 at 01:06:32PM +0200, David Hildenbrand wrote:
> > > On 25.06.2018 14:05, Wei Wang wrote:
> > > > This patch series is separated from the previous "Virtio-balloon
> > > > Enhancement" series. The new feature,
> > > > VIRTIO_BALLOON_F_FREE_PAGE_HINT, implemented by this series
> > enables
> > > > the virtio-balloon driver to report hints of guest free pages to the
> > > > host. It can be used to accelerate live migration of VMs. Here is an
> > introduction of this usage:
> > > >
> > > > Live migration needs to transfer the VM's memory from the source
> > > > machine to the destination round by round. For the 1st round, all
> > > > the VM's memory is transferred. From the 2nd round, only the pieces
> > > > of memory that were written by the guest (after the 1st round) are
> > > > transferred. One method that is popularly used by the hypervisor to
> > > > track which part of memory is written is to write-protect all the guest
> > memory.
> > > >
> > > > This feature enables the optimization by skipping the transfer of
> > > > guest free pages during VM live migration. It is not concerned that
> > > > the memory pages are used after they are given to the hypervisor as
> > > > a hint of the free pages, because they will be tracked by the
> > > > hypervisor and transferred in the subsequent round if they are used and
> > written.
> > > >
> > > > * Tests
> > > > - Test Environment
> > > >     Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> > > >     Guest: 8G RAM, 4 vCPU
> > > >     Migration setup: migrate_set_speed 100G, migrate_set_downtime 2
> > > > second
> > > >
> > > > - Test Results
> > > >     - Idle Guest Live Migration Time (results are averaged over 10 runs):
> > > >         - Optimization v.s. Legacy = 284ms vs 1757ms --> ~84% reduction
> > > >     - Guest with Linux Compilation Workload (make bzImage -j4):
> > > >         - Live Migration Time (average)
> > > >           Optimization v.s. Legacy = 1402ms v.s. 2528ms --> ~44% reduction
> > > >         - Linux Compilation Time
> > > >           Optimization v.s. Legacy = 5min6s v.s. 5min12s
> > > >           --> no obvious difference
> > > >
> > >
> > > Being in version 34 already, this whole thing still looks and feels
> > > like a big hack to me. It might just be me, but especially if I read
> > > about assumptions like "QEMU will not hotplug memory during
> > > migration". This does not feel like a clean solution.
> > >
> > > I am still not sure if we really need this interface, especially as
> > > real free page hinting might be on its way.
> > >
> > > a) we perform free page hinting by setting all free pages
> > > (arch_free_page()) to zero. Migration will detect zero pages and
> > > minimize #pages to migrate. I don't think this is a good idea but
> > > Michel suggested to do a performance evaluation and Nitesh is looking
> > > into that right now.
> > 
> > Yes this test is needed I think. If we can get most of the benefit without PV
> > interfaces, that's nice.
> > 
> > Wei, I think you need this as part of your performance comparison
> > too: set page poisoning value to 0 and enable KSM, compare with your
> > patches.
> 
> Do you mean live migration with zero pages?
> I can first share the amount of memory transferred during live migration I saw,
> Legacy is around 380MB,
> Optimization is around 340MB.
> This proves that most pages have already been 0 and skipped during the legacy live migration. But the legacy time is still much larger because zero page checking is costly. 
> (It's late night here, I can get you that with my server probably tomorrow)
> 
> Best,
> Wei

Sure thing.

Also we might want to look at optimizing the RLE compressor for
the common case of pages full of zeroes.

Here are some ideas:
https://rusty.ozlabs.org/?p=560

Note Epiphany #2 as well as comments Paolo Bonzini and by Victor Kaplansky.

-- 
MST

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: David Hildenbrand @ 2018-06-29 16:03 UTC (permalink / raw)
  To: Wang, Wei W, virtio-dev@lists.oasis-open.org,
	linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, kvm@vger.kernel.org,
	linux-mm@kvack.org, mst@redhat.com, mhocko@kernel.org,
	akpm@linux-foundation.org
  Cc: yang.zhang.wz@gmail.com, Andrea Arcangeli, riel@redhat.com,
	quan.xu0@gmail.com, liliang.opensource@gmail.com, Luiz Capitulino,
	pbonzini@redhat.com, nilal@redhat.com,
	torvalds@linux-foundation.org
In-Reply-To: <286AC319A985734F985F78AFA26841F7396C254C@shsmsx102.ccr.corp.intel.com>


>> Why would your suggestion still be applicable?
>>
>> Your point for now is "I might not want to have page hinting enabled due to
>> the overhead, but still a live migration speedup". If that overhead actually
>> exists (we'll have to see) or there might be another reason to disable page
>> hinting, then we have to decide if that specific setup is worth it merging your
>> changes.
> 
> All the above "if we have", "assume we have" don't sound like a valid argument to me.

Argument? *confused* And that hinders you from answering the question
"Why would your suggestion still be applicable?" ? Well, okay.

So I will answer it by myself: Because somebody would want to disable
page hinting. Maybe there are some people out there.

>  
>> I am not (and don't want to be) in the position to make any decisions here :) I
>> just want to understand if two interfaces for free pages actually make sense.
> 
> I responded to Nitesh about the differences, you may want to check with him about this.
> I would suggest you to send out your patches to LKML to get a discussion with the mm folks.

Indeed, Nitesh is trying to solve the problems we found in the RFC, so
this can take some time.

> 
> Best,
> Wei
> 


-- 

Thanks,

David / dhildenb

^ permalink raw reply

* RE: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Wang, Wei W @ 2018-06-29 15:55 UTC (permalink / raw)
  To: David Hildenbrand, virtio-dev@lists.oasis-open.org,
	linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, kvm@vger.kernel.org,
	linux-mm@kvack.org, mst@redhat.com, mhocko@kernel.org,
	akpm@linux-foundation.org
  Cc: yang.zhang.wz@gmail.com, Andrea Arcangeli, riel@redhat.com,
	quan.xu0@gmail.com, liliang.opensource@gmail.com, Luiz Capitulino,
	pbonzini@redhat.com, nilal@redhat.com,
	torvalds@linux-foundation.org
In-Reply-To: <34bb25eb-97f3-8a9f-8a13-401dfcf39a2c@redhat.com>

On Friday, June 29, 2018 7:54 PM, David Hildenbrand wrote:
> On 29.06.2018 13:31, Wei Wang wrote:
> > On 06/29/2018 03:46 PM, David Hildenbrand wrote:
> >>>
> >>> I'm afraid it can't. For example, when we have a guest booted,
> >>> without too many memory activities. Assume the guest has 8GB free
> >>> memory. The arch_free_page there won't be able to capture the 8GB
> >>> free pages since there is no free() called. This results in no free pages
> reported to host.
> >>
> >> So, it takes some time from when the guest boots up until the balloon
> >> device was initialized and therefore page hinting can start. For that
> >> period, you won't get any arch_free_page()/page hinting callbacks, correct.
> >>
> >> However in the hypervisor, you can theoretically track which pages
> >> the guest actually touched ("dirty"), so you already know "which
> >> pages were never touched while booting up until virtio-balloon was
> >> brought to life". These, you can directly exclude from migration. No
> >> interface required.
> >>
> >> The remaining problem is pages that were touched ("allocated") by the
> >> guest during bootup but freed again, before virtio-balloon came up.
> >> One would have to measure how many pages these usually are, I would
> >> say it would not be that many (because recently freed pages are
> >> likely to be used again next for allocation). However, there are some
> >> pages not being reported.
> >>
> >> During the lifetime of the guest, this should not be a problem,
> >> eventually one of these pages would get allocated/freed again, so the
> >> problem "solves itself over time". You are looking into the special
> >> case of migrating the VM just after it has been started. But we have
> >> the exact same problem also for ordinary free page hinting, so we
> >> should rather solve that problem. It is not migration specific.
> >>
> >> If we are looking for an alternative to "problem solves itself",
> >> something like "if virtio-balloon comes up, it will report all free
> >> pages step by step using free page hinting, just like we would have
> >> from "arch_free_pages()"". This would be the same interface we are
> >> using for free page hinting - and it could even be made configurable in the
> guest.
> >>
> >> The current approach we are discussing internally for details about
> >> Nitesh's work ("how the magic inside arch_fee_pages() will work
> >> efficiently) would allow this as far as I can see just fine.
> >>
> >> There would be a tiny little window between virtio-balloon comes up
> >> and it has reported all free pages step by step, but that can be
> >> considered a very special corner case that I would argue is not worth
> >> it to be optimized.
> >>
> >> If I am missing something important here, sorry in advance :)
> >>
> >
> > Probably I didn't explain that well. Please see my re-try:
> >
> > That work is to monitor page allocation and free activities via
> > arch_alloc_pages and arch_free_pages. It has per-CPU lists to record
> > the pages that are freed to the mm free list, and the per-CPU lists
> > dump the recorded pages to a global list when any of them is full.
> > So its own per-CPU list will only be able to get free pages when there
> > is an mm free() function gets called. If we have 8GB free memory on
> > the mm free list, but no application uses them and thus no mm free()
> > calls are made. In that case, the arch_free_pages isn't called, and no
> > free pages added to the per-CPU list, but we have 8G free memory right
> > on the mm free list.
> > How would you guarantee the per-CPU lists have got all the free pages
> > that the mm free lists have?
> 
> As I said, if we have some mechanism that will scan the free pages (not
> arch_free_page() once and report hints using the same mechanism step by
> step (not your bulk interface)), this problem is solved. And as I said, this is
> not a migration specific problem, we have the same problem in the current
> page hinting RFC. These pages have to be reported.
> 
> >
> > - I'm also worried about the overhead of maintaining so many per-CPU
> > lists and the global list. For example, if we have applications
> > frequently allocate and free 4KB pages, and each per-CPU list needs to
> > implement the buddy algorithm to sort and merge neighbor pages. Today
> > a server can have more than 100 CPUs, then there will be more than 100
> > per-CPU lists which need to sync to a global list under a lock, I'm
> > not sure if this would scale well.
> 
> The overhead in the current RFC is definitely too high. But I consider this a
> problem to be solved before page hinting would go upstream. And we are
> discussing right now "if we have a reasonable page hinting implementation,
> why would we need your interface in addition".
> 
> >
> > - This seems to be a burden imposed on the core mm memory
> > allocation/free path. The whole overhead needs to be carried during
> > the whole system life cycle. What we actually expected is to just make
> > one call to get the free page hints only when live migration happens.
> 
> You're focusing too much on the actual implementation of the page hinting
> RFC right now. Assume for now that we would have
> - efficient page hinting without degrading other CPUs and little
>   overhead
> - a mechanism that solves reporting free pages once after we started up
>   virtio-balloon and actual free page hinting starts
> 
> Why would your suggestion still be applicable?
> 
> Your point for now is "I might not want to have page hinting enabled due to
> the overhead, but still a live migration speedup". If that overhead actually
> exists (we'll have to see) or there might be another reason to disable page
> hinting, then we have to decide if that specific setup is worth it merging your
> changes.

All the above "if we have", "assume we have" don't sound like a valid argument to me.
 
> I am not (and don't want to be) in the position to make any decisions here :) I
> just want to understand if two interfaces for free pages actually make sense.

I responded to Nitesh about the differences, you may want to check with him about this.
I would suggest you to send out your patches to LKML to get a discussion with the mm folks.

Best,
Wei

^ permalink raw reply

* RE: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Wang, Wei W @ 2018-06-29 15:52 UTC (permalink / raw)
  To: Michael S. Tsirkin, David Hildenbrand
  Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
	riel@redhat.com, quan.xu0@gmail.com, kvm@vger.kernel.org,
	nilal@redhat.com, liliang.opensource@gmail.com,
	linux-kernel@vger.kernel.org, mhocko@kernel.org,
	linux-mm@kvack.org, pbonzini@redhat.com,
	akpm@linux-foundation.org,
	virtualization@lists.linux-foundation.org,
	torvalds@linux-foundation.org
In-Reply-To: <20180629172216-mutt-send-email-mst@kernel.org>

On Friday, June 29, 2018 10:46 PM, Michael S. Tsirkin wrote:
> To: David Hildenbrand <david@redhat.com>
> Cc: Wang, Wei W <wei.w.wang@intel.com>; virtio-dev@lists.oasis-open.org;
> linux-kernel@vger.kernel.org; virtualization@lists.linux-foundation.org;
> kvm@vger.kernel.org; linux-mm@kvack.org; mhocko@kernel.org;
> akpm@linux-foundation.org; torvalds@linux-foundation.org;
> pbonzini@redhat.com; liliang.opensource@gmail.com;
> yang.zhang.wz@gmail.com; quan.xu0@gmail.com; nilal@redhat.com;
> riel@redhat.com; peterx@redhat.com
> Subject: Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
> 
> On Wed, Jun 27, 2018 at 01:06:32PM +0200, David Hildenbrand wrote:
> > On 25.06.2018 14:05, Wei Wang wrote:
> > > This patch series is separated from the previous "Virtio-balloon
> > > Enhancement" series. The new feature,
> > > VIRTIO_BALLOON_F_FREE_PAGE_HINT, implemented by this series
> enables
> > > the virtio-balloon driver to report hints of guest free pages to the
> > > host. It can be used to accelerate live migration of VMs. Here is an
> introduction of this usage:
> > >
> > > Live migration needs to transfer the VM's memory from the source
> > > machine to the destination round by round. For the 1st round, all
> > > the VM's memory is transferred. From the 2nd round, only the pieces
> > > of memory that were written by the guest (after the 1st round) are
> > > transferred. One method that is popularly used by the hypervisor to
> > > track which part of memory is written is to write-protect all the guest
> memory.
> > >
> > > This feature enables the optimization by skipping the transfer of
> > > guest free pages during VM live migration. It is not concerned that
> > > the memory pages are used after they are given to the hypervisor as
> > > a hint of the free pages, because they will be tracked by the
> > > hypervisor and transferred in the subsequent round if they are used and
> written.
> > >
> > > * Tests
> > > - Test Environment
> > >     Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> > >     Guest: 8G RAM, 4 vCPU
> > >     Migration setup: migrate_set_speed 100G, migrate_set_downtime 2
> > > second
> > >
> > > - Test Results
> > >     - Idle Guest Live Migration Time (results are averaged over 10 runs):
> > >         - Optimization v.s. Legacy = 284ms vs 1757ms --> ~84% reduction
> > >     - Guest with Linux Compilation Workload (make bzImage -j4):
> > >         - Live Migration Time (average)
> > >           Optimization v.s. Legacy = 1402ms v.s. 2528ms --> ~44% reduction
> > >         - Linux Compilation Time
> > >           Optimization v.s. Legacy = 5min6s v.s. 5min12s
> > >           --> no obvious difference
> > >
> >
> > Being in version 34 already, this whole thing still looks and feels
> > like a big hack to me. It might just be me, but especially if I read
> > about assumptions like "QEMU will not hotplug memory during
> > migration". This does not feel like a clean solution.
> >
> > I am still not sure if we really need this interface, especially as
> > real free page hinting might be on its way.
> >
> > a) we perform free page hinting by setting all free pages
> > (arch_free_page()) to zero. Migration will detect zero pages and
> > minimize #pages to migrate. I don't think this is a good idea but
> > Michel suggested to do a performance evaluation and Nitesh is looking
> > into that right now.
> 
> Yes this test is needed I think. If we can get most of the benefit without PV
> interfaces, that's nice.
> 
> Wei, I think you need this as part of your performance comparison
> too: set page poisoning value to 0 and enable KSM, compare with your
> patches.

Do you mean live migration with zero pages?
I can first share the amount of memory transferred during live migration I saw,
Legacy is around 380MB,
Optimization is around 340MB.
This proves that most pages have already been 0 and skipped during the legacy live migration. But the legacy time is still much larger because zero page checking is costly. 
(It's late night here, I can get you that with my server probably tomorrow)

Best,
Wei

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: David Hildenbrand @ 2018-06-29 15:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
	liliang.opensource, linux-kernel, virtualization, linux-mm,
	pbonzini, akpm, mhocko, torvalds
In-Reply-To: <20180629172216-mutt-send-email-mst@kernel.org>

>> And looking at all the discussions and problems that already happened
>> during the development of this series, I think we should rather look
>> into how clean free page hinting might solve the same problem.
> 
> I'm not sure I follow the logic. We found that neat tricks
> especially re-using the max order free page for reporting.

Let me rephrase: history of this series showed that this is some really
complicated stuff. I am asking if this complexity is actually necessary.

No question that we had a very valuable outcome so far (that especially
is also relevant for other projects like Nitesh's proposal - talking
about virtio requests and locking).

> 
>> If it can't be solved using free page hinting, fair enough.
> 
> I suspect Nitesh will need to find a way not to have mm code
> call out to random drivers or subsystems before that code
> is acceptable.
> 
> 


-- 

Thanks,

David / dhildenb

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Michael S. Tsirkin @ 2018-06-29 14:45 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
	liliang.opensource, linux-kernel, virtualization, linux-mm,
	pbonzini, akpm, mhocko, torvalds
In-Reply-To: <c4dd0a13-91fb-c0f5-b41f-54421fdacca9@redhat.com>

On Wed, Jun 27, 2018 at 01:06:32PM +0200, David Hildenbrand wrote:
> On 25.06.2018 14:05, Wei Wang wrote:
> > This patch series is separated from the previous "Virtio-balloon
> > Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,  
> > implemented by this series enables the virtio-balloon driver to report
> > hints of guest free pages to the host. It can be used to accelerate live
> > migration of VMs. Here is an introduction of this usage:
> > 
> > Live migration needs to transfer the VM's memory from the source machine
> > to the destination round by round. For the 1st round, all the VM's memory
> > is transferred. From the 2nd round, only the pieces of memory that were
> > written by the guest (after the 1st round) are transferred. One method
> > that is popularly used by the hypervisor to track which part of memory is
> > written is to write-protect all the guest memory.
> > 
> > This feature enables the optimization by skipping the transfer of guest
> > free pages during VM live migration. It is not concerned that the memory
> > pages are used after they are given to the hypervisor as a hint of the
> > free pages, because they will be tracked by the hypervisor and transferred
> > in the subsequent round if they are used and written.
> > 
> > * Tests
> > - Test Environment
> >     Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> >     Guest: 8G RAM, 4 vCPU
> >     Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
> > 
> > - Test Results
> >     - Idle Guest Live Migration Time (results are averaged over 10 runs):
> >         - Optimization v.s. Legacy = 284ms vs 1757ms --> ~84% reduction
> >     - Guest with Linux Compilation Workload (make bzImage -j4):
> >         - Live Migration Time (average)
> >           Optimization v.s. Legacy = 1402ms v.s. 2528ms --> ~44% reduction
> >         - Linux Compilation Time
> >           Optimization v.s. Legacy = 5min6s v.s. 5min12s
> >           --> no obvious difference
> > 
> 
> Being in version 34 already, this whole thing still looks and feels like
> a big hack to me. It might just be me, but especially if I read about
> assumptions like "QEMU will not hotplug memory during migration". This
> does not feel like a clean solution.
> 
> I am still not sure if we really need this interface, especially as real
> free page hinting might be on its way.
> 
> a) we perform free page hinting by setting all free pages
> (arch_free_page()) to zero. Migration will detect zero pages and
> minimize #pages to migrate. I don't think this is a good idea but Michel
> suggested to do a performance evaluation and Nitesh is looking into that
> right now.

Yes this test is needed I think. If we can get most of the benefit
without PV interfaces, that's nice.

Wei, I think you need this as part of your performance comparison
too: set page poisoning value to 0 and enable KSM, compare with
your patches.


> b) we perform free page hinting using something that Nitesh proposed. We
> get in QEMU blocks of free pages that we can MADV_FREE. In addition we
> could e.g. clear the dirty bit of these pages in the dirty bitmap, to
> hinder them from getting migrated. Right now the hinting mechanism is
> synchronous (called from arch_free_page()) but we might be able to
> convert it into something asynchronous.
> 
> So we might be able to completely get rid of this interface.

The way I see it, hinting during alloc/free will always add
overhead which might be unacceptable for some people.  So even with
Nitesh's patches there's value in enabling / disabling hinting
dynamically. And Wei's patches would then be useful to set
the stage where we know the initial page state.


> And looking at all the discussions and problems that already happened
> during the development of this series, I think we should rather look
> into how clean free page hinting might solve the same problem.

I'm not sure I follow the logic. We found that neat tricks
especially re-using the max order free page for reporting.

> If it can't be solved using free page hinting, fair enough.

I suspect Nitesh will need to find a way not to have mm code
call out to random drivers or subsystems before that code
is acceptable.


-- 
MST

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: David Hildenbrand @ 2018-06-29 11:53 UTC (permalink / raw)
  To: Wei Wang, virtio-dev, linux-kernel, virtualization, kvm, linux-mm,
	mst, mhocko, akpm
  Cc: yang.zhang.wz, Andrea Arcangeli, riel, quan.xu0,
	liliang.opensource, Luiz Capitulino, pbonzini, nilal, torvalds
In-Reply-To: <5B36189E.5050204@intel.com>

On 29.06.2018 13:31, Wei Wang wrote:
> On 06/29/2018 03:46 PM, David Hildenbrand wrote:
>>>
>>> I'm afraid it can't. For example, when we have a guest booted, without
>>> too many memory activities. Assume the guest has 8GB free memory. The
>>> arch_free_page there won't be able to capture the 8GB free pages since
>>> there is no free() called. This results in no free pages reported to host.
>>
>> So, it takes some time from when the guest boots up until the balloon
>> device was initialized and therefore page hinting can start. For that
>> period, you won't get any arch_free_page()/page hinting callbacks, correct.
>>
>> However in the hypervisor, you can theoretically track which pages the
>> guest actually touched ("dirty"), so you already know "which pages were
>> never touched while booting up until virtio-balloon was brought to
>> life". These, you can directly exclude from migration. No interface
>> required.
>>
>> The remaining problem is pages that were touched ("allocated") by the
>> guest during bootup but freed again, before virtio-balloon came up. One
>> would have to measure how many pages these usually are, I would say it
>> would not be that many (because recently freed pages are likely to be
>> used again next for allocation). However, there are some pages not being
>> reported.
>>
>> During the lifetime of the guest, this should not be a problem,
>> eventually one of these pages would get allocated/freed again, so the
>> problem "solves itself over time". You are looking into the special case
>> of migrating the VM just after it has been started. But we have the
>> exact same problem also for ordinary free page hinting, so we should
>> rather solve that problem. It is not migration specific.
>>
>> If we are looking for an alternative to "problem solves itself",
>> something like "if virtio-balloon comes up, it will report all free
>> pages step by step using free page hinting, just like we would have from
>> "arch_free_pages()"". This would be the same interface we are using for
>> free page hinting - and it could even be made configurable in the guest.
>>
>> The current approach we are discussing internally for details about
>> Nitesh's work ("how the magic inside arch_fee_pages() will work
>> efficiently) would allow this as far as I can see just fine.
>>
>> There would be a tiny little window between virtio-balloon comes up and
>> it has reported all free pages step by step, but that can be considered
>> a very special corner case that I would argue is not worth it to be
>> optimized.
>>
>> If I am missing something important here, sorry in advance :)
>>
> 
> Probably I didn't explain that well. Please see my re-try:
> 
> That work is to monitor page allocation and free activities via 
> arch_alloc_pages and arch_free_pages. It has per-CPU lists to record the 
> pages that are freed to the mm free list, and the per-CPU lists dump the 
> recorded pages to a global list when any of them is full.
> So its own per-CPU list will only be able to get free pages when there 
> is an mm free() function gets called. If we have 8GB free memory on the 
> mm free list, but no application uses them and thus no mm free() calls 
> are made. In that case, the arch_free_pages isn't called, and no free 
> pages added to the per-CPU list, but we have 8G free memory right on the 
> mm free list.
> How would you guarantee the per-CPU lists have got all the free pages 
> that the mm free lists have?

As I said, if we have some mechanism that will scan the free pages (not
arch_free_page() once and report hints using the same mechanism step by
step (not your bulk interface)), this problem is solved. And as I said,
this is not a migration specific problem, we have the same problem in
the current page hinting RFC. These pages have to be reported.

> 
> - I'm also worried about the overhead of maintaining so many per-CPU 
> lists and the global list. For example, if we have applications 
> frequently allocate and free 4KB pages, and each per-CPU list needs to 
> implement the buddy algorithm to sort and merge neighbor pages. Today a 
> server can have more than 100 CPUs, then there will be more than 100 
> per-CPU lists which need to sync to a global list under a lock, I'm not 
> sure if this would scale well.

The overhead in the current RFC is definitely too high. But I consider
this a problem to be solved before page hinting would go upstream. And
we are discussing right now "if we have a reasonable page hinting
implementation, why would we need your interface in addition".

> 
> - This seems to be a burden imposed on the core mm memory 
> allocation/free path. The whole overhead needs to be carried during the 
> whole system life cycle. What we actually expected is to just make one 
> call to get the free page hints only when live migration happens.

You're focusing too much on the actual implementation of the page
hinting RFC right now. Assume for now that we would have
- efficient page hinting without degrading other CPUs and little
  overhead
- a mechanism that solves reporting free pages once after we started up
  virtio-balloon and actual free page hinting starts

Why would your suggestion still be applicable?

Your point for now is "I might not want to have page hinting enabled due
to the overhead, but still a live migration speedup". If that overhead
actually exists (we'll have to see) or there might be another reason to
disable page hinting, then we have to decide if that specific setup is
worth it merging your changes.

I am not (and don't want to be) in the position to make any decisions
here :) I just want to understand if two interfaces for free pages
actually make sense.

Maybe the argument "I don't want free page hinting" is good enough.

> 
> Best,
> Wei


-- 

Thanks,

David / dhildenb

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Wei Wang @ 2018-06-29 11:31 UTC (permalink / raw)
  To: David Hildenbrand, virtio-dev, linux-kernel, virtualization, kvm,
	linux-mm, mst, mhocko, akpm
  Cc: yang.zhang.wz, Andrea Arcangeli, riel, quan.xu0,
	liliang.opensource, Luiz Capitulino, pbonzini, nilal, torvalds
In-Reply-To: <4840cbb7-dd3f-7540-6a7c-13427de2f0d1@redhat.com>

On 06/29/2018 03:46 PM, David Hildenbrand wrote:
>>
>> I'm afraid it can't. For example, when we have a guest booted, without
>> too many memory activities. Assume the guest has 8GB free memory. The
>> arch_free_page there won't be able to capture the 8GB free pages since
>> there is no free() called. This results in no free pages reported to host.
>
> So, it takes some time from when the guest boots up until the balloon
> device was initialized and therefore page hinting can start. For that
> period, you won't get any arch_free_page()/page hinting callbacks, correct.
>
> However in the hypervisor, you can theoretically track which pages the
> guest actually touched ("dirty"), so you already know "which pages were
> never touched while booting up until virtio-balloon was brought to
> life". These, you can directly exclude from migration. No interface
> required.
>
> The remaining problem is pages that were touched ("allocated") by the
> guest during bootup but freed again, before virtio-balloon came up. One
> would have to measure how many pages these usually are, I would say it
> would not be that many (because recently freed pages are likely to be
> used again next for allocation). However, there are some pages not being
> reported.
>
> During the lifetime of the guest, this should not be a problem,
> eventually one of these pages would get allocated/freed again, so the
> problem "solves itself over time". You are looking into the special case
> of migrating the VM just after it has been started. But we have the
> exact same problem also for ordinary free page hinting, so we should
> rather solve that problem. It is not migration specific.
>
> If we are looking for an alternative to "problem solves itself",
> something like "if virtio-balloon comes up, it will report all free
> pages step by step using free page hinting, just like we would have from
> "arch_free_pages()"". This would be the same interface we are using for
> free page hinting - and it could even be made configurable in the guest.
>
> The current approach we are discussing internally for details about
> Nitesh's work ("how the magic inside arch_fee_pages() will work
> efficiently) would allow this as far as I can see just fine.
>
> There would be a tiny little window between virtio-balloon comes up and
> it has reported all free pages step by step, but that can be considered
> a very special corner case that I would argue is not worth it to be
> optimized.
>
> If I am missing something important here, sorry in advance :)
>

Probably I didn't explain that well. Please see my re-try:

That work is to monitor page allocation and free activities via 
arch_alloc_pages and arch_free_pages. It has per-CPU lists to record the 
pages that are freed to the mm free list, and the per-CPU lists dump the 
recorded pages to a global list when any of them is full.
So its own per-CPU list will only be able to get free pages when there 
is an mm free() function gets called. If we have 8GB free memory on the 
mm free list, but no application uses them and thus no mm free() calls 
are made. In that case, the arch_free_pages isn't called, and no free 
pages added to the per-CPU list, but we have 8G free memory right on the 
mm free list.
How would you guarantee the per-CPU lists have got all the free pages 
that the mm free lists have?

- I'm also worried about the overhead of maintaining so many per-CPU 
lists and the global list. For example, if we have applications 
frequently allocate and free 4KB pages, and each per-CPU list needs to 
implement the buddy algorithm to sort and merge neighbor pages. Today a 
server can have more than 100 CPUs, then there will be more than 100 
per-CPU lists which need to sync to a global list under a lock, I'm not 
sure if this would scale well.

- This seems to be a burden imposed on the core mm memory 
allocation/free path. The whole overhead needs to be carried during the 
whole system life cycle. What we actually expected is to just make one 
call to get the free page hints only when live migration happens.

Best,
Wei

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-06-29  9:30 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin
  Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <1530259790-2414-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>



On 2018年06月29日 16:09, Toshiaki Makita wrote:
> Under heavy load vhost busypoll may run without suppressing
> notification. For example tx zerocopy callback can push tx work while
> handle_tx() is running, then busyloop exits due to vhost_has_work()
> condition and enables notification but immediately reenters handle_tx()
> because the pushed work was tx. In this case handle_tx() tries to
> disable notification again, but when using event_idx it by design
> cannot. Then busyloop will run without suppressing notification.
> Another example is the case where handle_tx() tries to enable
> notification but avail idx is advanced so disables it again. This case
> also lead to the same situation with event_idx.

Good catch.

>
> The problem is that once we enter this situation busyloop does not work
> under heavy load for considerable amount of time, because notification
> is likely to happen during busyloop and handle_tx() immediately enables
> notification after notification happens. Specifically busyloop detects
> notification by vhost_has_work() and then handle_tx() calls
> vhost_enable_notify(). Because the detected work was the tx work, it
> enters handle_tx(), and enters busyloop without suppression again.
> This is likely to be repeated, so with event_idx we are almost not able
> to suppress notification in this case.
>
> To fix this, poll the work instead of enabling notification when
> busypoll is interrupted by something. IMHO signal_pending() and
> vhost_has_work() are kind of interruptions rather than signals to
> completely cancel the busypoll, so let's run busypoll after the
> necessary work is done. In order to avoid too long busyloop due to
> interruption, save the endtime in vq field and use it when reentering
> the work function.

I think we don't care long busyloop unless e.g tx can starve rx?

>
> I observed this problem makes tx performance poor but does not rx. I
> guess it is because rx notification from socket is not that heavy so
> did not impact the performance even when we failed to mask the
> notification.

I think the reason is:

For tx, we may only process used ring under handle_tx(). Busy polling 
code can not recognize this case.
For rx, we call peek_head_len() after exiting busy loop, so if the work 
is for rx, it can be processed in handle_rx() immediately.

> Anyway for consistency I fixed rx routine as well as tx.
>
> Performance numbers:
>
> - Bulk transfer from guest to external physical server.
>      [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]

Just to confirm in this case since zerocopy is enabled, we are in fact 
use the generic XDP datapath?

> - Set 10us busypoll.
> - Guest disables checksum and TSO because of host XDP.
> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>    (EPT_MISCONFIG event).
>
>                              Before              After
>                            Mbps  kicks/s      Mbps  kicks/s
> UDP_STREAM 1472byte              247758                 27
>                  Send   3645.37            6958.10
>                  Recv   3588.56            6958.10
>                1byte                9865                 37
>                  Send      4.34               5.43
>                  Recv      4.17               5.26
> TCP_STREAM             8801.03    45794   9592.77     2884

Impressive numbers.

>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
>   drivers/vhost/net.c   | 94 +++++++++++++++++++++++++++++++++++----------------
>   drivers/vhost/vhost.c |  1 +
>   drivers/vhost/vhost.h |  1 +
>   3 files changed, 66 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index eeaf6739215f..0e85f628b965 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -391,13 +391,14 @@ static inline unsigned long busy_clock(void)
>   	return local_clock() >> 10;
>   }
>   
> -static bool vhost_can_busy_poll(struct vhost_dev *dev,
> -				unsigned long endtime)
> +static bool vhost_can_busy_poll(unsigned long endtime)
>   {
> -	return likely(!need_resched()) &&
> -	       likely(!time_after(busy_clock(), endtime)) &&
> -	       likely(!signal_pending(current)) &&
> -	       !vhost_has_work(dev);
> +	return likely(!need_resched() && !time_after(busy_clock(), endtime));
> +}
> +
> +static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
> +{
> +	return unlikely(signal_pending(current)) || vhost_has_work(dev);
>   }
>   
>   static void vhost_net_disable_vq(struct vhost_net *n,
> @@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>   
>   	if (r == vq->num && vq->busyloop_timeout) {
>   		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> +		if (vq->busyloop_endtime) {
> +			endtime = vq->busyloop_endtime;
> +			vq->busyloop_endtime = 0;

Looks like endtime may be before current time. So we skip busy loop in 
this case.

> +		} else {
> +			endtime = busy_clock() + vq->busyloop_timeout;
> +		}
> +		while (vhost_can_busy_poll(endtime)) {
> +			if (vhost_busy_poll_interrupted(vq->dev)) {
> +				vq->busyloop_endtime = endtime;
> +				break;
> +			}
> +			if (!vhost_vq_avail_empty(vq->dev, vq))
> +				break;
>   			cpu_relax();
> +		}
>   		preempt_enable();
>   		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>   				      out_num, in_num, NULL, NULL);
> @@ -509,12 +521,16 @@ static void handle_tx(struct vhost_net *net)
>   			break;
>   		/* Nothing new?  Wait for eventfd to tell us they refilled. */
>   		if (head == vq->num) {
> -			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> +			if (unlikely(vq->busyloop_endtime)) {
> +				/* Busy poll is interrupted. */
> +				vhost_poll_queue(&vq->poll);
> +			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>   				vhost_disable_notify(&net->dev, vq);
>   				continue;
>   			}
>   			break;
>   		}
> +		vq->busyloop_endtime = 0;
>   		if (in) {
>   			vq_err(vq, "Unexpected descriptor format for TX: "
>   			       "out %d, int %d\n", out, in);
> @@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>   
>   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>   {
> -	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> -	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> -	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> +	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> +	struct vhost_virtqueue *rvq = &rnvq->vq;
> +	struct vhost_virtqueue *tvq = &tnvq->vq;

NIt: I admit the variable name is confusing. It would be better to do 
the renaming in a separate patch to ease review.

>   	unsigned long uninitialized_var(endtime);
> -	int len = peek_head_len(rvq, sk);
> +	int len = peek_head_len(rnvq, sk);
>   
> -	if (!len && vq->busyloop_timeout) {
> +	if (!len && tvq->busyloop_timeout) {
>   		/* Flush batched heads first */
> -		vhost_rx_signal_used(rvq);
> +		vhost_rx_signal_used(rnvq);
>   		/* Both tx vq and rx socket were polled here */
> -		mutex_lock_nested(&vq->mutex, 1);
> -		vhost_disable_notify(&net->dev, vq);
> +		mutex_lock_nested(&tvq->mutex, 1);
> +		vhost_disable_notify(&net->dev, tvq);
>   
>   		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> +		if (rvq->busyloop_endtime) {
> +			endtime = rvq->busyloop_endtime;
> +			rvq->busyloop_endtime = 0;
> +		} else {
> +			endtime = busy_clock() + tvq->busyloop_timeout;
> +		}
>   
> -		while (vhost_can_busy_poll(&net->dev, endtime) &&
> -		       !sk_has_rx_data(sk) &&
> -		       vhost_vq_avail_empty(&net->dev, vq))
> +		while (vhost_can_busy_poll(endtime)) {
> +			if (vhost_busy_poll_interrupted(&net->dev)) {
> +				rvq->busyloop_endtime = endtime;
> +				break;
> +			}
> +			if (sk_has_rx_data(sk) ||
> +			    !vhost_vq_avail_empty(&net->dev, tvq))
> +				break;
>   			cpu_relax();

Actually, I plan to poll guest rx refilling as well here to avoid rx 
kicks. Want to draft another patch to do it as well? It's as simple as 
add a vhost_vq_avail_empty for rvq above.

> +		}
>   
>   		preempt_enable();
>   
> -		if (!vhost_vq_avail_empty(&net->dev, vq))
> -			vhost_poll_queue(&vq->poll);
> -		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> -			vhost_disable_notify(&net->dev, vq);
> -			vhost_poll_queue(&vq->poll);
> +		if (!vhost_vq_avail_empty(&net->dev, tvq)) {
> +			vhost_poll_queue(&tvq->poll);
> +		} else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
> +			vhost_disable_notify(&net->dev, tvq);
> +			vhost_poll_queue(&tvq->poll);
>   		}
>   
> -		mutex_unlock(&vq->mutex);
> +		mutex_unlock(&tvq->mutex);
>   
> -		len = peek_head_len(rvq, sk);
> +		len = peek_head_len(rnvq, sk);
>   	}
>   
>   	return len;
> @@ -804,6 +832,7 @@ static void handle_rx(struct vhost_net *net)
>   	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
>   
>   	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
> +		vq->busyloop_endtime = 0;
>   		sock_len += sock_hlen;
>   		vhost_len = sock_len + vhost_hlen;
>   		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
> @@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
>   			goto out;
>   		}
>   	}
> -	vhost_net_enable_vq(net, vq);
> +	if (unlikely(vq->busyloop_endtime)) {
> +		/* Busy poll is interrupted. */
> +		vhost_poll_queue(&vq->poll);
> +	} else {
> +		vhost_net_enable_vq(net, vq);
> +	}

This is to reduce the rx wake ups. Better with another patch.

So I suggest to split the patches into:

1 better naming of variable of vhost_net_rx_peek_head_len().
2 avoid tx kicks (most of what this patch did)
3 avoid rx wakeups (exactly the above 6 lines did)
4 avoid rx kicks

Btw, tonghao is doing some refactoring of busy polling as well. Depends 
on the order of being merged, one of you may need rebasing.

Thanks

>   out:
>   	vhost_rx_signal_used(nvq);
>   	mutex_unlock(&vq->mutex);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 9beefa6ed1ce..fe83578fe336 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -323,6 +323,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>   	vhost_reset_is_le(vq);
>   	vhost_disable_cross_endian(vq);
>   	vq->busyloop_timeout = 0;
> +	vq->busyloop_endtime = 0;
>   	vq->umem = NULL;
>   	vq->iotlb = NULL;
>   	__vhost_vq_meta_reset(vq);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 6c844b90a168..7e9cf80ccae9 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -144,6 +144,7 @@ struct vhost_virtqueue {
>   	bool user_be;
>   #endif
>   	u32 busyloop_timeout;
> +	unsigned long busyloop_endtime;
>   };
>   
>   struct vhost_msg_node {

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-06-29  8:09 UTC (permalink / raw)
  To: Michael S. Tsirkin, Jason Wang; +Cc: netdev, virtualization, kvm

Under heavy load vhost busypoll may run without suppressing
notification. For example tx zerocopy callback can push tx work while
handle_tx() is running, then busyloop exits due to vhost_has_work()
condition and enables notification but immediately reenters handle_tx()
because the pushed work was tx. In this case handle_tx() tries to
disable notification again, but when using event_idx it by design
cannot. Then busyloop will run without suppressing notification.
Another example is the case where handle_tx() tries to enable
notification but avail idx is advanced so disables it again. This case
also lead to the same situation with event_idx.

The problem is that once we enter this situation busyloop does not work
under heavy load for considerable amount of time, because notification
is likely to happen during busyloop and handle_tx() immediately enables
notification after notification happens. Specifically busyloop detects
notification by vhost_has_work() and then handle_tx() calls
vhost_enable_notify(). Because the detected work was the tx work, it
enters handle_tx(), and enters busyloop without suppression again.
This is likely to be repeated, so with event_idx we are almost not able
to suppress notification in this case.

To fix this, poll the work instead of enabling notification when
busypoll is interrupted by something. IMHO signal_pending() and
vhost_has_work() are kind of interruptions rather than signals to
completely cancel the busypoll, so let's run busypoll after the
necessary work is done. In order to avoid too long busyloop due to
interruption, save the endtime in vq field and use it when reentering
the work function.

I observed this problem makes tx performance poor but does not rx. I
guess it is because rx notification from socket is not that heavy so
did not impact the performance even when we failed to mask the
notification. Anyway for consistency I fixed rx routine as well as tx.

Performance numbers:

- Bulk transfer from guest to external physical server.
    [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
- Set 10us busypoll.
- Guest disables checksum and TSO because of host XDP.
- Measured single flow Mbps by netperf, and kicks by perf kvm stat
  (EPT_MISCONFIG event).

                            Before              After
                          Mbps  kicks/s      Mbps  kicks/s
UDP_STREAM 1472byte              247758                 27
                Send   3645.37            6958.10
                Recv   3588.56            6958.10
              1byte                9865                 37
                Send      4.34               5.43
                Recv      4.17               5.26
TCP_STREAM             8801.03    45794   9592.77     2884

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/vhost/net.c   | 94 +++++++++++++++++++++++++++++++++++----------------
 drivers/vhost/vhost.c |  1 +
 drivers/vhost/vhost.h |  1 +
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index eeaf6739215f..0e85f628b965 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -391,13 +391,14 @@ static inline unsigned long busy_clock(void)
 	return local_clock() >> 10;
 }
 
-static bool vhost_can_busy_poll(struct vhost_dev *dev,
-				unsigned long endtime)
+static bool vhost_can_busy_poll(unsigned long endtime)
 {
-	return likely(!need_resched()) &&
-	       likely(!time_after(busy_clock(), endtime)) &&
-	       likely(!signal_pending(current)) &&
-	       !vhost_has_work(dev);
+	return likely(!need_resched() && !time_after(busy_clock(), endtime));
+}
+
+static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
+{
+	return unlikely(signal_pending(current)) || vhost_has_work(dev);
 }
 
 static void vhost_net_disable_vq(struct vhost_net *n,
@@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 
 	if (r == vq->num && vq->busyloop_timeout) {
 		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
-		while (vhost_can_busy_poll(vq->dev, endtime) &&
-		       vhost_vq_avail_empty(vq->dev, vq))
+		if (vq->busyloop_endtime) {
+			endtime = vq->busyloop_endtime;
+			vq->busyloop_endtime = 0;
+		} else {
+			endtime = busy_clock() + vq->busyloop_timeout;
+		}
+		while (vhost_can_busy_poll(endtime)) {
+			if (vhost_busy_poll_interrupted(vq->dev)) {
+				vq->busyloop_endtime = endtime;
+				break;
+			}
+			if (!vhost_vq_avail_empty(vq->dev, vq))
+				break;
 			cpu_relax();
+		}
 		preempt_enable();
 		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				      out_num, in_num, NULL, NULL);
@@ -509,12 +521,16 @@ static void handle_tx(struct vhost_net *net)
 			break;
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+			if (unlikely(vq->busyloop_endtime)) {
+				/* Busy poll is interrupted. */
+				vhost_poll_queue(&vq->poll);
+			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
 		}
+		vq->busyloop_endtime = 0;
 		if (in) {
 			vq_err(vq, "Unexpected descriptor format for TX: "
 			       "out %d, int %d\n", out, in);
@@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
-	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
-	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
-	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *rvq = &rnvq->vq;
+	struct vhost_virtqueue *tvq = &tnvq->vq;
 	unsigned long uninitialized_var(endtime);
-	int len = peek_head_len(rvq, sk);
+	int len = peek_head_len(rnvq, sk);
 
-	if (!len && vq->busyloop_timeout) {
+	if (!len && tvq->busyloop_timeout) {
 		/* Flush batched heads first */
-		vhost_rx_signal_used(rvq);
+		vhost_rx_signal_used(rnvq);
 		/* Both tx vq and rx socket were polled here */
-		mutex_lock_nested(&vq->mutex, 1);
-		vhost_disable_notify(&net->dev, vq);
+		mutex_lock_nested(&tvq->mutex, 1);
+		vhost_disable_notify(&net->dev, tvq);
 
 		preempt_disable();
-		endtime = busy_clock() + vq->busyloop_timeout;
+		if (rvq->busyloop_endtime) {
+			endtime = rvq->busyloop_endtime;
+			rvq->busyloop_endtime = 0;
+		} else {
+			endtime = busy_clock() + tvq->busyloop_timeout;
+		}
 
-		while (vhost_can_busy_poll(&net->dev, endtime) &&
-		       !sk_has_rx_data(sk) &&
-		       vhost_vq_avail_empty(&net->dev, vq))
+		while (vhost_can_busy_poll(endtime)) {
+			if (vhost_busy_poll_interrupted(&net->dev)) {
+				rvq->busyloop_endtime = endtime;
+				break;
+			}
+			if (sk_has_rx_data(sk) ||
+			    !vhost_vq_avail_empty(&net->dev, tvq))
+				break;
 			cpu_relax();
+		}
 
 		preempt_enable();
 
-		if (!vhost_vq_avail_empty(&net->dev, vq))
-			vhost_poll_queue(&vq->poll);
-		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-			vhost_disable_notify(&net->dev, vq);
-			vhost_poll_queue(&vq->poll);
+		if (!vhost_vq_avail_empty(&net->dev, tvq)) {
+			vhost_poll_queue(&tvq->poll);
+		} else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
+			vhost_disable_notify(&net->dev, tvq);
+			vhost_poll_queue(&tvq->poll);
 		}
 
-		mutex_unlock(&vq->mutex);
+		mutex_unlock(&tvq->mutex);
 
-		len = peek_head_len(rvq, sk);
+		len = peek_head_len(rnvq, sk);
 	}
 
 	return len;
@@ -804,6 +832,7 @@ static void handle_rx(struct vhost_net *net)
 	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+		vq->busyloop_endtime = 0;
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
 		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
 			goto out;
 		}
 	}
-	vhost_net_enable_vq(net, vq);
+	if (unlikely(vq->busyloop_endtime)) {
+		/* Busy poll is interrupted. */
+		vhost_poll_queue(&vq->poll);
+	} else {
+		vhost_net_enable_vq(net, vq);
+	}
 out:
 	vhost_rx_signal_used(nvq);
 	mutex_unlock(&vq->mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9beefa6ed1ce..fe83578fe336 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -323,6 +323,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vhost_reset_is_le(vq);
 	vhost_disable_cross_endian(vq);
 	vq->busyloop_timeout = 0;
+	vq->busyloop_endtime = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
 	__vhost_vq_meta_reset(vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b90a168..7e9cf80ccae9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -144,6 +144,7 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	unsigned long busyloop_endtime;
 };
 
 struct vhost_msg_node {
-- 
2.14.2

^ permalink raw reply related

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: David Hildenbrand @ 2018-06-29  7:46 UTC (permalink / raw)
  To: Wei Wang, virtio-dev, linux-kernel, virtualization, kvm, linux-mm,
	mst, mhocko, akpm
  Cc: yang.zhang.wz, Andrea Arcangeli, riel, quan.xu0,
	liliang.opensource, Luiz Capitulino, pbonzini, nilal, torvalds
In-Reply-To: <5B35ACD5.4090800@intel.com>

On 29.06.2018 05:51, Wei Wang wrote:
> On 06/27/2018 07:06 PM, David Hildenbrand wrote:
>> On 25.06.2018 14:05, Wei Wang wrote:
>>> This patch series is separated from the previous "Virtio-balloon
>>> Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>>> implemented by this series enables the virtio-balloon driver to report
>>> hints of guest free pages to the host. It can be used to accelerate live
>>> migration of VMs. Here is an introduction of this usage:
>>>
>>> Live migration needs to transfer the VM's memory from the source machine
>>> to the destination round by round. For the 1st round, all the VM's memory
>>> is transferred. From the 2nd round, only the pieces of memory that were
>>> written by the guest (after the 1st round) are transferred. One method
>>> that is popularly used by the hypervisor to track which part of memory is
>>> written is to write-protect all the guest memory.
>>>
>>> This feature enables the optimization by skipping the transfer of guest
>>> free pages during VM live migration. It is not concerned that the memory
>>> pages are used after they are given to the hypervisor as a hint of the
>>> free pages, because they will be tracked by the hypervisor and transferred
>>> in the subsequent round if they are used and written.
>>>
>>> * Tests
>>> - Test Environment
>>>      Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
>>>      Guest: 8G RAM, 4 vCPU
>>>      Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
>>>
>>> - Test Results
>>>      - Idle Guest Live Migration Time (results are averaged over 10 runs):
>>>          - Optimization v.s. Legacy = 284ms vs 1757ms --> ~84% reduction
>>>      - Guest with Linux Compilation Workload (make bzImage -j4):
>>>          - Live Migration Time (average)
>>>            Optimization v.s. Legacy = 1402ms v.s. 2528ms --> ~44% reduction
>>>          - Linux Compilation Time
>>>            Optimization v.s. Legacy = 5min6s v.s. 5min12s
>>>            --> no obvious difference
>>>
>> Being in version 34 already, this whole thing still looks and feels like
>> a big hack to me. It might just be me, but especially if I read about
>> assumptions like "QEMU will not hotplug memory during migration". This
>> does not feel like a clean solution.
>>
>> I am still not sure if we really need this interface, especially as real
>> free page hinting might be on its way.
>>
>> a) we perform free page hinting by setting all free pages
>> (arch_free_page()) to zero. Migration will detect zero pages and
>> minimize #pages to migrate. I don't think this is a good idea but Michel
>> suggested to do a performance evaluation and Nitesh is looking into that
>> right now.
> 
> The hypervisor doesn't get the zero pages for free. It pays lots of CPU 
> utilization and memory bandwidth (there are some guys complaining abut 
> this on the QEMU mailinglist)
> In the above results, the legacy part already has the zero page feature 
> in use.

Indeed, I don't consider this attempt not very practical in general,
especially as it would rely on KSM right now, which is frowned upon by
many people.

> 
>>
>> b) we perform free page hinting using something that Nitesh proposed. We
>> get in QEMU blocks of free pages that we can MADV_FREE. In addition we
>> could e.g. clear the dirty bit of these pages in the dirty bitmap, to
>> hinder them from getting migrated. Right now the hinting mechanism is
>> synchronous (called from arch_free_page()) but we might be able to
>> convert it into something asynchronous.
>>
>> So we might be able to completely get rid of this interface. And looking
>> at all the discussions and problems that already happened during the
>> development of this series, I think we should rather look into how clean
>> free page hinting might solve the same problem.
>>
>> If it can't be solved using free page hinting, fair enough.
>>
> 
> I'm afraid it can't. For example, when we have a guest booted, without 
> too many memory activities. Assume the guest has 8GB free memory. The 
> arch_free_page there won't be able to capture the 8GB free pages since 
> there is no free() called. This results in no free pages reported to host.


So, it takes some time from when the guest boots up until the balloon
device was initialized and therefore page hinting can start. For that
period, you won't get any arch_free_page()/page hinting callbacks, correct.

However in the hypervisor, you can theoretically track which pages the
guest actually touched ("dirty"), so you already know "which pages were
never touched while booting up until virtio-balloon was brought to
life". These, you can directly exclude from migration. No interface
required.

The remaining problem is pages that were touched ("allocated") by the
guest during bootup but freed again, before virtio-balloon came up. One
would have to measure how many pages these usually are, I would say it
would not be that many (because recently freed pages are likely to be
used again next for allocation). However, there are some pages not being
reported.

During the lifetime of the guest, this should not be a problem,
eventually one of these pages would get allocated/freed again, so the
problem "solves itself over time". You are looking into the special case
of migrating the VM just after it has been started. But we have the
exact same problem also for ordinary free page hinting, so we should
rather solve that problem. It is not migration specific.

If we are looking for an alternative to "problem solves itself",
something like "if virtio-balloon comes up, it will report all free
pages step by step using free page hinting, just like we would have from
"arch_free_pages()"". This would be the same interface we are using for
free page hinting - and it could even be made configurable in the guest.

The current approach we are discussing internally for details about
Nitesh's work ("how the magic inside arch_fee_pages() will work
efficiently) would allow this as far as I can see just fine.

There would be a tiny little window between virtio-balloon comes up and
it has reported all free pages step by step, but that can be considered
a very special corner case that I would argue is not worth it to be
optimized.

If I am missing something important here, sorry in advance :)

> 
> Best,
> Wei
> 


-- 

Thanks,

David / dhildenb

^ permalink raw reply

* Re: [PATCH v34 0/4] Virtio-balloon: support free page reporting
From: Wei Wang @ 2018-06-29  3:51 UTC (permalink / raw)
  To: David Hildenbrand, virtio-dev, linux-kernel, virtualization, kvm,
	linux-mm, mst, mhocko, akpm
  Cc: yang.zhang.wz, riel, quan.xu0, liliang.opensource, pbonzini,
	nilal, torvalds
In-Reply-To: <c4dd0a13-91fb-c0f5-b41f-54421fdacca9@redhat.com>

On 06/27/2018 07:06 PM, David Hildenbrand wrote:
> On 25.06.2018 14:05, Wei Wang wrote:
>> This patch series is separated from the previous "Virtio-balloon
>> Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>> implemented by this series enables the virtio-balloon driver to report
>> hints of guest free pages to the host. It can be used to accelerate live
>> migration of VMs. Here is an introduction of this usage:
>>
>> Live migration needs to transfer the VM's memory from the source machine
>> to the destination round by round. For the 1st round, all the VM's memory
>> is transferred. From the 2nd round, only the pieces of memory that were
>> written by the guest (after the 1st round) are transferred. One method
>> that is popularly used by the hypervisor to track which part of memory is
>> written is to write-protect all the guest memory.
>>
>> This feature enables the optimization by skipping the transfer of guest
>> free pages during VM live migration. It is not concerned that the memory
>> pages are used after they are given to the hypervisor as a hint of the
>> free pages, because they will be tracked by the hypervisor and transferred
>> in the subsequent round if they are used and written.
>>
>> * Tests
>> - Test Environment
>>      Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
>>      Guest: 8G RAM, 4 vCPU
>>      Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
>>
>> - Test Results
>>      - Idle Guest Live Migration Time (results are averaged over 10 runs):
>>          - Optimization v.s. Legacy = 284ms vs 1757ms --> ~84% reduction
>>      - Guest with Linux Compilation Workload (make bzImage -j4):
>>          - Live Migration Time (average)
>>            Optimization v.s. Legacy = 1402ms v.s. 2528ms --> ~44% reduction
>>          - Linux Compilation Time
>>            Optimization v.s. Legacy = 5min6s v.s. 5min12s
>>            --> no obvious difference
>>
> Being in version 34 already, this whole thing still looks and feels like
> a big hack to me. It might just be me, but especially if I read about
> assumptions like "QEMU will not hotplug memory during migration". This
> does not feel like a clean solution.
>
> I am still not sure if we really need this interface, especially as real
> free page hinting might be on its way.
>
> a) we perform free page hinting by setting all free pages
> (arch_free_page()) to zero. Migration will detect zero pages and
> minimize #pages to migrate. I don't think this is a good idea but Michel
> suggested to do a performance evaluation and Nitesh is looking into that
> right now.

The hypervisor doesn't get the zero pages for free. It pays lots of CPU 
utilization and memory bandwidth (there are some guys complaining abut 
this on the QEMU mailinglist)
In the above results, the legacy part already has the zero page feature 
in use.

>
> b) we perform free page hinting using something that Nitesh proposed. We
> get in QEMU blocks of free pages that we can MADV_FREE. In addition we
> could e.g. clear the dirty bit of these pages in the dirty bitmap, to
> hinder them from getting migrated. Right now the hinting mechanism is
> synchronous (called from arch_free_page()) but we might be able to
> convert it into something asynchronous.
>
> So we might be able to completely get rid of this interface. And looking
> at all the discussions and problems that already happened during the
> development of this series, I think we should rather look into how clean
> free page hinting might solve the same problem.
>
> If it can't be solved using free page hinting, fair enough.
>

I'm afraid it can't. For example, when we have a guest booted, without 
too many memory activities. Assume the guest has 8GB free memory. The 
arch_free_page there won't be able to capture the 8GB free pages since 
there is no free() called. This results in no free pages reported to host.

Best,
Wei

^ permalink raw reply

* Re: [PATCH v2 0/5] Add virtio-iommu driver
From: Peter Maydell @ 2018-06-28  9:40 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: devicetree@vger.kernel.org, jayachandran.nair@cavium.com,
	Lorenzo Pieralisi, tnowicki@caviumnetworks.com,
	kvm@vger.kernel.org, virtio-dev@lists.oasis-open.org, Will Deacon,
	iommu@lists.linux-foundation.org,
	virtualization@lists.linux-foundation.org, Marc Zyngier,
	robh+dt@kernel.org, kvmarm@lists.cs.columbia.edu, Robin Murphy,
	joro@8bytes.org
In-Reply-To: <20180627224801-mutt-send-email-mst@kernel.org>

On 27 June 2018 at 20:59, Michael S. Tsirkin <mst@redhat.com> wrote:
> My point was virtio mmio isn't widely used outside of ARM.
> Rather than try to make everyone use it, IMHO it's better
> to start with PCI.

Yes. I would much rather we let virtio-mmio disappear into
history as a random bit of legacy stuff. One of the things
I didn't like about the virtio-iommu design was that it
resurrected virtio-mmio as something we need to actively care
about again, so if there is a path forward that will work with
pci virtio I would much prefer that.

thanks
-- PMM

^ permalink raw reply

* Re: [PATCH v33 1/4] mm: add a function to get free page blocks
From: Wei Wang @ 2018-06-28  8:39 UTC (permalink / raw)
  To: Michael S. Tsirkin, Linus Torvalds
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	nilal, liliang.opensource, Linux Kernel Mailing List,
	Michal Hocko, linux-mm, Paolo Bonzini, Andrew Morton,
	virtualization
In-Reply-To: <20180627220402-mutt-send-email-mst@kernel.org>

On 06/28/2018 03:07 AM, Michael S. Tsirkin wrote:
> On Wed, Jun 27, 2018 at 09:05:39AM -0700, Linus Torvalds wrote:
>> [ Sorry for slow reply, my travels have made a mess of my inbox ]
>>
>> On Mon, Jun 25, 2018 at 6:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>> Linus, do you think it would be ok to have get_from_free_page_list
>>> actually pop entries from the free list and use them as the buffer
>>> to store PAs?
>> Honestly, what I think the best option would be is to get rid of this
>> interface *entirely*, and just have the balloon code do
>>
>>      #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN |
>> __GFP_THISNODE | __GFP_NOMEMALLOC)
>>
>>      struct page *page =  alloc_pages(GFP_MINFLAGS, MAX_ORDER-1);
>>
>>   which is not a new interface, and simply removes the max-order page
>> from the list if at all possible.
>>
>> The above has the advantage of "just working", and not having any races.
>>
>> Now, because you don't want to necessarily *entirely* deplete the max
>> order, I'd suggest that the *one* new interface you add is just a "how
>> many max-order pages are there" interface. So then you can query
>> (either before or after getting the max-order page) just how many of
>> them there were and whether you want to give that page back.
>>
>> Notice? No need for any page lists or physical addresses. No races. No
>> complex new functions.
>>
>> The physical address you can just get from the "struct page" you got.
>>
>> And if you run out of memory because of getting a page, you get all
>> the usual "hey, we ran out of memory" responses..
>>
>> Wouldn't the above be sufficient?
>>
>>              Linus


Thanks for the elaboration.

> I think so, thanks!
>
> Wei, to put it in balloon terms, I think there's one thing we missed: if
> you do manage to allocate a page, and you don't have a use for it, then
> hey, you can just give it to the host because you know it's free - you
> are going to return it to the free list.
>

I'm not sure if this would be better than Linus' previous suggestion, 
because live migration is expected to be performed without disturbing 
the guest. If we do allocation to get all the free pages at all 
possible, then the guest applications would be seriously affected. For 
example, the network would become very slow as the allocation of sk_buf 
often triggers OOM during live migration. If live migration happens from 
time to time, and users try memory related tools like "free -h" on the 
guest, the reported statistics (e.g. the fee memory becomes very low 
abruptly due to the balloon allocation) would confuse them.

With the previous suggestion, we only get hints of the free pages (i.e. 
just report the address of free pages to host without taking them off 
the list).

Best,
Wei

^ permalink raw reply

* Re: [PATCH net-next v2] net: vhost: improve performance when enable busyloop
From: Tonghao Zhang @ 2018-06-28  6:44 UTC (permalink / raw)
  To: mst; +Cc: Linux Kernel Network Developers, Tonghao Zhang, virtualization
In-Reply-To: <20180627185543-mutt-send-email-mst@kernel.org>

On Wed, Jun 27, 2018 at 11:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 27, 2018 at 10:24:43PM +0800, Jason Wang wrote:
> >
> >
> > On 2018年06月26日 13:17, xiangxia.m.yue@gmail.com wrote:
> > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > >
> > > This patch improves the guest receive performance from
> > > host. On the handle_tx side, we poll the sock receive
> > > queue at the same time. handle_rx do that in the same way.
> > >
> > > For avoiding deadlock, change the code to lock the vq one
> > > by one and use the VHOST_NET_VQ_XX as a subclass for
> > > mutex_lock_nested. With the patch, qemu can set differently
> > > the busyloop_timeout for rx or tx queue.
> > >
> > > We set the poll-us=100us and use the iperf3 to test
> > > its throughput. The iperf3 command is shown as below.
> > >
> > > on the guest:
> > > iperf3  -s -D
> > >
> > > on the host:
> > > iperf3  -c 192.168.1.100 -i 1 -P 10 -t 10 -M 1400
> > >
> > > * With the patch:     23.1 Gbits/sec
> > > * Without the patch:  12.7 Gbits/sec
> > >
> > > Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> >
> > Thanks a lot for the patch. Looks good generally, but please split this big
> > patch into separate ones like:
> >
> > patch 1: lock vqs one by one
> > patch 2: replace magic number of lock annotation
> > patch 3: factor out generic busy polling logic to vhost_net_busy_poll()
> > patch 4: add rx busy polling in tx path.
> >
> > And please cc Michael in v3.
> >
> > Thanks
>
> Pls include host CPU utilization numbers. You can get them e.g. using
> vmstat.
OK, thanks.
> I suspect we also want the polling controllable e.g. through
> an ioctl.
>
> --
> MST
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v2] net: vhost: improve performance when enable busyloop
From: Tonghao Zhang @ 2018-06-28  6:42 UTC (permalink / raw)
  To: jasowang
  Cc: Linux Kernel Network Developers, Tonghao Zhang, mst,
	virtualization
In-Reply-To: <369bea44-6ebd-337a-b20b-a28a604fa2e9@redhat.com>

On Wed, Jun 27, 2018 at 10:24 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年06月26日 13:17, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > This patch improves the guest receive performance from
> > host. On the handle_tx side, we poll the sock receive
> > queue at the same time. handle_rx do that in the same way.
> >
> > For avoiding deadlock, change the code to lock the vq one
> > by one and use the VHOST_NET_VQ_XX as a subclass for
> > mutex_lock_nested. With the patch, qemu can set differently
> > the busyloop_timeout for rx or tx queue.
> >
> > We set the poll-us=100us and use the iperf3 to test
> > its throughput. The iperf3 command is shown as below.
> >
> > on the guest:
> > iperf3  -s -D
> >
> > on the host:
> > iperf3  -c 192.168.1.100 -i 1 -P 10 -t 10 -M 1400
> >
> > * With the patch:     23.1 Gbits/sec
> > * Without the patch:  12.7 Gbits/sec
> >
> > Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
>
> Thanks a lot for the patch. Looks good generally, but please split this
> big patch into separate ones like:
>
> patch 1: lock vqs one by one
> patch 2: replace magic number of lock annotation
> patch 3: factor out generic busy polling logic to vhost_net_busy_poll()
> patch 4: add rx busy polling in tx path.
>
> And please cc Michael in v3.
Thanks. will be done.

> Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v2] net: vhost: improve performance when enable busyloop
From: Jason Wang @ 2018-06-28  4:40 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, virtualization
In-Reply-To: <20180627185543-mutt-send-email-mst@kernel.org>



On 2018年06月27日 23:58, Michael S. Tsirkin wrote:
> On Wed, Jun 27, 2018 at 10:24:43PM +0800, Jason Wang wrote:
>>
>> On 2018年06月26日 13:17, xiangxia.m.yue@gmail.com wrote:
>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>
>>> This patch improves the guest receive performance from
>>> host. On the handle_tx side, we poll the sock receive
>>> queue at the same time. handle_rx do that in the same way.
>>>
>>> For avoiding deadlock, change the code to lock the vq one
>>> by one and use the VHOST_NET_VQ_XX as a subclass for
>>> mutex_lock_nested. With the patch, qemu can set differently
>>> the busyloop_timeout for rx or tx queue.
>>>
>>> We set the poll-us=100us and use the iperf3 to test
>>> its throughput. The iperf3 command is shown as below.
>>>
>>> on the guest:
>>> iperf3  -s -D
>>>
>>> on the host:
>>> iperf3  -c 192.168.1.100 -i 1 -P 10 -t 10 -M 1400
>>>
>>> * With the patch:     23.1 Gbits/sec
>>> * Without the patch:  12.7 Gbits/sec
>>>
>>> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
>> Thanks a lot for the patch. Looks good generally, but please split this big
>> patch into separate ones like:
>>
>> patch 1: lock vqs one by one
>> patch 2: replace magic number of lock annotation
>> patch 3: factor out generic busy polling logic to vhost_net_busy_poll()
>> patch 4: add rx busy polling in tx path.
>>
>> And please cc Michael in v3.
>>
>> Thanks
> Pls include host CPU utilization numbers. You can get them e.g. using
> vmstat. I suspect we also want the polling controllable e.g. through
> an ioctl.
>

I believe we had an ioctl for setting timeout? Or you want another kind 
of controlling.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v2 0/5] Add virtio-iommu driver
From: Michael S. Tsirkin @ 2018-06-27 19:59 UTC (permalink / raw)
  To: Jean-Philippe Brucker
  Cc: Mark Rutland, devicetree@vger.kernel.org,
	jayachandran.nair@cavium.com, Lorenzo Pieralisi,
	tnowicki@caviumnetworks.com, kvm@vger.kernel.org,
	virtio-dev@lists.oasis-open.org, joro@8bytes.org, Will Deacon,
	virtualization@lists.linux-foundation.org, Marc Zyngier,
	iommu@lists.linux-foundation.org, robh+dt@kernel.org,
	eric.auger@redhat.com, Robin Murphy, kvmarm@lists.cs.columbia.edu
In-Reply-To: <2bcda195-d2c6-fbf0-322e-d91756fc3547@arm.com>

On Wed, Jun 27, 2018 at 07:04:46PM +0100, Jean-Philippe Brucker wrote:
> On 26/06/18 19:07, Michael S. Tsirkin wrote:
> > So as I pointed out new virtio 0 device isn't really welcome ;)
> 
> Agreed, virtio-iommu is expected to be implemented on virtio 1 and
> later. I'll remove the two legacy-related paragraph from the spec and
> add a check in the driver as you suggested, to avoid giving the wrong idea.
> 
> > No one bothered implementing virtio 1 in MMIO for all the work
> > that was put in defining it.
> 
> That is curious, given that the virtio_mmio driver supports virtio 1 and
> from my own experience, porting the MMIO transport to virtio 1 only
> requires updating a few registers, when ANY_LAYOUT, modern virtqueue and
> status are already implemented.
> 
> > The fact that the MMIO part of the
> > spec doesn't seem to allow for transitional devices might
> > or might not have something to do with this.
> 
> Sorry, which part do you have in mind? The spec does provide both a
> legacy and modern register layout, with version numbers to differentiate
> them.

Yes but there's no way to implement a transitional virtio mmio
device. The version is either "legacy" or "modern".

So if you implement a modern device old guests won't work with it.

> > So why make it an MMIO device at all? A system with an IOMMU
> > will have a PCI bus, won't it? And it's pretty common to
> > have the IOMMU be a PCI device on the root bus.
> Having the IOMMU outside PCI seems more common though. On Arm and Intel
> the IOMMU doesn't have a PCI config space, BARs or capabilities, just a
> plain MMIO region and a static number of interrupts. However the AMD
> IOMMU appears as a PCI device with additional MMIO registers. I would be
> interested in implementing virtio-iommu as a PCI dev at some point, at
> least so that we can use MSI-X.
> 
> The problem is that it requires invasive changes in the firmware
> interfaces and drivers. They need to describe relationship between IOMMU
> and endpoint, and DT or ACPI IORT don't expect the programming interface
> to be inside the PCI bus that the IOMMU manages.

They don't particularly care IMHO.

> Describing it as a
> peripheral is more natural. For AMD it is implemented in their driver
> using IVHD tables that can't be reused. Right now I don't expect any
> success in proposing changes to firmware interfaces or drivers, because
> the device is new and paravirtualized, and works out of the box with
> MMIO. Hopefully that will change in the future, perhaps when someone
> supports DT for AMD IOMMU (they do describe bindings at the end of the
> spec, but I don't think it can work in Linux with the existing
> infrastructure)

That's a bit abstract, I don't really understand the issues involved.
ACPI is formatted by QEMU, so firmware does not need to care.
And is there even a DT for intel?

> Another reason to keep the MMIO transport option is that one
> virtio-iommu can manage DMA from endpoints on multiple PCI domains at
> the same time, as well as platform devices. Some VMMs might want that,
> in which case the IOMMU would be a separate platform device.

Which buses are managed by the IOMMU is separate from the bus
on which it's programming interface resides.

> > Will remove need to bother with dt bindings etc.
> That's handled by the firmware drivers and IOMMU layer, there shouldn't
> be any special treatment at the virtio layer. In general removal of an
> IOMMU needs to be done after removal of all endpoints connected to it,
> which should be described using device_link from the driver core. DT or
> ACPI is only used to tell drivers where to find resources, and to
> describe the DMA topology.
> 
> Thanks,
> Jean

My point was virtio mmio isn't widely used outside of ARM.
Rather than try to make everyone use it, IMHO it's better
to start with PCI.

-- 
MST

^ permalink raw reply

* Re: [PATCH v33 1/4] mm: add a function to get free page blocks
From: Michael S. Tsirkin @ 2018-06-27 19:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	nilal, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton,
	Michal Hocko
In-Reply-To: <CA+55aFwFpDPvfL=KPdabO-x1r0FnwpfPk5oN8+e01TKqAPNYbw@mail.gmail.com>

On Wed, Jun 27, 2018 at 09:05:39AM -0700, Linus Torvalds wrote:
> [ Sorry for slow reply, my travels have made a mess of my inbox ]
> 
> On Mon, Jun 25, 2018 at 6:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > Linus, do you think it would be ok to have get_from_free_page_list
> > actually pop entries from the free list and use them as the buffer
> > to store PAs?
> 
> Honestly, what I think the best option would be is to get rid of this
> interface *entirely*, and just have the balloon code do
> 
>     #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN |
> __GFP_THISNODE | __GFP_NOMEMALLOC)
> 
>     struct page *page =  alloc_pages(GFP_MINFLAGS, MAX_ORDER-1);
> 
>  which is not a new interface, and simply removes the max-order page
> from the list if at all possible.
> 
> The above has the advantage of "just working", and not having any races.
> 
> Now, because you don't want to necessarily *entirely* deplete the max
> order, I'd suggest that the *one* new interface you add is just a "how
> many max-order pages are there" interface. So then you can query
> (either before or after getting the max-order page) just how many of
> them there were and whether you want to give that page back.
> 
> Notice? No need for any page lists or physical addresses. No races. No
> complex new functions.
> 
> The physical address you can just get from the "struct page" you got.
> 
> And if you run out of memory because of getting a page, you get all
> the usual "hey, we ran out of memory" responses..
> 
> Wouldn't the above be sufficient?
> 
>             Linus

I think so, thanks!

Wei, to put it in balloon terms, I think there's one thing we missed: if
you do manage to allocate a page, and you don't have a use for it, then
hey, you can just give it to the host because you know it's free - you
are going to return it to the free list.

-- 
MST

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox