Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH net-next V2 2/8] vhost: hide used ring layout from device
From: Jason Wang @ 2018-07-16  3:28 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, netdev, linux-kernel, virtualization, maxime.coquelin, wexu
In-Reply-To: <1531711691-6769-1-git-send-email-jasowang@redhat.com>

vhost_get_vq_desc() used to return the descriptor head to the device,
which then passed it back to vhost_add_used() and its friends. This
exposes the internal used ring layout to the device, which makes it hard
to extend to e.g. the packed ring layout.

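So this patch tries to hide the used ring layout by:

- letting vhost_get_vq_desc() fill in a caller-provided struct
  vring_used_elem and return 0 or a negative error code (-ENOSPC when
  no descriptor is available) instead of the descriptor head
- accepting a pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()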

This could help to hide used ring layout and make it easier to
implement packed ring on top.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 48 +++++++++++++++++++++------------------
 drivers/vhost/scsi.c  | 62 +++++++++++++++++++++++++++------------------------
 drivers/vhost/vhost.c | 52 +++++++++++++++++++++---------------------
 drivers/vhost/vhost.h |  9 +++++---
 drivers/vhost/vsock.c | 42 +++++++++++++++++-----------------
 5 files changed, 113 insertions(+), 100 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 03dd1de..a8c9506 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -430,15 +430,16 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
+				    struct vring_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num,
 				    bool *busyloop_intr)
 {
 	unsigned long uninitialized_var(endtime);
-	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+	int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
-	if (r == vq->num && vq->busyloop_timeout) {
+	if (r == -ENOSPC && vq->busyloop_timeout) {
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(endtime)) {
@@ -451,8 +452,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 			cpu_relax();
 		}
 		preempt_enable();
-		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				      out_num, in_num, NULL, NULL);
+		r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+				      ARRAY_SIZE(vq->iov), out_num, in_num,
+				      NULL, NULL);
 	}
 
 	return r;
@@ -474,7 +476,6 @@ static void handle_tx(struct vhost_net *net)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned out, in;
-	int head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -487,6 +488,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	struct vring_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -512,14 +514,11 @@ static void handle_tx(struct vhost_net *net)
 			vhost_zerocopy_signal_used(net, vq);
 
 		busyloop_intr = false;
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in, &busyloop_intr);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+		err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+					       ARRAY_SIZE(vq->iov),
+					       &out, &in, &busyloop_intr);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (err == -ENOSPC) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
 			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -528,6 +527,9 @@ static void handle_tx(struct vhost_net *net)
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			break;
 		if (in) {
 			vq_err(vq, "Unexpected descriptor format for TX: "
 			       "out %d, int %d\n", out, in);
@@ -555,7 +557,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+			vq->heads[nvq->upend_idx].id =
+				cpu_to_vhost32(vq, used.id);
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
@@ -596,7 +599,7 @@ static void handle_tx(struct vhost_net *net)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
 		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+			vhost_add_used_and_signal(&net->dev, vq, &used, 0);
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
@@ -754,17 +757,15 @@ static void handle_rx(struct vhost_net *net)
 						      &busyloop_intr))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = vhost_get_bufs(vq, vq->heads + nvq->done_idx,
-					   vhost_len, &in, vq_log, &log,
-					   likely(mergeable) ? UIO_MAXIOV : 1);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(headcount < 0))
-			goto out;
+		err = vhost_get_bufs(vq, vq->heads + nvq->done_idx,
+				     vhost_len, &in, vq_log, &log,
+				     likely(mergeable) ? UIO_MAXIOV : 1,
+				     &headcount);
 		/* OK, now we need to know about added descriptors. */
-		if (!headcount) {
+		if (err == -ENOSPC) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
-			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+			}else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
 				vhost_disable_notify(&net->dev, vq);
@@ -775,6 +776,9 @@ static void handle_rx(struct vhost_net *net)
 			goto out;
 		}
 		busyloop_intr = false;
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			goto out;
 		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 17fcd3b..013464c 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	int tvc_vq_desc;
+	struct vring_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,8 +441,9 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
+	struct vring_used_elem used;
 	unsigned out, in;
-	int head, ret;
+	int ret;
 
 	if (!vq->private_data) {
 		vs->vs_events_missed = true;
@@ -451,16 +452,16 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 
 again:
 	vhost_disable_notify(&vs->dev, vq);
-	head = vhost_get_vq_desc(vq, vq->iov,
+	ret = vhost_get_vq_desc(vq, &used, vq->iov,
 			ARRAY_SIZE(vq->iov), &out, &in,
 			NULL, NULL);
-	if (head < 0) {
+	if (ret == -ENOSPC) {
+		if (vhost_enable_notify(&vs->dev, vq))
+			goto again;
 		vs->vs_events_missed = true;
 		return;
 	}
-	if (head == vq->num) {
-		if (vhost_enable_notify(&vs->dev, vq))
-			goto again;
+	if (ret < 0) {
 		vs->vs_events_missed = true;
 		return;
 	}
@@ -480,7 +481,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	eventp = vq->iov[out].iov_base;
 	ret = __copy_to_user(eventp, event, sizeof(*event));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &used, 0);
 	else
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
 }
@@ -541,7 +542,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
 		if (likely(ret == sizeof(v_rsp))) {
 			struct vhost_scsi_virtqueue *q;
-			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+			vhost_add_used(cmd->tvc_vq, &cmd->tvc_vq_used, 0);
 			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
 			vq = q - vs->vqs;
 			__set_bit(vq, signal);
@@ -784,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   int head, unsigned out)
+			   struct vring_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -795,7 +796,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 	resp = vq->iov[out].iov_base;
 	ret = __copy_to_user(resp, &rsp, sizeof(rsp));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, used, 0);
 	else
 		pr_err("Faulted on virtio_scsi_cmd_resp\n");
 }
@@ -807,11 +808,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
+	struct vring_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
 	unsigned int out = 0, in = 0;
-	int head, ret, prot_bytes;
+	int ret, prot_bytes;
 	size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
 	size_t out_size, in_size;
 	u16 lun;
@@ -831,22 +833,22 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	vhost_disable_notify(&vs->dev, vq);
 
 	for (;;) {
-		head = vhost_get_vq_desc(vq, vq->iov,
-					 ARRAY_SIZE(vq->iov), &out, &in,
-					 NULL, NULL);
+		ret = vhost_get_vq_desc(vq, &used, vq->iov,
+					ARRAY_SIZE(vq->iov), &out, &in,
+					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 head, out, in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+			 used.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
 				vhost_disable_notify(&vs->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(ret < 0))
+			break;
 		/*
 		 * Check for a sane response buffer so we can report early
 		 * errors back to the guest.
@@ -891,20 +893,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 		if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) {
 			vq_err(vq, "Faulted on copy_from_iter\n");
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/* virtio-scsi spec requires byte 0 of the lun to be 1 */
 		if (unlikely(*lunp != 1)) {
 			vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 
 		tpg = READ_ONCE(vs_tpg[*target]);
 		if (unlikely(!tpg)) {
 			/* Target does not exist, fail the request */
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/*
@@ -950,7 +952,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_TO_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesout,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout);
@@ -958,7 +961,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_FROM_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesin,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
@@ -996,7 +1000,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			vq_err(vq, "Received SCSI CDB with command_size: %d that"
 				" exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n",
 				scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr,
@@ -1005,7 +1009,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		if (IS_ERR(cmd)) {
 			vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
 			       PTR_ERR(cmd));
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd->tvc_vhost = vs;
@@ -1025,7 +1029,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			if (unlikely(ret)) {
 				vq_err(vq, "Failed to map iov to sgl\n");
 				vhost_scsi_release_cmd(&cmd->tvc_se_cmd);
-				vhost_scsi_send_bad_target(vs, vq, head, out);
+				vhost_scsi_send_bad_target(vs, vq, &used, out);
 				continue;
 			}
 		}
@@ -1034,7 +1038,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		 * complete the virtio-scsi request in TCM callback context via
 		 * vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
 		 */
-		cmd->tvc_vq_desc = head;
+		cmd->tvc_vq_used = used;
 		/*
 		 * Dispatch cmd descriptor for cmwq execution in process
 		 * context provided by vhost_scsi_workqueue.  This also ensures
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8814e5b..9572c4f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1962,6 +1962,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vring_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -1994,7 +1995,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		 * invalid.
 		 */
 		if (vq->avail_idx == last_avail_idx)
-			return vq->num;
+			return -ENOSPC;
 
 		/* Only get avail ring entries after they have been
 		 * exposed by guest.
@@ -2012,6 +2013,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
+	used->id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2100,10 +2102,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
-	return head;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+static void vhost_set_used_len(struct vhost_virtqueue *vq,
+			       struct vring_used_elem *used, int len)
+{
+	used->len = cpu_to_vhost32(vq, len);
+}
+
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
  * @vq		- the relevant virtqueue
@@ -2120,13 +2128,13 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota)
+		   unsigned int quota,
+		   s16 *count)
 {
 	unsigned int out, in;
 	int seg = 0;
 	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
+	int r = 0, nlogs = 0;
 	/* len is always initialized before use since we are always called with
 	 * datalen > 0.
 	 */
@@ -2137,17 +2145,12 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
+		r = vhost_get_vq_desc(vq, &heads[headcount], vq->iov + seg,
 				      ARRAY_SIZE(vq->iov) - seg, &out,
 				      &in, log, log_num);
 		if (unlikely(r < 0))
 			goto err;
 
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
 		if (unlikely(out || in <= 0)) {
 			vq_err(vq, "unexpected descriptor format for RX: "
 				"out %d, in %d\n", out, in);
@@ -2158,24 +2161,26 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			nlogs += *log_num;
 			log += *log_num;
 		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
+
 		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
+		vhost_set_used_len(vq, &heads[headcount], len);
 		datalen -= len;
 		++headcount;
 		seg += in;
 	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	vhost_set_used_len(vq, &heads[headcount - 1], len + datalen);
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
 
 	/* Detect overrun */
 	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
+		headcount = UIO_MAXIOV + 1;
 		goto err;
 	}
-	return headcount;
+
+	*count = headcount;
+	return 0;
 err:
 	vhost_discard_vq_desc(vq, headcount);
 	return r;
@@ -2191,14 +2196,11 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+		   int len)
 {
-	struct vring_used_elem heads = {
-		cpu_to_vhost32(vq, head),
-		cpu_to_vhost32(vq, len)
-	};
-
-	return vhost_add_used_n(vq, &heads, 1);
+	vhost_set_used_len(vq, used, len);
+	return vhost_add_used_n(vq, used, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
@@ -2331,9 +2333,9 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       unsigned int head, int len)
+			       struct vring_used_elem *used, int len)
 {
-	vhost_add_used(vq, head, len);
+	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 52edd242..a7cc7e7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -182,6 +182,7 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
+		      struct vring_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
@@ -191,15 +192,17 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota);
+		   unsigned int quota,
+		   s16 *count);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *elem, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int id, int len);
+			       struct vring_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 34bc3ab..59a01cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,11 +98,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
+		struct vring_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
 		size_t len;
-		int head;
+		int ret;
 
 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -116,16 +117,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0) {
-			spin_lock_bh(&vsock->send_pkt_list_lock);
-			list_add(&pkt->list, &vsock->send_pkt_list);
-			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			break;
-		}
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -139,6 +133,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			}
 			break;
 		}
+		if (ret < 0) {
+			spin_lock_bh(&vsock->send_pkt_list_lock);
+			list_add(&pkt->list, &vsock->send_pkt_list);
+			spin_unlock_bh(&vsock->send_pkt_list_lock);
+			break;
+		}
 
 		if (out) {
 			virtio_transport_free_pkt(pkt);
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = iov_length(&vq->iov[out], in);
+		len = vhost32_to_cpu(vq, used.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -163,7 +163,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + pkt->len);
 		added = true;
 
 		if (pkt->reply) {
@@ -346,7 +346,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	int head;
+	struct vring_used_elem used;
+	int ret;
 	unsigned int out, in;
 	bool added = false;
 
@@ -367,18 +368,17 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			goto no_more_replies;
 		}
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0)
-			break;
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
 				vhost_disable_notify(&vsock->dev, vq);
 				continue;
 			}
 			break;
 		}
+		if (ret < 0)
+			break;
 
 		pkt = vhost_vsock_alloc_pkt(vq, out, in);
 		if (!pkt) {
@@ -397,7 +397,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 		else
 			virtio_transport_free_pkt(pkt);
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + len);
 		added = true;
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V2 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-07-16  3:28 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, netdev, linux-kernel, virtualization, maxime.coquelin, wexu
In-Reply-To: <1531711691-6769-1-git-send-email-jasowang@redhat.com>

Move get_rx_bufs() to vhost.c and rename it to vhost_get_bufs(). This
helps to hide the vring internal layout from specific device
implementations. The packed ring implementation will benefit from
this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 83 ++-------------------------------------------------
 drivers/vhost/vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  7 +++++
 3 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index b224036..03dd1de 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -702,83 +702,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
 	return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- *	vq has read descriptors only.
- * @vq		- the relevant virtqueue
- * @datalen	- data length we'll be reading
- * @iovcount	- returned count of io vectors we fill
- * @log		- vhost log
- * @log_num	- log offset
- * @quota       - headcount quota, 1 for big buffer
- *	returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-		       struct vring_used_elem *heads,
-		       int datalen,
-		       unsigned *iovcount,
-		       struct vhost_log *log,
-		       unsigned *log_num,
-		       unsigned int quota)
-{
-	unsigned int out, in;
-	int seg = 0;
-	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
-	/* len is always initialized before use since we are always called with
-	 * datalen > 0.
-	 */
-	u32 uninitialized_var(len);
-
-	while (datalen > 0 && headcount < quota) {
-		if (unlikely(seg >= UIO_MAXIOV)) {
-			r = -ENOBUFS;
-			goto err;
-		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
-		if (unlikely(r < 0))
-			goto err;
-
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
-		if (unlikely(out || in <= 0)) {
-			vq_err(vq, "unexpected descriptor format for RX: "
-				"out %d, in %d\n", out, in);
-			r = -EINVAL;
-			goto err;
-		}
-		if (unlikely(log)) {
-			nlogs += *log_num;
-			log += *log_num;
-		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
-		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
-		datalen -= len;
-		++headcount;
-		seg += in;
-	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-	*iovcount = seg;
-	if (unlikely(log))
-		*log_num = nlogs;
-
-	/* Detect overrun */
-	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
-		goto err;
-	}
-	return headcount;
-err:
-	vhost_discard_vq_desc(vq, headcount);
-	return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -831,9 +754,9 @@ static void handle_rx(struct vhost_net *net)
 						      &busyloop_intr))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
-					vhost_len, &in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+		headcount = vhost_get_bufs(vq, vq->heads + nvq->done_idx,
+					   vhost_len, &in, vq_log, &log,
+					   likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1a..8814e5b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2104,6 +2104,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+	/* len is always initialized before use since we are always called with
+	 * datalen > 0.
+	 */
+	u32 uninitialized_var(len);
+
+	while (datalen > 0 && headcount < quota) {
+		if (unlikely(seg >= UIO_MAXIOV)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		r = vhost_get_vq_desc(vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (unlikely(r < 0))
+			goto err;
+
+		d = r;
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+
+	/* Detect overrun */
+	if (unlikely(datalen > 0)) {
+		r = UIO_MAXIOV + 1;
+		goto err;
+	}
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+EXPORT_SYMBOL_GPL(vhost_get_bufs);
+
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b9..52edd242 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,13 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V2 0/8] Packed virtqueue support for vhost
From: Jason Wang @ 2018-07-16  3:28 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, netdev, linux-kernel, virtualization, maxime.coquelin, wexu

Hi all:

This series implements packed virtqueues. The code was tested with
Tiwei's guest driver series at https://patchwork.ozlabs.org/cover/942297/


Pktgen test for both RX and TX does not show obvious difference with
split virtqueues. The main bottleneck is the guest Linux driver, since
it can not stress vhost for a 100% CPU utilization. A full TCP
benchmark is ongoing. Will test virtio-net pmd as well when it is
ready.

Notes:
- This version depends on Tiwei's series at https://patchwork.ozlabs.org/cover/942297/

This version was tested with:

- Zerocopy (Out of Order) support
- vIOMMU support
- mergeable buffer on/off
- busy polling on/off
- vsock (nc-vsock)

Changes from V1:
- drop uapi patch and use Tiwei's
- split the enablement of packed virtqueue into a separate patch

Changes from RFC V5:

- save unnecessary barriers during vhost_add_used_packed_n()
- more compact math for event idx
- fix failure of SET_VRING_BASE when avail_wrap_counter is true
- fix not copy avail_wrap_counter during GET_VRING_BASE
- introduce SET_VRING_USED_BASE/GET_VRING_USED_BASE for syncing last_used_idx
- rename used_wrap_counter to last_used_wrap_counter
- rebase to net-next

Changes from RFC V4:

- fix signalled_used index recording
- track avail index correctly
- various minor fixes

Changes from RFC V3:

- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from RFC V2:

- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_vq_avail_empty if descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from RFC V1:

- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (8):
  vhost: move get_rx_bufs to vhost.c
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost_net: do not explicitly manipulate vhost_used_elem
  vhost: vhost_put_user() can accept metadata type
  vhost: packed ring support
  vhost: event suppression for packed ring
  vhost: enable packed virtqueues

 drivers/vhost/net.c        | 143 ++-----
 drivers/vhost/scsi.c       |  62 +--
 drivers/vhost/vhost.c      | 994 ++++++++++++++++++++++++++++++++++++++++-----
 drivers/vhost/vhost.h      |  55 ++-
 drivers/vhost/vsock.c      |  42 +-
 include/uapi/linux/vhost.h |   7 +
 6 files changed, 1035 insertions(+), 268 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [kbuild ack?] Re: [PATCH v6 0/9] x86: macrofying inline asm for better compilation
From: Ingo Molnar @ 2018-07-15 21:54 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Juergen Gross, Kate Stewart, Kees Cook, Josh Poimboeuf,
	Peter Zijlstra, Greg Kroah-Hartman, Christopher Li, X86 ML,
	Linux Kernel Mailing List, Philippe Ombredanne,
	virtualization@lists.linux-foundation.org, Masahiro Yamada,
	linux-sparse@vger.kernel.org, Ingo Molnar, Jan Beulich,
	H. Peter Anvin, Linus Torvalds, Thomas Gleixner, Sam Ravnborg,
	Alok Kataria
In-Reply-To: <BA4C0931-A51D-41AE-B918-C37EFFF35840@vmware.com>


* Nadav Amit <namit@vmware.com> wrote:

> > I ran some limited number of benchmarks, and in general the performance
> > impact is not very notable. You can still see >10 cycles shaved off some
> > syscalls that manipulate page-tables (e.g., mprotect()), in which
> > paravirt caused many functions not to be inlined. In addition this
> > patch-set can prevent issues such as [1], and improves code readability
> > and maintainability.

Ok, that's good enough as a benefit, I suppose.

> > Nadav Amit (9):
> >  Makefile: Prepare for using macros for inline asm

This non-trivial kbuild patch needs an Acked-by from Masahiro.

Thanks,

	Ingo

^ permalink raw reply

* RE: [PATCH v6 3/3] x86: paravirt: make native_save_fl extern inline
From: David Laight @ 2018-07-13 10:16 UTC (permalink / raw)
  To: 'Nick Desaulniers', mingo@redhat.com, tglx@linutronix.de
  Cc: kstewart@linuxfoundation.org, andrea.parri@amarulasolutions.com,
	linux-efi@vger.kernel.org, brijesh.singh@amd.com,
	jan.kiszka@siemens.com, jpoimboe@redhat.com, will.deacon@arm.com,
	jarkko.sakkinen@linux.intel.com,
	virtualization@lists.linux-foundation.org,
	yamada.masahiro@socionext.com, manojgupta@google.com,
	hpa@zytor.com, akataria@vmware.com, tweek@google.com,
	mawilcox@microsoft.com, x86@kernel.org, ghackmann
In-Reply-To: <20180621162324.36656-4-ndesaulniers@google.com>


[-- Attachment #1.1: Type: text/plain, Size: 206 bytes --]

Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

Please consider the environment and don't print this e-mail unless you really need to

[-- Attachment #1.2: Type: text/html, Size: 1534 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-07-13  3:26 UTC (permalink / raw)
  To: David Miller; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180712.144458.2076041018423659380.davem@davemloft.net>

On Thu, Jul 12, 2018 at 02:44:58PM -0700, David Miller wrote:
> From: Tiwei Bie <tiwei.bie@intel.com>
> Date: Wed, 11 Jul 2018 10:27:06 +0800
> 
> > Hello everyone,
> > 
> > This patch set implements packed ring support in virtio driver.
> > 
> > Some functional tests have been done with Jason's
> > packed ring implementation in vhost:
> > 
> > https://lkml.org/lkml/2018/7/3/33
> > 
> > Both of ping and netperf worked as expected.
> 
> Michael and Jason, where are we with this series?

I'm at netdev, won't be able to review before Monday.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Jason Wang @ 2018-07-13  0:52 UTC (permalink / raw)
  To: David Miller, tiwei.bie
  Cc: virtio-dev, mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180712.144458.2076041018423659380.davem@davemloft.net>



On 2018年07月13日 05:44, David Miller wrote:
> From: Tiwei Bie <tiwei.bie@intel.com>
> Date: Wed, 11 Jul 2018 10:27:06 +0800
>
>> Hello everyone,
>>
>> This patch set implements packed ring support in virtio driver.
>>
>> Some functional tests have been done with Jason's
>> packed ring implementation in vhost:
>>
>> https://lkml.org/lkml/2018/7/3/33
>>
>> Both of ping and netperf worked as expected.
> Michael and Jason, where are we with this series?

For the series:

Acked-by: Jason Wang <jasowang@redhat.com>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Wei Wang @ 2018-07-13  0:33 UTC (permalink / raw)
  To: Michal Hocko
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal,
	Linus Torvalds
In-Reply-To: <20180712114946.GI32648@dhcp22.suse.cz>

On 07/12/2018 07:49 PM, Michal Hocko wrote:
> On Thu 12-07-18 19:34:16, Wei Wang wrote:
>> On 07/12/2018 04:13 PM, Michal Hocko wrote:
>>> On Thu 12-07-18 10:52:08, Wei Wang wrote:
>>>> On 07/12/2018 10:30 AM, Linus Torvalds wrote:
>>>>> On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>>>>>> Would it be better to remove __GFP_THISNODE? We actually want to get all
>>>>>> the guest free pages (from all the nodes).
>>>>> Maybe. Or maybe it would be better to have the memory balloon logic be
>>>>> per-node? Maybe you don't want to remove too much memory from one
>>>>> node? I think it's one of those "play with it" things.
>>>>>
>>>>> I don't think that's the big issue, actually. I think the real issue
>>>>> is how to react quickly and gracefully to "oops, I'm trying to give
>>>>> memory away, but now the guest wants it back" while you're in the
>>>>> middle of trying to create that 2TB list of pages.
>>>> OK. virtio-balloon has already registered an oom notifier
>>>> (virtballoon_oom_notify). I plan to add some control there. If oom happens,
>>>> - stop the page allocation;
>>>> - immediately give back the allocated pages to mm.
>>> Please don't. Oom notifier is an absolutely hideous interface which
>>> should go away sooner or later (I would much rather like the former) so
>>> do not build a new logic on top of it. I would appreciate if you
>>> actually remove the notifier much more.
>>>
>>> You can give memory back from the standard shrinker interface. If we are
>>> reaching low reclaim priorities then we are struggling to reclaim memory
>>> and then you can start returning pages back.
>> OK. Just curious why oom notifier is thought to be hideous, and has it been
>> a consensus?
> Because it is a completely non-transparent callout from the OOM context
> which is really subtle on its own. It is just too easy to end up in
> weird corner cases. We really have to be careful and be as swift as
> possible. Any potential sleep would make the OOM situation much worse
> because nobody would be able to make a forward progress or (in)direct
> dependency on MM subsystem can easily deadlock. Those are really hard
> to track down and defining the notifier as blockable by design which
> just asks for bad implementations because most people simply do not
> realize how subtle the oom context is.
>
> Another thing is that it happens way too late when we have basically
> reclaimed the world and didn't get out of the memory pressure so you can
> expect any workload is suffering already. Anybody sitting on a large
> amount of reclaimable memory should have released that memory by that
> time. Proportionally to the reclaim pressure ideally.
>
> The notifier API is completely unaware of oom constrains. Just imagine
> you are OOM in a subset of numa nodes. Callback doesn't have any idea
> about that.
>
> Moreover we do have proper reclaim mechanism that has a feedback
> loop and that should be always preferable to an abrupt reclaim.

Sounds very reasonable, thanks for the elaboration. I'll try with shrinker.

Best,
Wei

^ permalink raw reply

* Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: David Miller @ 2018-07-12 21:44 UTC (permalink / raw)
  To: tiwei.bie; +Cc: virtio-dev, mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180711022711.7090-1-tiwei.bie@intel.com>

From: Tiwei Bie <tiwei.bie@intel.com>
Date: Wed, 11 Jul 2018 10:27:06 +0800

> Hello everyone,
> 
> This patch set implements packed ring support in virtio driver.
> 
> Some functional tests have been done with Jason's
> packed ring implementation in vhost:
> 
> https://lkml.org/lkml/2018/7/3/33
> 
> Both of ping and netperf worked as expected.

Michael and Jason, where are we with this series?

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Michal Hocko @ 2018-07-12 13:12 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal
In-Reply-To: <CA+55aFwku2tDH4+rfaC67xc4-cEwSrXgnQaci=e2id5ZCRE9JQ@mail.gmail.com>

[Hmm this one somehow got stuck in my outgoing emails]

On Wed 11-07-18 09:23:54, Linus Torvalds wrote:
[...]
> So I'm open to new interfaces. I just want those new interfaces to
> make sense, and be low latency and simple for the VM to do. I'm
> objecting to the incredibly baroque and heavy-weight one that can
> return near-infinite amounts of memory.

Mel was suggesting a bulk page allocator a year ago [1]. I can see only
slab bulk api so I am not sure what happened with that work. Anyway
I think that starting with what we have right now is much more
appropriate than over-designing this thing from the very beginning.

[1] http://lkml.kernel.org/r/20170109163518.6001-5-mgorman@techsingularity.net
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Michal Hocko @ 2018-07-12 11:49 UTC (permalink / raw)
  To: Wei Wang
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal,
	Linus Torvalds
In-Reply-To: <5B473CB8.1050306@intel.com>

On Thu 12-07-18 19:34:16, Wei Wang wrote:
> On 07/12/2018 04:13 PM, Michal Hocko wrote:
> > On Thu 12-07-18 10:52:08, Wei Wang wrote:
> > > On 07/12/2018 10:30 AM, Linus Torvalds wrote:
> > > > On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
> > > > > Would it be better to remove __GFP_THISNODE? We actually want to get all
> > > > > the guest free pages (from all the nodes).
> > > > Maybe. Or maybe it would be better to have the memory balloon logic be
> > > > per-node? Maybe you don't want to remove too much memory from one
> > > > node? I think it's one of those "play with it" things.
> > > > 
> > > > I don't think that's the big issue, actually. I think the real issue
> > > > is how to react quickly and gracefully to "oops, I'm trying to give
> > > > memory away, but now the guest wants it back" while you're in the
> > > > middle of trying to create that 2TB list of pages.
> > > OK. virtio-balloon has already registered an oom notifier
> > > (virtballoon_oom_notify). I plan to add some control there. If oom happens,
> > > - stop the page allocation;
> > > - immediately give back the allocated pages to mm.
> > Please don't. Oom notifier is an absolutely hideous interface which
> > should go away sooner or later (I would much rather like the former) so
> > do not build a new logic on top of it. I would appreciate if you
> > actually remove the notifier much more.
> > 
> > You can give memory back from the standard shrinker interface. If we are
> > reaching low reclaim priorities then we are struggling to reclaim memory
> > and then you can start returning pages back.
> 
> OK. Just curious why oom notifier is thought to be hideous, and has it been
> a consensus?

Because it is a completely non-transparent callout from the OOM context
which is really subtle on its own. It is just too easy to end up in
weird corner cases. We really have to be careful and be as swift as
possible. Any potential sleep would make the OOM situation much worse
because nobody would be able to make a forward progress or (in)direct
dependency on MM subsystem can easily deadlock. Those are really hard
to track down and defining the notifier as blockable by design which
just asks for bad implementations because most people simply do not
realize how subtle the oom context is.

Another thing is that it happens way too late when we have basically
reclaimed the world and didn't get out of the memory pressure so you can
expect any workload is suffering already. Anybody sitting on a large
amount of reclaimable memory should have released that memory by that
time. Proportionally to the reclaim pressure ideally.

The notifier API is completely unaware of oom constrains. Just imagine
you are OOM in a subset of numa nodes. Callback doesn't have any idea
about that.

Moreover we do have proper reclaim mechanism that has a feedback
loop and that should be always preferable to an abrupt reclaim.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Wei Wang @ 2018-07-12 11:34 UTC (permalink / raw)
  To: Michal Hocko
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal,
	Linus Torvalds
In-Reply-To: <20180712081317.GD32648@dhcp22.suse.cz>

On 07/12/2018 04:13 PM, Michal Hocko wrote:
> On Thu 12-07-18 10:52:08, Wei Wang wrote:
>> On 07/12/2018 10:30 AM, Linus Torvalds wrote:
>>> On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>>>> Would it be better to remove __GFP_THISNODE? We actually want to get all
>>>> the guest free pages (from all the nodes).
>>> Maybe. Or maybe it would be better to have the memory balloon logic be
>>> per-node? Maybe you don't want to remove too much memory from one
>>> node? I think it's one of those "play with it" things.
>>>
>>> I don't think that's the big issue, actually. I think the real issue
>>> is how to react quickly and gracefully to "oops, I'm trying to give
>>> memory away, but now the guest wants it back" while you're in the
>>> middle of trying to create that 2TB list of pages.
>> OK. virtio-balloon has already registered an oom notifier
>> (virtballoon_oom_notify). I plan to add some control there. If oom happens,
>> - stop the page allocation;
>> - immediately give back the allocated pages to mm.
> Please don't. Oom notifier is an absolutely hideous interface which
> should go away sooner or later (I would much rather like the former) so
> do not build a new logic on top of it. I would appreciate if you
> actually remove the notifier much more.
>
> You can give memory back from the standard shrinker interface. If we are
> reaching low reclaim priorities then we are struggling to reclaim memory
> and then you can start returning pages back.

OK. Just curious why oom notifier is thought to be hideous, and has it 
been a consensus?

Best,
Wei

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Michal Hocko @ 2018-07-12  8:13 UTC (permalink / raw)
  To: Wei Wang
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal,
	Linus Torvalds
In-Reply-To: <5B46C258.40601@intel.com>

On Thu 12-07-18 10:52:08, Wei Wang wrote:
> On 07/12/2018 10:30 AM, Linus Torvalds wrote:
> > On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
> > > Would it be better to remove __GFP_THISNODE? We actually want to get all
> > > the guest free pages (from all the nodes).
> > Maybe. Or maybe it would be better to have the memory balloon logic be
> > per-node? Maybe you don't want to remove too much memory from one
> > node? I think it's one of those "play with it" things.
> > 
> > I don't think that's the big issue, actually. I think the real issue
> > is how to react quickly and gracefully to "oops, I'm trying to give
> > memory away, but now the guest wants it back" while you're in the
> > middle of trying to create that 2TB list of pages.
> 
> OK. virtio-balloon has already registered an oom notifier
> (virtballoon_oom_notify). I plan to add some control there. If oom happens,
> - stop the page allocation;
> - immediately give back the allocated pages to mm.

Please don't. Oom notifier is an absolutely hideous interface which
should go away sooner or later (I would much rather like the former) so
do not build a new logic on top of it. I would appreciate if you
actually remove the notifier much more.

You can give memory back from the standard shrinker interface. If we are
reaching low reclaim priorities then we are struggling to reclaim memory
and then you can start returning pages back.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: Jason Wang @ 2018-07-12  5:51 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Linux Kernel Network Developers, virtualization
In-Reply-To: <20180712082324-mutt-send-email-mst@kernel.org>



On 2018年07月12日 13:24, Michael S. Tsirkin wrote:
> On Thu, Jul 12, 2018 at 01:21:03PM +0800, Jason Wang wrote:
>>
>> On 2018年07月12日 11:34, Michael S. Tsirkin wrote:
>>> On Thu, Jul 12, 2018 at 11:26:12AM +0800, Jason Wang wrote:
>>>> On 2018年07月11日 19:59, Michael S. Tsirkin wrote:
>>>>> On Wed, Jul 11, 2018 at 01:12:59PM +0800, Jason Wang wrote:
>>>>>> On 2018年07月11日 11:49, Tonghao Zhang wrote:
>>>>>>> On Wed, Jul 11, 2018 at 10:56 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>> On 2018年07月04日 12:31, xiangxia.m.yue@gmail.com wrote:
>>>>>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>>>>>>>
>>>>>>>>> This patches improve the guest receive and transmit performance.
>>>>>>>>> On the handle_tx side, we poll the sock receive queue at the same time.
>>>>>>>>> handle_rx do that in the same way.
>>>>>>>>>
>>>>>>>>> For more performance report, see patch 4.
>>>>>>>>>
>>>>>>>>> v4 -> v5:
>>>>>>>>> fix some issues
>>>>>>>>>
>>>>>>>>> v3 -> v4:
>>>>>>>>> fix some issues
>>>>>>>>>
>>>>>>>>> v2 -> v3:
>>>>>>>>> This patches are splited from previous big patch:
>>>>>>>>> http://patchwork.ozlabs.org/patch/934673/
>>>>>>>>>
>>>>>>>>> Tonghao Zhang (4):
>>>>>>>>>        vhost: lock the vqs one by one
>>>>>>>>>        net: vhost: replace magic number of lock annotation
>>>>>>>>>        net: vhost: factor out busy polling logic to vhost_net_busy_poll()
>>>>>>>>>        net: vhost: add rx busy polling in tx path
>>>>>>>>>
>>>>>>>>>       drivers/vhost/net.c   | 108 ++++++++++++++++++++++++++++----------------------
>>>>>>>>>       drivers/vhost/vhost.c |  24 ++++-------
>>>>>>>>>       2 files changed, 67 insertions(+), 65 deletions(-)
>>>>>>>>>
>>>>>>>> Hi, any progress on the new version?
>>>>>>>>
>>>>>>>> I plan to send a new series of packed virtqueue support of vhost. If you
>>>>>>>> plan to send it soon, I can wait. Otherwise, I will send my series.
>>>>>>> I rebase the codes. and find there is no improvement anymore, the
>>>>>>> patches of  makita  may solve the problem. jason you may send your
>>>>>>> patches, and I will do some research on busypoll.
>>>>>> I see. Maybe you can try some bi-directional traffic.
>>>>>>
>>>>>> Btw, lots of optimizations could be done for busy polling. E.g integrating
>>>>>> with host NAPI busy polling or a 100% busy polling vhost_net. You're welcome
>>>>>> to work or propose new ideas.
>>>>>>
>>>>>> Thanks
>>>>> It seems clear we do need adaptive polling.
>>>> Yes.
>>>>
>>>>>     The difficulty with NAPI
>>>>> polling is it can't access guest memory easily. But maybe
>>>>> get_user_pages on the polled memory+NAPI polling can work.
>>>> You mean something like zerocopy? Looks like we can do busy polling without
>>>> it. I mean something like https://patchwork.kernel.org/patch/8707511/.
>>>>
>>>> Thanks
>>> How does this patch work? vhost_vq_avail_empty can sleep,
>>> you are calling it within an rcu read side critical section.
>> Ok, I get your meaning. I have patches to access vring through
>> get_user_pages + vmap() which should help here. (And it increase PPS about
>> 10%-20%).
> Remember you must mark it as dirty on unpin too ...

Ok.

>
>
>>> That's not the only problem btw, another one is that the
>>> CPU time spent polling isn't accounted with the VM.
>>
>> Yes, but it's not the 'issue' of this patch.
> Yes it is. polling within thread context accounts CPU correctly.
>
>> And I believe cgroup can help?
>>
>> Thanks
>
> cgroups are what's broken by polling in irq context.

But I think the NAPI busy polling is still done in process context.

Thanks

>
>>>>>>>> Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: Michael S. Tsirkin @ 2018-07-12  5:24 UTC (permalink / raw)
  To: Jason Wang; +Cc: Linux Kernel Network Developers, virtualization
In-Reply-To: <2898b8aa-8a7c-06ad-9dc0-0e97e2f8e417@redhat.com>

On Thu, Jul 12, 2018 at 01:21:03PM +0800, Jason Wang wrote:
> 
> 
> On 2018年07月12日 11:34, Michael S. Tsirkin wrote:
> > On Thu, Jul 12, 2018 at 11:26:12AM +0800, Jason Wang wrote:
> > > 
> > > On 2018年07月11日 19:59, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 11, 2018 at 01:12:59PM +0800, Jason Wang wrote:
> > > > > On 2018年07月11日 11:49, Tonghao Zhang wrote:
> > > > > > On Wed, Jul 11, 2018 at 10:56 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On 2018年07月04日 12:31, xiangxia.m.yue@gmail.com wrote:
> > > > > > > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > > > > > > > 
> > > > > > > > This patches improve the guest receive and transmit performance.
> > > > > > > > On the handle_tx side, we poll the sock receive queue at the same time.
> > > > > > > > handle_rx do that in the same way.
> > > > > > > > 
> > > > > > > > For more performance report, see patch 4.
> > > > > > > > 
> > > > > > > > v4 -> v5:
> > > > > > > > fix some issues
> > > > > > > > 
> > > > > > > > v3 -> v4:
> > > > > > > > fix some issues
> > > > > > > > 
> > > > > > > > v2 -> v3:
> > > > > > > > This patches are splited from previous big patch:
> > > > > > > > http://patchwork.ozlabs.org/patch/934673/
> > > > > > > > 
> > > > > > > > Tonghao Zhang (4):
> > > > > > > >       vhost: lock the vqs one by one
> > > > > > > >       net: vhost: replace magic number of lock annotation
> > > > > > > >       net: vhost: factor out busy polling logic to vhost_net_busy_poll()
> > > > > > > >       net: vhost: add rx busy polling in tx path
> > > > > > > > 
> > > > > > > >      drivers/vhost/net.c   | 108 ++++++++++++++++++++++++++++----------------------
> > > > > > > >      drivers/vhost/vhost.c |  24 ++++-------
> > > > > > > >      2 files changed, 67 insertions(+), 65 deletions(-)
> > > > > > > > 
> > > > > > > Hi, any progress on the new version?
> > > > > > > 
> > > > > > > I plan to send a new series of packed virtqueue support of vhost. If you
> > > > > > > plan to send it soon, I can wait. Otherwise, I will send my series.
> > > > > > I rebase the codes. and find there is no improvement anymore, the
> > > > > > patches of  makita  may solve the problem. jason you may send your
> > > > > > patches, and I will do some research on busypoll.
> > > > > I see. Maybe you can try some bi-directional traffic.
> > > > > 
> > > > > Btw, lots of optimizations could be done for busy polling. E.g integrating
> > > > > with host NAPI busy polling or a 100% busy polling vhost_net. You're welcome
> > > > > to work or propose new ideas.
> > > > > 
> > > > > Thanks
> > > > It seems clear we do need adaptive polling.
> > > Yes.
> > > 
> > > >    The difficulty with NAPI
> > > > polling is it can't access guest memory easily. But maybe
> > > > get_user_pages on the polled memory+NAPI polling can work.
> > > You mean something like zerocopy? Looks like we can do busy polling without
> > > it. I mean something like https://patchwork.kernel.org/patch/8707511/.
> > > 
> > > Thanks
> > How does this patch work? vhost_vq_avail_empty can sleep,
> > you are calling it within an rcu read side critical section.
> 
> Ok, I get your meaning. I have patches to access vring through
> get_user_pages + vmap() which should help here. (And it increase PPS about
> 10%-20%).

Remember you must mark it as dirty on unpin too ...


> > 
> > That's not the only problem btw, another one is that the
> > CPU time spent polling isn't accounted with the VM.
> 
> 
> Yes, but it's not the 'issue' of this patch.

Yes it is. polling within thread context accounts CPU correctly.

> And I believe cgroup can help?
> 
> Thanks


cgroups are what's broken by polling in irq context.

> > 
> > > > > > > Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: Jason Wang @ 2018-07-12  5:21 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Linux Kernel Network Developers, virtualization
In-Reply-To: <20180712062756-mutt-send-email-mst@kernel.org>



On 2018年07月12日 11:34, Michael S. Tsirkin wrote:
> On Thu, Jul 12, 2018 at 11:26:12AM +0800, Jason Wang wrote:
>>
>> On 2018年07月11日 19:59, Michael S. Tsirkin wrote:
>>> On Wed, Jul 11, 2018 at 01:12:59PM +0800, Jason Wang wrote:
>>>> On 2018年07月11日 11:49, Tonghao Zhang wrote:
>>>>> On Wed, Jul 11, 2018 at 10:56 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2018年07月04日 12:31, xiangxia.m.yue@gmail.com wrote:
>>>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>>>>>
>>>>>>> This patches improve the guest receive and transmit performance.
>>>>>>> On the handle_tx side, we poll the sock receive queue at the same time.
>>>>>>> handle_rx do that in the same way.
>>>>>>>
>>>>>>> For more performance report, see patch 4.
>>>>>>>
>>>>>>> v4 -> v5:
>>>>>>> fix some issues
>>>>>>>
>>>>>>> v3 -> v4:
>>>>>>> fix some issues
>>>>>>>
>>>>>>> v2 -> v3:
>>>>>>> This patches are splited from previous big patch:
>>>>>>> http://patchwork.ozlabs.org/patch/934673/
>>>>>>>
>>>>>>> Tonghao Zhang (4):
>>>>>>>       vhost: lock the vqs one by one
>>>>>>>       net: vhost: replace magic number of lock annotation
>>>>>>>       net: vhost: factor out busy polling logic to vhost_net_busy_poll()
>>>>>>>       net: vhost: add rx busy polling in tx path
>>>>>>>
>>>>>>>      drivers/vhost/net.c   | 108 ++++++++++++++++++++++++++++----------------------
>>>>>>>      drivers/vhost/vhost.c |  24 ++++-------
>>>>>>>      2 files changed, 67 insertions(+), 65 deletions(-)
>>>>>>>
>>>>>> Hi, any progress on the new version?
>>>>>>
>>>>>> I plan to send a new series of packed virtqueue support of vhost. If you
>>>>>> plan to send it soon, I can wait. Otherwise, I will send my series.
>>>>> I rebase the codes. and find there is no improvement anymore, the
>>>>> patches of  makita  may solve the problem. jason you may send your
>>>>> patches, and I will do some research on busypoll.
>>>> I see. Maybe you can try some bi-directional traffic.
>>>>
>>>> Btw, lots of optimizations could be done for busy polling. E.g integrating
>>>> with host NAPI busy polling or a 100% busy polling vhost_net. You're welcome
>>>> to work or propose new ideas.
>>>>
>>>> Thanks
>>> It seems clear we do need adaptive polling.
>> Yes.
>>
>>>    The difficulty with NAPI
>>> polling is it can't access guest memory easily. But maybe
>>> get_user_pages on the polled memory+NAPI polling can work.
>> You mean something like zerocopy? Looks like we can do busy polling without
>> it. I mean something like https://patchwork.kernel.org/patch/8707511/.
>>
>> Thanks
> How does this patch work? vhost_vq_avail_empty can sleep,
> you are calling it within an rcu read side critical section.

Ok, I get your meaning. I have patches to access vring through 
get_user_pages + vmap() which should help here. (And it increases PPS by 
about 10%-20%).

>
> That's not the only problem btw, another one is that the
> CPU time spent polling isn't accounted with the VM.


Yes, but it's not the 'issue' of this patch. And I believe cgroup can help?

Thanks

>
>>>>>> Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: Michael S. Tsirkin @ 2018-07-12  3:34 UTC (permalink / raw)
  To: Jason Wang; +Cc: Linux Kernel Network Developers, virtualization
In-Reply-To: <0dcaf8f1-01b5-417d-420b-d5b716a82a8a@redhat.com>

On Thu, Jul 12, 2018 at 11:26:12AM +0800, Jason Wang wrote:
> 
> 
> On 2018年07月11日 19:59, Michael S. Tsirkin wrote:
> > On Wed, Jul 11, 2018 at 01:12:59PM +0800, Jason Wang wrote:
> > > 
> > > On 2018年07月11日 11:49, Tonghao Zhang wrote:
> > > > On Wed, Jul 11, 2018 at 10:56 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > 
> > > > > On 2018年07月04日 12:31, xiangxia.m.yue@gmail.com wrote:
> > > > > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > > > > > 
> > > > > > This patches improve the guest receive and transmit performance.
> > > > > > On the handle_tx side, we poll the sock receive queue at the same time.
> > > > > > handle_rx do that in the same way.
> > > > > > 
> > > > > > For more performance report, see patch 4.
> > > > > > 
> > > > > > v4 -> v5:
> > > > > > fix some issues
> > > > > > 
> > > > > > v3 -> v4:
> > > > > > fix some issues
> > > > > > 
> > > > > > v2 -> v3:
> > > > > > This patches are splited from previous big patch:
> > > > > > http://patchwork.ozlabs.org/patch/934673/
> > > > > > 
> > > > > > Tonghao Zhang (4):
> > > > > >      vhost: lock the vqs one by one
> > > > > >      net: vhost: replace magic number of lock annotation
> > > > > >      net: vhost: factor out busy polling logic to vhost_net_busy_poll()
> > > > > >      net: vhost: add rx busy polling in tx path
> > > > > > 
> > > > > >     drivers/vhost/net.c   | 108 ++++++++++++++++++++++++++++----------------------
> > > > > >     drivers/vhost/vhost.c |  24 ++++-------
> > > > > >     2 files changed, 67 insertions(+), 65 deletions(-)
> > > > > > 
> > > > > Hi, any progress on the new version?
> > > > > 
> > > > > I plan to send a new series of packed virtqueue support of vhost. If you
> > > > > plan to send it soon, I can wait. Otherwise, I will send my series.
> > > > I rebase the codes. and find there is no improvement anymore, the
> > > > patches of  makita  may solve the problem. jason you may send your
> > > > patches, and I will do some research on busypoll.
> > > I see. Maybe you can try some bi-directional traffic.
> > > 
> > > Btw, lots of optimizations could be done for busy polling. E.g integrating
> > > with host NAPI busy polling or a 100% busy polling vhost_net. You're welcome
> > > to work or propose new ideas.
> > > 
> > > Thanks
> > It seems clear we do need adaptive polling.
> 
> Yes.
> 
> >   The difficulty with NAPI
> > polling is it can't access guest memory easily. But maybe
> > get_user_pages on the polled memory+NAPI polling can work.
> 
> You mean something like zerocopy? Looks like we can do busy polling without
> it. I mean something like https://patchwork.kernel.org/patch/8707511/.
> 
> Thanks

How does this patch work? vhost_vq_avail_empty can sleep,
you are calling it within an rcu read side critical section.

That's not the only problem btw, another one is that the
CPU time spent polling isn't accounted with the VM.

> > 
> > > > > Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v5 0/4] net: vhost: improve performance when enable busyloop
From: Jason Wang @ 2018-07-12  3:26 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Linux Kernel Network Developers, virtualization
In-Reply-To: <20180711145421-mutt-send-email-mst@kernel.org>



On 2018年07月11日 19:59, Michael S. Tsirkin wrote:
> On Wed, Jul 11, 2018 at 01:12:59PM +0800, Jason Wang wrote:
>>
>> On 2018年07月11日 11:49, Tonghao Zhang wrote:
>>> On Wed, Jul 11, 2018 at 10:56 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>
>>>> On 2018年07月04日 12:31, xiangxia.m.yue@gmail.com wrote:
>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>>>
>>>>> This patches improve the guest receive and transmit performance.
>>>>> On the handle_tx side, we poll the sock receive queue at the same time.
>>>>> handle_rx do that in the same way.
>>>>>
>>>>> For more performance report, see patch 4.
>>>>>
>>>>> v4 -> v5:
>>>>> fix some issues
>>>>>
>>>>> v3 -> v4:
>>>>> fix some issues
>>>>>
>>>>> v2 -> v3:
>>>>> This patches are splited from previous big patch:
>>>>> http://patchwork.ozlabs.org/patch/934673/
>>>>>
>>>>> Tonghao Zhang (4):
>>>>>      vhost: lock the vqs one by one
>>>>>      net: vhost: replace magic number of lock annotation
>>>>>      net: vhost: factor out busy polling logic to vhost_net_busy_poll()
>>>>>      net: vhost: add rx busy polling in tx path
>>>>>
>>>>>     drivers/vhost/net.c   | 108 ++++++++++++++++++++++++++++----------------------
>>>>>     drivers/vhost/vhost.c |  24 ++++-------
>>>>>     2 files changed, 67 insertions(+), 65 deletions(-)
>>>>>
>>>> Hi, any progress on the new version?
>>>>
>>>> I plan to send a new series of packed virtqueue support of vhost. If you
>>>> plan to send it soon, I can wait. Otherwise, I will send my series.
>>> I rebase the codes. and find there is no improvement anymore, the
>>> patches of  makita  may solve the problem. jason you may send your
>>> patches, and I will do some research on busypoll.
>> I see. Maybe you can try some bi-directional traffic.
>>
>> Btw, lots of optimizations could be done for busy polling. E.g integrating
>> with host NAPI busy polling or a 100% busy polling vhost_net. You're welcome
>> to work or propose new ideas.
>>
>> Thanks
> It seems clear we do need adaptive polling.

Yes.

>   The difficulty with NAPI
> polling is it can't access guest memory easily. But maybe
> get_user_pages on the polled memory+NAPI polling can work.

You mean something like zerocopy? Looks like we can do busy polling 
without it. I mean something like 
https://patchwork.kernel.org/patch/8707511/.

Thanks

>
>>>> Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Wei Wang @ 2018-07-12  2:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, nilal, liliang.opensource,
	Linux Kernel Mailing List, virtualization, linux-mm,
	Paolo Bonzini, Andrew Morton, Michal Hocko
In-Reply-To: <CA+55aFxyv=EUAJFUSio=k+pm3ddteojshP7Radjia5ZRgm53zQ@mail.gmail.com>

On 07/12/2018 10:30 AM, Linus Torvalds wrote:
> On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>> Would it be better to remove __GFP_THISNODE? We actually want to get all
>> the guest free pages (from all the nodes).
> Maybe. Or maybe it would be better to have the memory balloon logic be
> per-node? Maybe you don't want to remove too much memory from one
> node? I think it's one of those "play with it" things.
>
> I don't think that's the big issue, actually. I think the real issue
> is how to react quickly and gracefully to "oops, I'm trying to give
> memory away, but now the guest wants it back" while you're in the
> middle of trying to create that 2TB list of pages.

OK. virtio-balloon has already registered an oom notifier 
(virtballoon_oom_notify). I plan to add some control there. If oom happens,
- stop the page allocation;
- immediately give back the allocated pages to mm.

Best,
Wei

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Linus Torvalds @ 2018-07-12  2:30 UTC (permalink / raw)
  To: wei.w.wang
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, nilal, liliang.opensource,
	Linux Kernel Mailing List, virtualization, linux-mm,
	Paolo Bonzini, Andrew Morton, Michal Hocko
In-Reply-To: <5B46BB46.2080802@intel.com>

On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>
> Would it be better to remove __GFP_THISNODE? We actually want to get all
> the guest free pages (from all the nodes).

Maybe. Or maybe it would be better to have the memory balloon logic be
per-node? Maybe you don't want to remove too much memory from one
node? I think it's one of those "play with it" things.

I don't think that's the big issue, actually. I think the real issue
is how to react quickly and gracefully to "oops, I'm trying to give
memory away, but now the guest wants it back" while you're in the
middle of trying to create that 2TB list of pages.

IOW, I think the real work is in whatever tuning for the right
behavior. But I'm just guessing.

             Linus

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Wei Wang @ 2018-07-12  2:21 UTC (permalink / raw)
  To: Linus Torvalds, Michal Hocko
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal
In-Reply-To: <CA+55aFwku2tDH4+rfaC67xc4-cEwSrXgnQaci=e2id5ZCRE9JQ@mail.gmail.com>

On 07/12/2018 12:23 AM, Linus Torvalds wrote:
> On Wed, Jul 11, 2018 at 2:21 AM Michal Hocko <mhocko@kernel.org> wrote:
>> We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
>> So why do we need any array based interface?
> That was actually my original argument in the original thread - that
> the only new interface people might want is one that just tells how
> many of those MAX_ORDER-1 pages there are.
>
> See the thread in v33 with the subject
>
>    "[PATCH v33 1/4] mm: add a function to get free page blocks"
>
> and look for me suggesting just using
>
>      #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN |
> __GFP_THISNODE | __GFP_NOMEMALLOC)

Would it be better to remove __GFP_THISNODE? We actually want to get all 
the guest free pages (from all the nodes).

Best,
Wei

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Michael S. Tsirkin @ 2018-07-11 19:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	liliang.opensource, Linux Kernel Mailing List, virtualization,
	linux-mm, Paolo Bonzini, Andrew Morton, nilal, Linus Torvalds
In-Reply-To: <20180711110949.GJ20050@dhcp22.suse.cz>

On Wed, Jul 11, 2018 at 01:09:49PM +0200, Michal Hocko wrote:
> But let me note that I am not really convinced how this (or previous)
> approach will really work in most workloads. We tend to cache heavily so
> there is rarely any memory free.

It might be that it's worth flushing the cache when VM is
migrating. Or maybe we should implement virtio-tmem or add
transcendent memory support to the balloon.

-- 
MST

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Linus Torvalds @ 2018-07-11 16:23 UTC (permalink / raw)
  To: Michal Hocko
  Cc: yang.zhang.wz, virtio-dev, Rik van Riel, quan.xu0, KVM list,
	Michael S. Tsirkin, liliang.opensource, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton, nilal
In-Reply-To: <20180711092152.GE20050@dhcp22.suse.cz>

On Wed, Jul 11, 2018 at 2:21 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
> So why do we need any array based interface?

That was actually my original argument in the original thread - that
the only new interface people might want is one that just tells how
many of those MAX_ORDER-1 pages there are.

See the thread in v33 with the subject

  "[PATCH v33 1/4] mm: add a function to get free page blocks"

and look for me suggesting just using

    #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN |
__GFP_THISNODE | __GFP_NOMEMALLOC)

    struct page *page =  alloc_pages(GFP_MINFLAGS, MAX_ORDER-1);

for this all.

But I could also see an argument for "allocate N pages of size
MAX_ORDER-1", with some small N, simply because I can see the
advantage of not taking and releasing the locking and looking up the
zone individually N times.

If you want to get gigabytes of memory (or terabytes), doing it in
bigger chunks than one single maximum-sized page sounds fairly
reasonable.

I just don't think that "thousands of pages" is reasonable. But "tens
of max-sized pages" sounds fair enough to me, and it would certainly
not be a pain for the VM.

So I'm open to new interfaces. I just want those new interfaces to
make sense, and be low latency and simple for the VM to do. I'm
objecting to the incredibly baroque and heavy-weight one that can
return near-infinite amounts of memory.

The real advantage of just the existing "alloc_pages()" model is
that I think the ballooning people can use that to *test* things out.
If it turns out that taking and releasing the VM locks is a big cost,
we can see if a batch interface that allows you to get tens of pages
at the same time is worth it.

So yes, I'd suggest starting with just the existing alloc_pages. Maybe
it's not enough, but it should be good enough for testing.

                    Linus

^ permalink raw reply

* Re: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Michal Hocko @ 2018-07-11 14:38 UTC (permalink / raw)
  To: Wang, Wei W
  Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
	Rik van Riel, quan.xu0@gmail.com, KVM list, Michael S. Tsirkin,
	liliang.opensource@gmail.com, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton,
	nilal@redhat.com, Linus Torvalds
In-Reply-To: <286AC319A985734F985F78AFA26841F7396EEFD8@SHSMSX101.ccr.corp.intel.com>

On Wed 11-07-18 13:55:15, Wang, Wei W wrote:
> On Wednesday, July 11, 2018 7:10 PM, Michal Hocko wrote:
> > On Wed 11-07-18 18:52:45, Wei Wang wrote:
> > > On 07/11/2018 05:21 PM, Michal Hocko wrote:
> > > > On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
> > > > [...]
> > > > > That was what I tried to encourage with actually removing the
> > > > > pages from the page list. That would be an _incremental_
> > > > > interface. You can remove MAX_ORDER-1 pages one by one (or a
> > > > > hundred at a time), and mark them free for ballooning that way.
> > > > > And if you still feel you have tons of free memory, just continue
> > removing more pages from the free list.
> > > > We already have an interface for that. alloc_pages(GFP_NOWAIT,
> > MAX_ORDER -1).
> > > > So why do we need any array based interface?
> > >
> > > Yes, I'm trying to get free pages directly via alloc_pages, so there
> > > will be no new mm APIs.
> > 
> > OK. The above was just a rough example. In fact you would need a more
> > complex gfp mask. I assume you only want to balloon only memory directly
> > usable by the kernel so it will be
> > 	(GFP_KERNEL | __GFP_NOWARN) & ~__GFP_RECLAIM
> 
> Sounds good to me, thanks.
> 
> > 
> > > I plan to let free page allocation stop when the remaining system free
> > > memory becomes close to min_free_kbytes (prevent swapping).
> > 
> > ~__GFP_RECLAIM will make sure you are allocate as long as there is any
> > memory without reclaim. It will not even poke the kswapd to do the
> > background work. So I do not think you would need much more than that.
> 
> "close to min_free_kbytes" - I meant when doing the allocations, we
> intentionally reserve some small amount of memory, e.g. 2 free page
> blocks of "MAX_ORDER - 1". So when other applications happen to do
> some allocation, they may easily get some from the reserved memory
> left on the free list. Without that reserved memory, other allocation
> may cause the system free memory below the WMARK[MIN], and kswapd
> would start to do swapping. This is actually just a small optimization
> to reduce the probability of causing swapping (nice to have, but not
> mandatory because we will allocate free page blocks one by one).

I really have a hard time following you here. Nothing outside of the core
MM proper should play with watermarks.
 
> > But let me note that I am not really convinced how this (or previous)
> > approach will really work in most workloads. We tend to cache heavily so
> > there is rarely any memory free.
> 
> With less free memory, the improvement becomes less, but should be
> nicer than no optimization. For example, the Linux build workload
> would cause 4~5 GB (out of 8GB) memory to be used as page cache at the
> final stage, there is still ~44% live migration time reduction.

But most systems will stay somewhere around the high watermark if there
is any page cache activity. Especially after a longer uptime.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* RE: [PATCH v35 1/5] mm: support to get hints of free page blocks
From: Wang, Wei W @ 2018-07-11 13:55 UTC (permalink / raw)
  To: Michal Hocko
  Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
	Rik van Riel, quan.xu0@gmail.com, KVM list, Michael S. Tsirkin,
	liliang.opensource@gmail.com, Linux Kernel Mailing List,
	virtualization, linux-mm, Paolo Bonzini, Andrew Morton,
	nilal@redhat.com, Linus Torvalds
In-Reply-To: <20180711110949.GJ20050@dhcp22.suse.cz>

On Wednesday, July 11, 2018 7:10 PM, Michal Hocko wrote:
> On Wed 11-07-18 18:52:45, Wei Wang wrote:
> > On 07/11/2018 05:21 PM, Michal Hocko wrote:
> > > On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
> > > [...]
> > > > That was what I tried to encourage with actually removing the
> > > > pages from the page list. That would be an _incremental_
> > > > interface. You can remove MAX_ORDER-1 pages one by one (or a
> > > > hundred at a time), and mark them free for ballooning that way.
> > > > And if you still feel you have tons of free memory, just continue
> removing more pages from the free list.
> > > We already have an interface for that. alloc_pages(GFP_NOWAIT,
> MAX_ORDER -1).
> > > So why do we need any array based interface?
> >
> > Yes, I'm trying to get free pages directly via alloc_pages, so there
> > will be no new mm APIs.
> 
> OK. The above was just a rough example. In fact you would need a more
> complex gfp mask. I assume you only want to balloon only memory directly
> usable by the kernel so it will be
> 	(GFP_KERNEL | __GFP_NOWARN) & ~__GFP_RECLAIM

Sounds good to me, thanks.

> 
> > I plan to let free page allocation stop when the remaining system free
> > memory becomes close to min_free_kbytes (prevent swapping).
> 
> ~__GFP_RECLAIM will make sure you are allocate as long as there is any
> memory without reclaim. It will not even poke the kswapd to do the
> background work. So I do not think you would need much more than that.

"close to min_free_kbytes" - I meant when doing the allocations, we intentionally reserve some small amount of memory, e.g. 2 free page blocks of "MAX_ORDER - 1". So when other applications happen to do some allocation, they may easily get some from the reserved memory left on the free list. Without that reserved memory, other allocation may cause the system free memory to fall below the WMARK[MIN], and kswapd would start to do swapping. This is actually just a small optimization to reduce the probability of causing swapping (nice to have, but not mandatory because we will allocate free page blocks one by one).

> But let me note that I am not really convinced how this (or previous)
> approach will really work in most workloads. We tend to cache heavily so
> there is rarely any memory free.

With less free memory, the improvement becomes less, but should be nicer than no optimization. For example, the Linux build workload would cause 4~5 GB (out of 8GB) memory to be used as page cache at the final stage, there is still ~44% live migration time reduction.

Since we have many cloud customers interested in this feature, I think we can let them test the usefulness.

Best,
Wei

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox