Linux virtualization list
 help / color / mirror / Atom feed
* [RFC V5 PATCH 3/8] vhost: do not use vring_used_elem
From: Jason Wang @ 2018-05-29  2:10 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <1527559830-8133-1-git-send-email-jasowang@redhat.com>

Instead of depending on the exported vring_used_elem, this patch
switches to use a new internal structure vhost_used_elem which embed
vring_used_elem in itself. This could be used to let vhost to record
extra metadata for the incoming packed ring layout.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 19 +++++++-------
 drivers/vhost/scsi.c  | 10 ++++----
 drivers/vhost/vhost.c | 68 ++++++++++++---------------------------------------
 drivers/vhost/vhost.h | 18 ++++++++------
 drivers/vhost/vsock.c |  6 ++---
 5 files changed, 45 insertions(+), 76 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 826489c..3826f1f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,10 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+		if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
-			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+		if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+			vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
 			++j;
 		} else
 			break;
@@ -367,7 +367,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	rcu_read_lock_bh();
 
 	/* set len to mark this desc buffers done DMA */
-	vq->heads[ubuf->desc].len = success ?
+	vq->heads[ubuf->desc].elem.len = success ?
 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
 	cnt = vhost_net_ubuf_put(ubufs);
 
@@ -426,7 +426,7 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
-				    struct vring_used_elem *used_elem,
+				    struct vhost_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
@@ -477,7 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -542,9 +542,10 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id =
-				cpu_to_vhost32(vq, used.id);
-			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+			vq->heads[nvq->upend_idx].elem.id =
+				cpu_to_vhost32(vq, used.elem.id);
+			vq->heads[nvq->upend_idx].elem.len =
+				VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 654c71f..ac11412 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	struct vring_used_elem tvc_vq_used;
+	struct vhost_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	unsigned out, in;
 	int ret;
 
@@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   struct vring_used_elem *used, unsigned out)
+			   struct vhost_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
@@ -837,7 +837,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 					ARRAY_SIZE(vq->iov), &out, &in,
 					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 used.id, out, in);
+			 used.elem.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 296bd5e..2b2a776 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -728,41 +728,6 @@ static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 			  struct iovec iov[], int iov_size, int access);
 
-static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
-			      const void *from, unsigned size)
-{
-	int ret;
-
-	if (!vq->iotlb)
-		return __copy_to_user(to, from, size);
-	else {
-		/* This function should be called after iotlb
-		 * prefetch, which means we're sure that all vq
-		 * could be access through iotlb. So -EAGAIN should
-		 * not happen in this case.
-		 */
-		struct iov_iter t;
-		void __user *uaddr = vhost_vq_meta_fetch(vq,
-				     (u64)(uintptr_t)to, size,
-				     VHOST_ADDR_USED);
-
-		if (uaddr)
-			return __copy_to_user(uaddr, from, size);
-
-		ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
-				     ARRAY_SIZE(vq->iotlb_iov),
-				     VHOST_ACCESS_WO);
-		if (ret < 0)
-			goto out;
-		iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
-		ret = copy_to_iter(from, size, &t);
-		if (ret == size)
-			ret = 0;
-	}
-out:
-	return ret;
-}
-
 static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
 				void __user *from, unsigned size)
 {
@@ -1958,7 +1923,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct vring_used_elem *used,
+		      struct vhost_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -2009,7 +1974,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
-	used->id = ring_head;
+	used->elem.id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2103,9 +2068,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
-	used->len = cpu_to_vhost32(vq, len);
+	used->elem.len = cpu_to_vhost32(vq, len);
 }
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
@@ -2119,7 +2084,7 @@ static void vhost_set_used_len(struct vhost_virtqueue *vq,
  *	returns number of buffer heads allocated, negative on error
  */
 int vhost_get_bufs(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *heads,
+		   struct vhost_used_elem *heads,
 		   int datalen,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
@@ -2192,7 +2157,7 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+int vhost_add_used(struct vhost_virtqueue *vq, struct vhost_used_elem *used,
 		   int len)
 {
 	vhost_set_used_len(vq, used, len);
@@ -2201,27 +2166,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
-			    struct vring_used_elem *heads,
+			    struct vhost_used_elem *heads,
 			    unsigned count)
 {
 	struct vring_used_elem __user *used;
 	u16 old, new;
-	int start;
+	int start, i;
 
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
-	if (count == 1) {
-		if (vhost_put_user(vq, heads[0].id, &used->id)) {
+	for (i = 0; i < count; i++) {
+		if (unlikely(vhost_put_user(vq, heads[i].elem.id,
+					    &used[i].id))) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
-		if (vhost_put_user(vq, heads[0].len, &used->len)) {
+		if (unlikely(vhost_put_user(vq, heads[i].elem.len,
+					    &used[i].len))) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
-	} else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
-		vq_err(vq, "Failed to write used");
-		return -EFAULT;
 	}
 	if (unlikely(vq->log_used)) {
 		/* Make sure data is seen before log. */
@@ -2245,7 +2209,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 		     unsigned count)
 {
 	int start, n, r;
@@ -2329,7 +2293,7 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
 	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
@@ -2339,7 +2303,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 /* multi-buffer version of vhost_add_used_and_signal */
 void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 				 struct vhost_virtqueue *vq,
-				 struct vring_used_elem *heads, unsigned count)
+				 struct vhost_used_elem *heads, unsigned count)
 {
 	vhost_add_used_n(vq, heads, count);
 	vhost_signal(dev, vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a7cc7e7..8dea44b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -34,6 +34,10 @@ struct vhost_poll {
 	struct vhost_dev	 *dev;
 };
 
+struct vhost_used_elem {
+	struct vring_used_elem elem;
+};
+
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
@@ -126,7 +130,7 @@ struct vhost_virtqueue {
 	struct iovec iov[UIO_MAXIOV];
 	struct iovec iotlb_iov[64];
 	struct iovec *indirect;
-	struct vring_used_elem *heads;
+	struct vhost_used_elem *heads;
 	/* Protected by virtqueue mutex. */
 	struct vhost_umem *umem;
 	struct vhost_umem *iotlb;
@@ -182,12 +186,12 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
-		      struct vring_used_elem *used_elem,
+		      struct vhost_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
 int vhost_get_bufs(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *heads,
+		   struct vhost_used_elem *heads,
 		   int datalen,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
@@ -198,13 +202,13 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *elem, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		   struct vhost_used_elem *elem, int len);
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *, int len);
+			       struct vhost_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *heads, unsigned count);
+			       struct vhost_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 59a01cd..695694f 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
-		struct vring_used_elem used;
+		struct vhost_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = vhost32_to_cpu(vq, used.len);
+		len = vhost32_to_cpu(vq, used.elem.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -346,7 +346,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	int ret;
 	unsigned int out, in;
 	bool added = false;
-- 
2.7.4

^ permalink raw reply related

* [RFC V5 PATCH 2/8] vhost: hide used ring layout from device
From: Jason Wang @ 2018-05-29  2:10 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <1527559830-8133-1-git-send-email-jasowang@redhat.com>

We used to return descriptor head by vhost_get_vq_desc() to device and
pass it back to vhost_add_used() and its friends. This exposes the
internal used ring layout to device which makes it hard to be extended for
e.g packed ring layout.

So this patch tries to hide the used ring layout by

- letting vhost_get_vq_desc() return pointer to struct vring_used_elem
- accepting pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()

This could help to hide used ring layout and make it easier to
implement packed ring on top.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 46 +++++++++++++++++++++-----------------
 drivers/vhost/scsi.c  | 62 +++++++++++++++++++++++++++------------------------
 drivers/vhost/vhost.c | 52 +++++++++++++++++++++---------------------
 drivers/vhost/vhost.h |  9 +++++---
 drivers/vhost/vsock.c | 42 +++++++++++++++++-----------------
 5 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 762aa81..826489c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -426,22 +426,24 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
+				    struct vring_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
 	unsigned long uninitialized_var(endtime);
-	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+	int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
-	if (r == vq->num && vq->busyloop_timeout) {
+	if (r == -ENOSPC && vq->busyloop_timeout) {
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(vq->dev, endtime) &&
 		       vhost_vq_avail_empty(vq->dev, vq))
 			cpu_relax();
 		preempt_enable();
-		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				      out_num, in_num, NULL, NULL);
+		r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+				      ARRAY_SIZE(vq->iov), out_num, in_num,
+				      NULL, NULL);
 	}
 
 	return r;
@@ -463,7 +465,6 @@ static void handle_tx(struct vhost_net *net)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned out, in;
-	int head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -476,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	struct vring_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -499,20 +501,20 @@ static void handle_tx(struct vhost_net *net)
 			vhost_zerocopy_signal_used(net, vq);
 
 
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+		err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+					       ARRAY_SIZE(vq->iov),
+					       &out, &in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (err == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			break;
 		if (in) {
 			vq_err(vq, "Unexpected descriptor format for TX: "
 			       "out %d, int %d\n", out, in);
@@ -540,7 +542,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+			vq->heads[nvq->upend_idx].id =
+				cpu_to_vhost32(vq, used.id);
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
@@ -581,7 +584,7 @@ static void handle_tx(struct vhost_net *net)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
 		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+			vhost_add_used_and_signal(&net->dev, vq, &used, 0);
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
@@ -713,14 +716,12 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
-					   &in, vq_log, &log,
-					   likely(mergeable) ? UIO_MAXIOV : 1);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(headcount < 0))
-			goto out;
+		err = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+				     &in, vq_log, &log,
+				     likely(mergeable) ? UIO_MAXIOV : 1,
+				     &headcount);
 		/* OK, now we need to know about added descriptors. */
-		if (!headcount) {
+		if (err == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -731,6 +732,9 @@ static void handle_rx(struct vhost_net *net)
 			 * they refilled. */
 			goto out;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			goto out;
 		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 7ad5709..654c71f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	int tvc_vq_desc;
+	struct vring_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,8 +441,9 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
+	struct vring_used_elem used;
 	unsigned out, in;
-	int head, ret;
+	int ret;
 
 	if (!vq->private_data) {
 		vs->vs_events_missed = true;
@@ -451,16 +452,16 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 
 again:
 	vhost_disable_notify(&vs->dev, vq);
-	head = vhost_get_vq_desc(vq, vq->iov,
+	ret = vhost_get_vq_desc(vq, &used, vq->iov,
 			ARRAY_SIZE(vq->iov), &out, &in,
 			NULL, NULL);
-	if (head < 0) {
+	if (ret == -ENOSPC) {
+		if (vhost_enable_notify(&vs->dev, vq))
+			goto again;
 		vs->vs_events_missed = true;
 		return;
 	}
-	if (head == vq->num) {
-		if (vhost_enable_notify(&vs->dev, vq))
-			goto again;
+	if (ret < 0) {
 		vs->vs_events_missed = true;
 		return;
 	}
@@ -480,7 +481,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	eventp = vq->iov[out].iov_base;
 	ret = __copy_to_user(eventp, event, sizeof(*event));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &used, 0);
 	else
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
 }
@@ -541,7 +542,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
 		if (likely(ret == sizeof(v_rsp))) {
 			struct vhost_scsi_virtqueue *q;
-			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+			vhost_add_used(cmd->tvc_vq, &cmd->tvc_vq_used, 0);
 			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
 			vq = q - vs->vqs;
 			__set_bit(vq, signal);
@@ -784,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   int head, unsigned out)
+			   struct vring_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -795,7 +796,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 	resp = vq->iov[out].iov_base;
 	ret = __copy_to_user(resp, &rsp, sizeof(rsp));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, used, 0);
 	else
 		pr_err("Faulted on virtio_scsi_cmd_resp\n");
 }
@@ -807,11 +808,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
+	struct vring_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
 	unsigned int out = 0, in = 0;
-	int head, ret, prot_bytes;
+	int ret, prot_bytes;
 	size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
 	size_t out_size, in_size;
 	u16 lun;
@@ -831,22 +833,22 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	vhost_disable_notify(&vs->dev, vq);
 
 	for (;;) {
-		head = vhost_get_vq_desc(vq, vq->iov,
-					 ARRAY_SIZE(vq->iov), &out, &in,
-					 NULL, NULL);
+		ret = vhost_get_vq_desc(vq, &used, vq->iov,
+					ARRAY_SIZE(vq->iov), &out, &in,
+					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 head, out, in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+			 used.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
 				vhost_disable_notify(&vs->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(ret < 0))
+			break;
 		/*
 		 * Check for a sane response buffer so we can report early
 		 * errors back to the guest.
@@ -891,20 +893,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 		if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) {
 			vq_err(vq, "Faulted on copy_from_iter\n");
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/* virtio-scsi spec requires byte 0 of the lun to be 1 */
 		if (unlikely(*lunp != 1)) {
 			vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 
 		tpg = READ_ONCE(vs_tpg[*target]);
 		if (unlikely(!tpg)) {
 			/* Target does not exist, fail the request */
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/*
@@ -950,7 +952,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_TO_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesout,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout);
@@ -958,7 +961,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_FROM_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesin,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
@@ -996,7 +1000,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			vq_err(vq, "Received SCSI CDB with command_size: %d that"
 				" exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n",
 				scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr,
@@ -1005,7 +1009,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		if (IS_ERR(cmd)) {
 			vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
 			       PTR_ERR(cmd));
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd->tvc_vhost = vs;
@@ -1025,7 +1029,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			if (unlikely(ret)) {
 				vq_err(vq, "Failed to map iov to sgl\n");
 				vhost_scsi_release_cmd(&cmd->tvc_se_cmd);
-				vhost_scsi_send_bad_target(vs, vq, head, out);
+				vhost_scsi_send_bad_target(vs, vq, &used, out);
 				continue;
 			}
 		}
@@ -1034,7 +1038,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		 * complete the virtio-scsi request in TCM callback context via
 		 * vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
 		 */
-		cmd->tvc_vq_desc = head;
+		cmd->tvc_vq_used = used;
 		/*
 		 * Dispatch cmd descriptor for cmwq execution in process
 		 * context provided by vhost_scsi_workqueue.  This also ensures
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 096a688..296bd5e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1958,6 +1958,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vring_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -1990,7 +1991,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		 * invalid.
 		 */
 		if (vq->avail_idx == last_avail_idx)
-			return vq->num;
+			return -ENOSPC;
 
 		/* Only get avail ring entries after they have been
 		 * exposed by guest.
@@ -2008,6 +2009,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
+	used->id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2096,10 +2098,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
-	return head;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+static void vhost_set_used_len(struct vhost_virtqueue *vq,
+			       struct vring_used_elem *used, int len)
+{
+	used->len = cpu_to_vhost32(vq, len);
+}
+
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
  * @vq		- the relevant virtqueue
@@ -2116,13 +2124,13 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota)
+		   unsigned int quota,
+		   s16 *count)
 {
 	unsigned int out, in;
 	int seg = 0;
 	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
+	int r = 0, nlogs = 0;
 	/* len is always initialized before use since we are always called with
 	 * datalen > 0.
 	 */
@@ -2133,17 +2141,12 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
+		r = vhost_get_vq_desc(vq, &heads[headcount], vq->iov + seg,
 				      ARRAY_SIZE(vq->iov) - seg, &out,
 				      &in, log, log_num);
 		if (unlikely(r < 0))
 			goto err;
 
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
 		if (unlikely(out || in <= 0)) {
 			vq_err(vq, "unexpected descriptor format for RX: "
 				"out %d, in %d\n", out, in);
@@ -2154,24 +2157,26 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			nlogs += *log_num;
 			log += *log_num;
 		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
+
 		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
+		vhost_set_used_len(vq, &heads[headcount], len);
 		datalen -= len;
 		++headcount;
 		seg += in;
 	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	vhost_set_used_len(vq, &heads[headcount - 1], len + datalen);
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
 
 	/* Detect overrun */
 	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
+		headcount = UIO_MAXIOV + 1;
 		goto err;
 	}
-	return headcount;
+
+	*count = headcount;
+	return 0;
 err:
 	vhost_discard_vq_desc(vq, headcount);
 	return r;
@@ -2187,14 +2192,11 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+		   int len)
 {
-	struct vring_used_elem heads = {
-		cpu_to_vhost32(vq, head),
-		cpu_to_vhost32(vq, len)
-	};
-
-	return vhost_add_used_n(vq, &heads, 1);
+	vhost_set_used_len(vq, used, len);
+	return vhost_add_used_n(vq, used, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
@@ -2327,9 +2329,9 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       unsigned int head, int len)
+			       struct vring_used_elem *used, int len)
 {
-	vhost_add_used(vq, head, len);
+	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 52edd242..a7cc7e7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -182,6 +182,7 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
+		      struct vring_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
@@ -191,15 +192,17 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota);
+		   unsigned int quota,
+		   s16 *count);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *elem, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int id, int len);
+			       struct vring_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 34bc3ab..59a01cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,11 +98,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
+		struct vring_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
 		size_t len;
-		int head;
+		int ret;
 
 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -116,16 +117,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0) {
-			spin_lock_bh(&vsock->send_pkt_list_lock);
-			list_add(&pkt->list, &vsock->send_pkt_list);
-			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			break;
-		}
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -139,6 +133,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			}
 			break;
 		}
+		if (ret < 0) {
+			spin_lock_bh(&vsock->send_pkt_list_lock);
+			list_add(&pkt->list, &vsock->send_pkt_list);
+			spin_unlock_bh(&vsock->send_pkt_list_lock);
+			break;
+		}
 
 		if (out) {
 			virtio_transport_free_pkt(pkt);
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = iov_length(&vq->iov[out], in);
+		len = vhost32_to_cpu(vq, used.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -163,7 +163,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + pkt->len);
 		added = true;
 
 		if (pkt->reply) {
@@ -346,7 +346,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	int head;
+	struct vring_used_elem used;
+	int ret;
 	unsigned int out, in;
 	bool added = false;
 
@@ -367,18 +368,17 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			goto no_more_replies;
 		}
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0)
-			break;
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
 				vhost_disable_notify(&vsock->dev, vq);
 				continue;
 			}
 			break;
 		}
+		if (ret < 0)
+			break;
 
 		pkt = vhost_vsock_alloc_pkt(vq, out, in);
 		if (!pkt) {
@@ -397,7 +397,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 		else
 			virtio_transport_free_pkt(pkt);
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + len);
 		added = true;
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [RFC V5 PATCH 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-05-29  2:10 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <1527559830-8133-1-git-send-email-jasowang@redhat.com>

Move get_rx_bufs() to vhost.c and rename it to
vhost_get_bufs(). This helps to hide vring internal layout from
specific device implementation. Packed ring implementation will
benefit from this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 83 ++-------------------------------------------------
 drivers/vhost/vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  7 +++++
 3 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 986058a..762aa81 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -664,83 +664,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 	return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- *	vq has read descriptors only.
- * @vq		- the relevant virtqueue
- * @datalen	- data length we'll be reading
- * @iovcount	- returned count of io vectors we fill
- * @log		- vhost log
- * @log_num	- log offset
- * @quota       - headcount quota, 1 for big buffer
- *	returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-		       struct vring_used_elem *heads,
-		       int datalen,
-		       unsigned *iovcount,
-		       struct vhost_log *log,
-		       unsigned *log_num,
-		       unsigned int quota)
-{
-	unsigned int out, in;
-	int seg = 0;
-	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
-	/* len is always initialized before use since we are always called with
-	 * datalen > 0.
-	 */
-	u32 uninitialized_var(len);
-
-	while (datalen > 0 && headcount < quota) {
-		if (unlikely(seg >= UIO_MAXIOV)) {
-			r = -ENOBUFS;
-			goto err;
-		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
-		if (unlikely(r < 0))
-			goto err;
-
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
-		if (unlikely(out || in <= 0)) {
-			vq_err(vq, "unexpected descriptor format for RX: "
-				"out %d, in %d\n", out, in);
-			r = -EINVAL;
-			goto err;
-		}
-		if (unlikely(log)) {
-			nlogs += *log_num;
-			log += *log_num;
-		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
-		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
-		datalen -= len;
-		++headcount;
-		seg += in;
-	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-	*iovcount = seg;
-	if (unlikely(log))
-		*log_num = nlogs;
-
-	/* Detect overrun */
-	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
-		goto err;
-	}
-	return headcount;
-err:
-	vhost_discard_vq_desc(vq, headcount);
-	return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -790,9 +713,9 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = get_rx_bufs(vq, vq->heads + nheads, vhost_len,
-					&in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+					   &in, vq_log, &log,
+					   likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f0be5f3..096a688 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2100,6 +2100,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+	/* len is always initialized before use since we are always called with
+	 * datalen > 0.
+	 */
+	u32 uninitialized_var(len);
+
+	while (datalen > 0 && headcount < quota) {
+		if (unlikely(seg >= UIO_MAXIOV)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		r = vhost_get_vq_desc(vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (unlikely(r < 0))
+			goto err;
+
+		d = r;
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+
+	/* Detect overrun */
+	if (unlikely(datalen > 0)) {
+		r = UIO_MAXIOV + 1;
+		goto err;
+	}
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+EXPORT_SYMBOL_GPL(vhost_get_bufs);
+
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b9..52edd242 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,13 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* [RFC V5 PATCH 0/8] Packed ring layout for vhost
From: Jason Wang @ 2018-05-29  2:10 UTC (permalink / raw)
  To: mst, jasowang; +Cc: kvm, netdev, linux-kernel, virtualization, wexu

Hi all:

This RFC implement packed ring layout. The code were tested with
Tiwei's RFC V5 at https://lkml.org/lkml/2018/5/22/138. Some fixups and
tweaks were needed on top of Tiwei's code to make it run for event
index.

Pktgen reports about 20% improvement on TX PPS when doing pktgen from
guest to host. No ovbious improvement on RX PPS. We can do lots of
optimizations on top but for simple and for correceness first, this
version does not do much.

This version were tested with:

- Zerocopy (Out of Order) support
- vIOMMU support
- mergeable buffer on/off
- busy polling on/off

Notes for tester:

- Start from this version, vhost need qemu co-operation to work
  correctly. Or you can comment out the packed specific code for
  GET/SET_VRING_BASE.

- Changes from V4:
- fix signalled_used index recording
- track avail index correctly
- various minor fixes

Changes from V3:
- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from V2:
- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_ve_avail_empty is descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from V1:

- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (8):
  vhost: move get_rx_bufs to vhost.c
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost_net: do not explicitly manipulate vhost_used_elem
  vhost: vhost_put_user() can accept metadata type
  virtio: introduce packed ring defines
  vhost: packed ring support
  vhost: event suppression for packed ring

 drivers/vhost/net.c                | 144 ++----
 drivers/vhost/scsi.c               |  62 +--
 drivers/vhost/vhost.c              | 926 +++++++++++++++++++++++++++++++++----
 drivers/vhost/vhost.h              |  52 ++-
 drivers/vhost/vsock.c              |  42 +-
 include/uapi/linux/virtio_config.h |   9 +
 include/uapi/linux/virtio_ring.h   |  32 ++
 7 files changed, 1000 insertions(+), 267 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v4] virtio_blk: add DISCARD and WRIET ZEROES commands support
From: Changpeng Liu @ 2018-05-29  1:42 UTC (permalink / raw)
  To: virtualization, changpeng.liu; +Cc: pbonzini, cavery, stefanha

Existing virtio-blk protocol doesn't have DISCARD/WRITE ZEROES commands
support, this will impact the performance when using SSD backend over
file systems.

Commit 88c85538 "virtio-blk: add discard and write zeroes features to
specification"(see https://github.com/oasis-tcs/virtio-spec) extended
existing virtio-blk protocol, adding extra DISCARD and WRITE ZEROES
commands support.

While here, using 16 bytes descriptor to describe one segment of DISCARD
or WRITE ZEROES commands, each command may contain one or more decriptors.

The following data structure shows the definition of one descriptor:

struct virtio_blk_discard_write_zeroes {
        le64 sector;
        le32 num_sectors;
        le32 unmap;
};

Field 'sector' means the start sector for DISCARD and WRITE ZEROES,
filed 'num_sectors' means the number of sectors for DISCARD and WRITE
ZEROES, if both DISCARD and WRITE ZEROES are supported, field 'unmap'
maybe used for WRITE ZEROES command with DISCARD enabled.

We also extended the virtio-blk configuration space to let backend
device put DISCARD and WRITE ZEROES configuration parameters.

struct virtio_blk_config {
        [...]

        le32 max_discard_sectors;
        le32 max_discard_seg;
        le32 discard_sector_alignment;
        le32 max_write_zeroes_sectors;
        le32 max_write_zeroes_seg;
        u8 write_zeroes_may_unmap;
}

New feature bit [VIRTIO_BLK_F_DISCARD (13)]: Device can support discard
command, maximum discard sectors size in field 'max_discard_sectors' and
maximum discard segment number in field 'max_discard_seg'.

New feature [VIRTIO_BLK_F_WRITE_ZEROES (14)]: Device can support write
zeroes command, maximum write zeroes sectors size in field
'max_write_zeroes_sectors' and maximum write zeroes segment number in
field 'max_write_zeroes_seg'.

The parameters in the configuration space of the device field
'max_discard_sectors' and field 'discard_sector_alignment' are expressed in
512-byte units if the VIRTIO_BLK_F_DISCARD feature bit is negotiated. The
field 'max_write_zeroes_sectors' is expressed in 512-byte units if the
VIRTIO_BLK_F_WRITE_ZEROES feature bit is negotiated.

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
---
CHANGELOG:
v4: several optimizations based on MST's comments, remove bit field usage for
command descriptor.
v3: define the virtio-blk protocol to add discard and write zeroes support,
first version implementation based on proposed specification.
v2: add write zeroes command support.
v1: initial proposal implementation for discard command.
---
 drivers/block/virtio_blk.c      | 92 +++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/virtio_blk.h | 43 +++++++++++++++++++
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..a250fcc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,10 +172,45 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+static inline int virtblk_setup_discard_write_zeroes(struct request *req,
+						bool unmap)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+	unsigned short n = 0;
+	struct virtio_blk_discard_write_zeroes *range;
+	struct bio *bio;
+
+	range = kmalloc_array(segments, sizeof(*range), GFP_KERNEL);
+	if (!range)
+		return -ENOMEM;
+
+	__rq_for_each_bio(bio, req) {
+		u64 sector = bio->bi_iter.bi_sector;
+		u32 num_sectors = bio->bi_iter.bi_size >> 9;
+
+		range[n].unmap = cpu_to_le32(unmap);
+		range[n].num_sectors = cpu_to_le32(num_sectors);
+		range[n].sector = cpu_to_le64(sector);
+		n++;
+	}
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+		      req->special_vec.bv_offset);
+	}
+
 	switch (req_op(req)) {
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
@@ -225,6 +260,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
+	bool unmap = false;
 	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +273,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	case REQ_OP_FLUSH:
 		type = VIRTIO_BLK_T_FLUSH;
 		break;
+	case REQ_OP_DISCARD:
+		type = VIRTIO_BLK_T_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		type = VIRTIO_BLK_T_WRITE_ZEROES;
+		unmap = !(req->cmd_flags & REQ_NOUNMAP);
+		break;
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
 		type = VIRTIO_BLK_T_SCSI_CMD;
@@ -256,9 +299,16 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(req);
 
+	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
+		err = virtblk_setup_discard_write_zeroes(req, unmap);
+		if (err)
+			return BLK_STS_RESOURCE;
+	}
+
 	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
 	if (num) {
-		if (rq_data_dir(req) == WRITE)
+		if (rq_data_dir(req) == WRITE || type == VIRTIO_BLK_T_DISCARD ||
+		    type == VIRTIO_BLK_T_WRITE_ZEROES)
 			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
 		else
 			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
@@ -777,6 +827,42 @@ static int virtblk_probe(struct virtio_device *vdev)
 	if (!err && opt_io_size)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
+		q->limits.discard_granularity = blk_size;
+
+		virtio_cread(vdev, struct virtio_blk_config,
+				discard_sector_alignment, &v);
+		if (v)
+			q->limits.discard_alignment = v << 9;
+		else
+			q->limits.discard_alignment = 0;
+
+		virtio_cread(vdev, struct virtio_blk_config,
+				max_discard_sectors, &v);
+		if (v)
+			blk_queue_max_discard_sectors(q, v);
+		else
+			blk_queue_max_discard_sectors(q, UINT_MAX);
+
+		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
+				&v);
+		if (v)
+			blk_queue_max_discard_segments(q, v);
+		else
+			blk_queue_max_discard_segments(q, USHRT_MAX);
+
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
+		virtio_cread(vdev, struct virtio_blk_config,
+				max_write_zeroes_sectors, &v);
+		if (v)
+			blk_queue_max_write_zeroes_sectors(q, v);
+		else
+			blk_queue_max_write_zeroes_sectors(q, UINT_MAX);
+	}
+
 	virtblk_update_capacity(vblk, false);
 	virtio_device_ready(vdev);
 
@@ -885,14 +971,14 @@ static int virtblk_restore(struct virtio_device *vdev)
 	VIRTIO_BLK_F_SCSI,
 #endif
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
-	VIRTIO_BLK_F_MQ,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
 }
 ;
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
-	VIRTIO_BLK_F_MQ,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 9ebe4d9..8e7a015 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -38,6 +38,8 @@
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
 #define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
+#define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
+#define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -86,6 +88,29 @@ struct virtio_blk_config {
 
 	/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
 	__u16 num_queues;
+	/* The maximum discard sectors (in 512-byte sectors) for
+	 * one segment (if VIRTIO_BLK_F_DISCARD)
+	 */
+	__u32 max_discard_sectors;
+	/* The maximum number of discard segments in a
+	 * discard command (if VIRTIO_BLK_F_DISCARD)
+	 */
+	__u32 max_discard_seg;
+	/* The alignment sectors for discard (if VIRTIO_BLK_F_DISCARD) */
+	__u32 discard_sector_alignment;
+	/* The maximum number of write zeroes sectors (in 512-byte sectors) in
+	 * one segment (if VIRTIO_BLK_F_WRITE_ZEROES)
+	 */
+	__u32 max_write_zeroes_sectors;
+	/* The maximum number of segments in a write zeroes
+	 * command (if VIRTIO_BLK_F_WRITE_ZEROES)
+	 */
+	__u32 max_write_zeroes_seg;
+	/* Device clear this bit when write zeroes command can't result in
+	 * unmapping sectors (if VIRITO_BLK_F_WRITE_ZEROES and with unmap)
+	 */
+	__u8 write_zeroes_may_unmap;
+	__u8 unused1[3];
 } __attribute__((packed));
 
 /*
@@ -114,6 +139,12 @@ struct virtio_blk_config {
 /* Get device ID command */
 #define VIRTIO_BLK_T_GET_ID    8
 
+/* Discard command */
+#define VIRTIO_BLK_T_DISCARD	11
+
+/* Write zeroes command */
+#define VIRTIO_BLK_T_WRITE_ZEROES	13
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
@@ -133,6 +164,18 @@ struct virtio_blk_outhdr {
 	__virtio64 sector;
 };
 
+/*
+ * discard/write zeroes range for each request.
+ */
+struct virtio_blk_discard_write_zeroes {
+	/* discard/write zeroes start sector */
+	__virtio64 sector;
+	/* number of discard/write zeroes sectors */
+	__virtio32 num_sectors;
+	/* valid for write zeroes command */
+	__virtio32 unmap;
+};
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 struct virtio_scsi_inhdr {
 	__virtio32 errors;
-- 
1.9.3

^ permalink raw reply related

* Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices
From: Benjamin Herrenschmidt @ 2018-05-28 23:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: robh, mpe, linux-kernel, virtualization, hch, joe, david,
	linuxppc-dev, elfring, Anshuman Khandual
In-Reply-To: <6fff9f5d67361653e6072570a857cf0d1009a123.camel@kernel.crashing.org>

On Tue, 2018-05-29 at 09:48 +1000, Benjamin Herrenschmidt wrote:
> > Well it's not supposed to be much slower for the static case.
> > 
> > vhost has a cache so should be fine.
> > 
> > A while ago Paolo implemented a translation cache which should be
> > perfect for this case - most of the code got merged but
> > never enabled because of stability issues.
> > 
> > If all else fails, we could teach QEMU to handle the no-iommu case
> > as if VIRTIO_F_IOMMU_PLATFORM was off.
> 
> Any serious reason why not just getting that 2 line patch allowing our
> arch code to force virtio to use the DMA API ?
> 
> It's not particularly invasive and solves our problem rather nicely
> without adding overhead or additional knowledge to qemu/libvirt/mgmnt
> tools etc... that it doesn't need etc....
> 
> The guest knows it's going secure so the guest arch code can do the
> right thing rather trivially.
> 
> Long term we should probably make virtio always use the DMA API anyway,
> and interpose "1:1" dma_ops for the traditional virtio case, that would
> reduce code clutter significantly. In that case, it would become just a
> matter of having a platform hook to override the dma_ops used.

To elaborate a bit ....

What we are trying to solve here is entirely a guest problem, I don't
think involving qemu in the solution is the right thing to do.

The guest can only allow external parties (qemu, potentially PCI
devices, etc...) access to some restricted portions of memory (insecure
memory). Thus the guest need to do some bounce buffering/swiotlb type
tricks.

This is completely orthogonal to whether there is an actual iommu
between the guest and the device (or emulated device/virtio).

This is why I think the solution should reside in the guest kernel, by
proper manipulation (by the arch code) of the dma ops.

I don't think forcing the addition of an emulated iommu in the middle
just to work around the fact that virtio "cheats" and doesn't use the
dma API unless there is one, is the right "fix".

The right long term fix is to always use the DMA API, reducing code
path etc... and just have a single point where virtio can "chose"
alternate DMA ops (via an arch hook to deal with our case).

In the meantime, having the hook we propose gets us going, but if you
agree with the approach, we should also work on the long term approach.

Cheers,
Ben.

^ permalink raw reply

* Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices
From: Benjamin Herrenschmidt @ 2018-05-28 23:48 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: robh, mpe, linux-kernel, virtualization, hch, joe, david,
	linuxppc-dev, elfring, Anshuman Khandual
In-Reply-To: <20180525202300-mutt-send-email-mst@kernel.org>

On Fri, 2018-05-25 at 20:45 +0300, Michael S. Tsirkin wrote:
> On Thu, May 24, 2018 at 08:27:04AM +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2018-05-23 at 21:50 +0300, Michael S. Tsirkin wrote:
> > 
> > > I re-read that discussion and I'm still unclear on the
> > > original question, since I got several apparently
> > > conflicting answers.
> > > 
> > > I asked:
> > > 
> > > 	Why isn't setting VIRTIO_F_IOMMU_PLATFORM on the
> > > 	hypervisor side sufficient?
> > 
> > I thought I had replied to this...
> > 
> > There are a couple of reasons:
> > 
> > - First qemu doesn't know that the guest will switch to "secure mode"
> > in advance. There is no difference between a normal and a secure
> > partition until the partition does the magic UV call to "enter secure
> > mode" and qemu doesn't see any of it. So who can set the flag here ?
> 
> Not sure I understand. Just set the flag e.g. on qemu command line.
> I might be wrong, but these secure mode things usually
> a. require hypervisor side tricks anyway

The way our secure mode architecture is designed, there doesn't need at
this point to be any knowledge at qemu level whatsoever. Well at least
until we do migration but that's a different kettle of fish. In any
case, the guest starts normally (which means as a non-secure guest, and
thus expects normal virtio, our FW today doesn't handle
VIRTIO_F_IOMMU_PLATFORM, though granted, we can fix this), and later
that guest issues some special Ultravisor call that turns it into a
secure guest.

There is some involvement of the hypervisor, but not qemu at this
stage. We would very much like to avoid that, as it would be a hassle
for users to have to use different libvirt options etc... bcs the guest
might turn itself into a secure VM.

> > - Second, when using VIRTIO_F_IOMMU_PLATFORM, we also make qemu (or
> > vhost) go through the emulated MMIO for every access to the guest,
> > which adds additional overhead.
> > 
> > Cheers,
> > Ben.
> 
> Well it's not supposed to be much slower for the static case.
> 
> vhost has a cache so should be fine.
> 
> A while ago Paolo implemented a translation cache which should be
> perfect for this case - most of the code got merged but
> never enabled because of stability issues.
> 
> If all else fails, we could teach QEMU to handle the no-iommu case
> as if VIRTIO_F_IOMMU_PLATFORM was off.

Any serious reason why not just getting that 2 line patch allowing our
arch code to force virtio to use the DMA API ?

It's not particularly invasive and solves our problem rather nicely
without adding overhead or additional knowledge to qemu/libvirt/mgmnt
tools etc... that it doesn't need etc....

The guest knows it's going secure so the guest arch code can do the
right thing rather trivially.

Long term we should probably make virtio always use the DMA API anyway,
and interpose "1:1" dma_ops for the traditional virtio case, that would
reduce code clutter significantly. In that case, it would become just a
matter of having a platform hook to override the dma_ops used.

Cheers,
Ben.

> 
> 
> > > 
> > > 
> > > >  arch/powerpc/include/asm/dma-mapping.h |  6 ++++++
> > > >  arch/powerpc/platforms/pseries/iommu.c | 11 +++++++++++
> > > >  drivers/virtio/virtio_ring.c           | 10 ++++++++++
> > > >  3 files changed, 27 insertions(+)
> > > > 
> > > > diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
> > > > index 8fa3945..056e578 100644
> > > > --- a/arch/powerpc/include/asm/dma-mapping.h
> > > > +++ b/arch/powerpc/include/asm/dma-mapping.h
> > > > @@ -115,4 +115,10 @@ extern u64 __dma_get_required_mask(struct device *dev);
> > > >  #define ARCH_HAS_DMA_MMAP_COHERENT
> > > >  
> > > >  #endif /* __KERNEL__ */
> > > > +
> > > > +#define platform_forces_virtio_dma platform_forces_virtio_dma
> > > > +
> > > > +struct virtio_device;
> > > > +
> > > > +extern bool platform_forces_virtio_dma(struct virtio_device *vdev);
> > > >  #endif	/* _ASM_DMA_MAPPING_H */
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > > index 06f0296..a2ec15a 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -38,6 +38,7 @@
> > > >  #include <linux/of.h>
> > > >  #include <linux/iommu.h>
> > > >  #include <linux/rculist.h>
> > > > +#include <linux/virtio.h>
> > > >  #include <asm/io.h>
> > > >  #include <asm/prom.h>
> > > >  #include <asm/rtas.h>
> > > > @@ -1396,3 +1397,13 @@ static int __init disable_multitce(char *str)
> > > >  __setup("multitce=", disable_multitce);
> > > >  
> > > >  machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
> > > > +
> > > > +bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > > > +{
> > > > +	/*
> > > > +	 * On protected guest platforms, force virtio core to use DMA
> > > > +	 * MAP API for all virtio devices. But there can also be some
> > > > +	 * exceptions for individual devices like virtio balloon.
> > > > +	 */
> > > > +	return (of_find_compatible_node(NULL, NULL, "ibm,ultravisor") != NULL);
> > > > +}
> > > 
> > > Isn't this kind of slow?  vring_use_dma_api is on
> > > data path and supposed to be very fast.
> > > 
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 21d464a..47ea6c3 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -141,8 +141,18 @@ struct vring_virtqueue {
> > > >   * unconditionally on data path.
> > > >   */
> > > >  
> > > > +#ifndef platform_forces_virtio_dma
> > > > +static inline bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > > > +{
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +
> > > >  static bool vring_use_dma_api(struct virtio_device *vdev)
> > > >  {
> > > > +	if (platform_forces_virtio_dma(vdev))
> > > > +		return true;
> > > > +
> > > >  	if (!virtio_has_iommu_quirk(vdev))
> > > >  		return true;
> > > >  
> > > > -- 
> > > > 2.9.3

^ permalink raw reply

* Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Samudrala, Sridhar @ 2018-05-26 19:22 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <20180526075156.GC4288@nanopsycho.orion>

'On 5/26/2018 12:51 AM, Jiri Pirko wrote:
> Sat, May 26, 2018 at 09:22:18AM CEST, sridhar.samudrala@intel.com wrote:
>> On 5/25/2018 4:28 PM, Stephen Hemminger wrote:
>>> On Fri, 25 May 2018 16:11:47 -0700
>>> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>>>
>>>> On 5/25/2018 3:34 PM, Stephen Hemminger wrote:
>>>>> On Thu, 24 May 2018 09:55:14 -0700
>>>>> Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>>>>>> --- a/drivers/net/hyperv/Kconfig
>>>>>> +++ b/drivers/net/hyperv/Kconfig
>>>>>> @@ -2,5 +2,6 @@ config HYPERV_NET
>>>>>>     	tristate "Microsoft Hyper-V virtual network driver"
>>>>>>     	depends on HYPERV
>>>>>>     	select UCS2_STRING
>>>>>> +	select FAILOVER
>>>>> When I take a working kernel config, add the patches then do
>>>>> make oldconfig
>>>>>
>>>>> It is not autoselecting FAILOVER, it prompts me for it. This means
>>>>> if user says no then a non-working netvsc device is made.
>>>> I see
>>>>       Generic failover module (FAILOVER) [M/y/?] (NEW)
>>>>
>>>> So the user is given an option to either build as a Module or part of the
>>>> kernel. 'n' is not an option.
>>> With most libraries there is no prompt at all.
>> Not sure what you meant by this.
>> Without any patches applied, i had a .config file with HYPERV_NET configured
>> as a module.
>> Then after applying the first 2 patches in this series, i did a
>>   make oldconfig
>> and i see the above prompt.
>>
>> Are you saying that on some distros, 'make oldconfig creates a .config
>> file without any prompt and FAILOVER is not getting selected even when HYPERV_NET
>> is enabled?
>>
>>
> Well the thing is that for a user, it makes no sense to select
> "FAILOVER" by hand. It is a lib, so it should be only select it by a
> user. It has no sense to have it turned on by hand - no lib user.
> You can achieve that by simply removing "help" for the Kconfig
> item. Same thing for "NET_FAILOVER".

I played around with the CONFIG options and i see that FAILOVER options do
get selected correctly when virtio-net/netvsc are enabled.  Even if the FAILOVER
is turned off by the user before the hyperv-net/virtio-net patches are applied,
it gets selected automatically when hyperv-net/virtio-net patches are applied and
enabled in config.

If we don't want to allow the user to see these options, then i think we need to
remove them from Kconfig files.  Just removing "help" doesn't seem to make a
difference.

Can we address any config issues (i don't see any at this point) as a bug-fix on top
of this series?


_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Jiri Pirko @ 2018-05-26  7:51 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <cffe3c66-ae31-ae13-27ce-55f9be2d53d4@intel.com>

Sat, May 26, 2018 at 09:22:18AM CEST, sridhar.samudrala@intel.com wrote:
>On 5/25/2018 4:28 PM, Stephen Hemminger wrote:
>> On Fri, 25 May 2018 16:11:47 -0700
>> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>> 
>> > On 5/25/2018 3:34 PM, Stephen Hemminger wrote:
>> > > On Thu, 24 May 2018 09:55:14 -0700
>> > > Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>> > > > --- a/drivers/net/hyperv/Kconfig
>> > > > +++ b/drivers/net/hyperv/Kconfig
>> > > > @@ -2,5 +2,6 @@ config HYPERV_NET
>> > > >    	tristate "Microsoft Hyper-V virtual network driver"
>> > > >    	depends on HYPERV
>> > > >    	select UCS2_STRING
>> > > > +	select FAILOVER
>> > > When I take a working kernel config, add the patches then do
>> > > make oldconfig
>> > > 
>> > > It is not autoselecting FAILOVER, it prompts me for it. This means
>> > > if user says no then a non-working netvsc device is made.
>> > I see
>> >      Generic failover module (FAILOVER) [M/y/?] (NEW)
>> > 
>> > So the user is given an option to either build as a Module or part of the
>> > kernel. 'n' is not an option.
>> With most libraries there is no prompt at all.
>
>Not sure what you meant by this.
>Without any patches applied, i had a .config file with HYPERV_NET configured
>as a module.
>Then after applying the first 2 patches in this series, i did a
>  make oldconfig
>and i see the above prompt.
>
>Are you saying that on some distros, 'make oldconfig creates a .config
>file without any prompt and FAILOVER is not getting selected even when HYPERV_NET
>is enabled?
>
>

Well the thing is that for a user, it makes no sense to select
"FAILOVER" by hand. It is a lib, so it should be only select it by a
user. It has no sense to have it turned on by hand - no lib user.
You can achieve that by simply removing "help" for the Kconfig
item. Same thing for "NET_FAILOVER".

^ permalink raw reply

* Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
From: Jiri Pirko @ 2018-05-26  7:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: alexander.h.duyck, virtio-dev, mst, kubakici, Sridhar Samudrala,
	virtualization, loseweigh, netdev, anjali.singhai, aaron.f.brown,
	davem
In-Reply-To: <20180525153744.2b53c449@xeon-e3>

Sat, May 26, 2018 at 12:37:44AM CEST, stephen@networkplumber.org wrote:
>On Thu, 24 May 2018 09:55:13 -0700
>Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>
>
>> +	spin_lock(&failover_lock);
>
>Since register is not in fast path, this should be a mutex?

I don't get it. Why would you prefer mutex over spinlock here?

^ permalink raw reply

* Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Samudrala, Sridhar @ 2018-05-26  7:22 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <20180525162839.3c7028e1@xeon-e3>

On 5/25/2018 4:28 PM, Stephen Hemminger wrote:
> On Fri, 25 May 2018 16:11:47 -0700
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>
>> On 5/25/2018 3:34 PM, Stephen Hemminger wrote:
>>> On Thu, 24 May 2018 09:55:14 -0700
>>> Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>>>   
>>>> --- a/drivers/net/hyperv/Kconfig
>>>> +++ b/drivers/net/hyperv/Kconfig
>>>> @@ -2,5 +2,6 @@ config HYPERV_NET
>>>>    	tristate "Microsoft Hyper-V virtual network driver"
>>>>    	depends on HYPERV
>>>>    	select UCS2_STRING
>>>> +	select FAILOVER
>>> When I take a working kernel config, add the patches then do
>>> make oldconfig
>>>
>>> It is not autoselecting FAILOVER, it prompts me for it. This means
>>> if user says no then a non-working netvsc device is made.
>> I see
>>      Generic failover module (FAILOVER) [M/y/?] (NEW)
>>
>> So the user is given an option to either build as a Module or part of the
>> kernel. 'n' is not an option.
> With most libraries there is no prompt at all.

Not sure what you meant by this.
Without any patches applied, i had a .config file with HYPERV_NET configured
as a module.
Then after applying the first 2 patches in this series, i did a
   make oldconfig
and i see the above prompt.

Are you saying that on some distros, 'make oldconfig creates a .config
file without any prompt and FAILOVER is not getting selected even when HYPERV_NET
is enabled?

^ permalink raw reply

* Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
From: Stephen Hemminger @ 2018-05-25 23:29 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <00d34f67-f26f-0b20-af3f-2add24ae8a5c@intel.com>

On Fri, 25 May 2018 16:06:58 -0700
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 5/25/2018 3:38 PM, Stephen Hemminger wrote:
> > On Thu, 24 May 2018 09:55:13 -0700
> > Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
> >  
> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >> index 03ed492c4e14..0f4ba52b641d 100644
> >> --- a/include/linux/netdevice.h
> >> +++ b/include/linux/netdevice.h
> >> @@ -1421,6 +1421,8 @@ struct net_device_ops {
> >>    *	entity (i.e. the master device for bridged veth)
> >>    * @IFF_MACSEC: device is a MACsec device
> >>    * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
> >> + * @IFF_FAILOVER: device is a failover master device
> >> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
> >>    */
> >>   enum netdev_priv_flags {
> >>   	IFF_802_1Q_VLAN			= 1<<0,
> >> @@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
> >>   	IFF_PHONY_HEADROOM		= 1<<24,
> >>   	IFF_MACSEC			= 1<<25,
> >>   	IFF_NO_RX_HANDLER		= 1<<26,
> >> +	IFF_FAILOVER			= 1<<27,
> >> +	IFF_FAILOVER_SLAVE		= 1<<28,
> >>   };  
> > Why is FAILOVER any different than other master/slave relationships.
> > I don't think you need to take up precious netdev flag bits for this.  
> 
> These are netdev priv flags.
> Jiri says that IFF_MASTER/IFF_SLAVE are bonding specific flags and cannot be used
> with other failover mechanisms. Team also doesn't use this flags and it has its own
> priv_flags.
> 

They are already used by bonding and team.
I don't see why this can't reuse them.

^ permalink raw reply

* Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Stephen Hemminger @ 2018-05-25 23:28 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <c0be8e44-34f8-0e46-3a51-7ce869a5d351@intel.com>

On Fri, 25 May 2018 16:11:47 -0700
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 5/25/2018 3:34 PM, Stephen Hemminger wrote:
> > On Thu, 24 May 2018 09:55:14 -0700
> > Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
> >  
> >> --- a/drivers/net/hyperv/Kconfig
> >> +++ b/drivers/net/hyperv/Kconfig
> >> @@ -2,5 +2,6 @@ config HYPERV_NET
> >>   	tristate "Microsoft Hyper-V virtual network driver"
> >>   	depends on HYPERV
> >>   	select UCS2_STRING
> >> +	select FAILOVER  
> > When I take a working kernel config, add the patches then do
> > make oldconfig
> >
> > It is not autoselecting FAILOVER, it prompts me for it. This means
> > if user says no then a non-working netvsc device is made.  
> 
> I see
>     Generic failover module (FAILOVER) [M/y/?] (NEW)
> 
> So the user is given an option to either build as a Module or part of the
> kernel. 'n' is not an option.

With most libraries there is no prompt at all.

^ permalink raw reply

* Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Samudrala, Sridhar @ 2018-05-25 23:11 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <20180525153456.28b44c7c@xeon-e3>


On 5/25/2018 3:34 PM, Stephen Hemminger wrote:
> On Thu, 24 May 2018 09:55:14 -0700
> Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>
>> --- a/drivers/net/hyperv/Kconfig
>> +++ b/drivers/net/hyperv/Kconfig
>> @@ -2,5 +2,6 @@ config HYPERV_NET
>>   	tristate "Microsoft Hyper-V virtual network driver"
>>   	depends on HYPERV
>>   	select UCS2_STRING
>> +	select FAILOVER
> When I take a working kernel config, add the patches then do
> make oldconfig
>
> It is not autoselecting FAILOVER, it prompts me for it. This means
> if user says no then a non-working netvsc device is made.

I see
    Generic failover module (FAILOVER) [M/y/?] (NEW)

So the user is given an option to either build as a Module or part of the
kernel. 'n' is not an option.

^ permalink raw reply

* Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
From: Samudrala, Sridhar @ 2018-05-25 23:06 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <20180525153843.503ee052@xeon-e3>


On 5/25/2018 3:38 PM, Stephen Hemminger wrote:
> On Thu, 24 May 2018 09:55:13 -0700
> Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 03ed492c4e14..0f4ba52b641d 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1421,6 +1421,8 @@ struct net_device_ops {
>>    *	entity (i.e. the master device for bridged veth)
>>    * @IFF_MACSEC: device is a MACsec device
>>    * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
>> + * @IFF_FAILOVER: device is a failover master device
>> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
>>    */
>>   enum netdev_priv_flags {
>>   	IFF_802_1Q_VLAN			= 1<<0,
>> @@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
>>   	IFF_PHONY_HEADROOM		= 1<<24,
>>   	IFF_MACSEC			= 1<<25,
>>   	IFF_NO_RX_HANDLER		= 1<<26,
>> +	IFF_FAILOVER			= 1<<27,
>> +	IFF_FAILOVER_SLAVE		= 1<<28,
>>   };
> Why is FAILOVER any different than other master/slave relationships.
> I don't think you need to take up precious netdev flag bits for this.

These are netdev priv flags.
Jiri says that IFF_MASTER/IFF_SLAVE are bonding specific flags and cannot be used
with other failover mechanisms. Team also doesn't use this flags and it has its own
priv_flags.

^ permalink raw reply

* Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
From: Samudrala, Sridhar @ 2018-05-25 23:04 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <20180525153744.2b53c449@xeon-e3>


[-- Attachment #1.1: Type: text/plain, Size: 967 bytes --]



On 5/25/2018 3:37 PM, Stephen Hemminger wrote:
> On Thu, 24 May 2018 09:55:13 -0700
> Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>
>
>> +	spin_lock(&failover_lock);
> Since register is not in fast path, this should be a mutex?

This is Jiri's comment which made me to switch to spinlock from mutex

   >> Why mutex? Apparently you don't need to sleep while holding a lock.
   >> Simple spinlock would do.

>
>
>> +int failover_slave_unregister(struct net_device *slave_dev)
>> +{
>> +	struct net_device *failover_dev;
>> +	struct failover_ops *fops;
>> +
>> +	if (!netif_is_failover_slave(slave_dev))
>> +		goto done;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
>> +	if (!failover_dev)
>> +		goto done;
> Since the slave device must have a master device set already, why not use
> that instead of searching by MAC address on unregister or link change.
>
We also need to get the fops(failover_ops)


[-- Attachment #1.2: Type: text/html, Size: 36732 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
From: Stephen Hemminger @ 2018-05-25 22:38 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <1527180917-39737-2-git-send-email-sridhar.samudrala@intel.com>

On Thu, 24 May 2018 09:55:13 -0700
Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 03ed492c4e14..0f4ba52b641d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1421,6 +1421,8 @@ struct net_device_ops {
>   *	entity (i.e. the master device for bridged veth)
>   * @IFF_MACSEC: device is a MACsec device
>   * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
> + * @IFF_FAILOVER: device is a failover master device
> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
>   */
>  enum netdev_priv_flags {
>  	IFF_802_1Q_VLAN			= 1<<0,
> @@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
>  	IFF_PHONY_HEADROOM		= 1<<24,
>  	IFF_MACSEC			= 1<<25,
>  	IFF_NO_RX_HANDLER		= 1<<26,
> +	IFF_FAILOVER			= 1<<27,
> +	IFF_FAILOVER_SLAVE		= 1<<28,
>  };

Why is FAILOVER any different than other master/slave relationships.
I don't think you need to take up precious netdev flag bits for this.

^ permalink raw reply

* Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
From: Stephen Hemminger @ 2018-05-25 22:37 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <1527180917-39737-2-git-send-email-sridhar.samudrala@intel.com>

On Thu, 24 May 2018 09:55:13 -0700
Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:


> +	spin_lock(&failover_lock);

Since register is not in fast path, this should be a mutex?


> +int failover_slave_unregister(struct net_device *slave_dev)
> +{
> +	struct net_device *failover_dev;
> +	struct failover_ops *fops;
> +
> +	if (!netif_is_failover_slave(slave_dev))
> +		goto done;
> +
> +	ASSERT_RTNL();
> +
> +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
> +	if (!failover_dev)
> +		goto done;

Since the slave device must have a master device set already, why not use
that instead of searching by MAC address on unregister or link change.

^ permalink raw reply

* Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Stephen Hemminger @ 2018-05-25 22:34 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <1527180917-39737-3-git-send-email-sridhar.samudrala@intel.com>

On Thu, 24 May 2018 09:55:14 -0700
Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:

> --- a/drivers/net/hyperv/Kconfig
> +++ b/drivers/net/hyperv/Kconfig
> @@ -2,5 +2,6 @@ config HYPERV_NET
>  	tristate "Microsoft Hyper-V virtual network driver"
>  	depends on HYPERV
>  	select UCS2_STRING
> +	select FAILOVER

When I take a working kernel config, add the patches then do
make oldconfig

It is not autoselecting FAILOVER, it prompts me for it. This means
if user says no then a non-working netvsc device is made.

^ permalink raw reply

* [PATCH v2 11/13] drm/virtio: Stop updating plane->crtc
From: Ville Syrjala @ 2018-05-25 18:50 UTC (permalink / raw)
  To: dri-devel; +Cc: David Airlie, intel-gfx, virtualization
In-Reply-To: <20180525185045.29689-1-ville.syrjala@linux.intel.com>

From: Ville Syrjälä <ville.syrjala@linux.intel.com>

We want to get rid of plane->crtc on atomic drivers. Stop setting it.

v2: s/fb/crtc/ in the commit message (Gerd)

Cc: David Airlie <airlied@linux.ie>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/virtio/virtgpu_display.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/virtio/virtgpu_display.c b/drivers/gpu/drm/virtio/virtgpu_display.c
index d6dd769a7ad3..ff9933e79416 100644
--- a/drivers/gpu/drm/virtio/virtgpu_display.c
+++ b/drivers/gpu/drm/virtio/virtgpu_display.c
@@ -282,8 +282,6 @@ static int vgdev_output_init(struct virtio_gpu_device *vgdev, int index)
 	drm_crtc_init_with_planes(dev, crtc, primary, cursor,
 				  &virtio_gpu_crtc_funcs, NULL);
 	drm_crtc_helper_add(crtc, &virtio_gpu_crtc_helper_funcs);
-	primary->crtc = crtc;
-	cursor->crtc = crtc;
 
 	drm_connector_init(dev, connector, &virtio_gpu_connector_funcs,
 			   DRM_MODE_CONNECTOR_VIRTUAL);
-- 
2.16.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related

* [PATCH v2 00/13] drm: Eliminate plane->fb/crtc usage for atomic drivers
From: Ville Syrjala @ 2018-05-25 18:50 UTC (permalink / raw)
  To: dri-devel
  Cc: David Airlie, Daniel Vetter, virtualization, Eric Anholt,
	David (ChunMing) Zhou, Thomas Hellstrom, Joonyoung Shim,
	Sinclair Yeh, Kyungmin Park, amd-gfx, VMware Graphics,
	Harry Wentland, linux-arm-msm, intel-gfx, Inki Dae, Deepak Rawat,
	Seung-Woo Kim, Rob Clark, Alex Deucher, freedreno,
	Christian König

From: Ville Syrjälä <ville.syrjala@linux.intel.com>

Here are again the last (?) bits of eliminating the plane->fb/crtc
usage for atomic drivers. I've pushed everything else (thanks to
everyone who reviewed them). 

Deepak said he'd tested the vmwgfx stuff, so I think it should be
safe to land. Just missing a bit of review...

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Cc: "Christian König" <christian.koenig@amd.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com>
Cc: Deepak Rawat <drawat@vmware.com>
Cc: Eric Anholt <eric@anholt.net>
Cc: freedreno@lists.freedesktop.org
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Harry Wentland <harry.wentland@amd.com>
Cc: Inki Dae <inki.dae@samsung.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: linux-arm-msm@vger.kernel.org
Cc: Rob Clark <robdclark@gmail.com>
Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
Cc: Sinclair Yeh <syeh@vmware.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: virtualization@lists.linux-foundation.org
Cc: VMware Graphics <linux-graphics-maintainer@vmware.com>

Ville Syrjälä (13):
  drm/vmwgfx: Stop using plane->fb in vmw_kms_atomic_check_modeset()
  drm/vmwgfx: Stop using plane->fb in vmw_kms_helper_dirty()
  drm/vmwgfx: Stop using plane->fb in vmw_kms_update_implicit_fb()
  drm/vmwgfx: Stop updating plane->fb
  drm/vmwgfx: Stop using plane->fb in atomic_enable()
  drm/vmwgfx: Stop messing about with plane->fb/old_fb/crtc
  drm/amdgpu/dc: Stop updating plane->fb
  drm/i915: Stop updating plane->fb/crtc
  drm/exynos: Stop updating plane->crtc
  drm/msm: Stop updating plane->fb/crtc
  drm/virtio: Stop updating plane->crtc
  drm/vc4: Stop updating plane->fb/crtc
  drm: Stop updating plane->crtc/fb/old_fb on atomic drivers

 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  2 -
 drivers/gpu/drm/drm_atomic.c                      | 55 +++--------------------
 drivers/gpu/drm/drm_atomic_helper.c               | 15 +------
 drivers/gpu/drm/drm_crtc.c                        |  8 +++-
 drivers/gpu/drm/drm_fb_helper.c                   |  7 ---
 drivers/gpu/drm/drm_framebuffer.c                 |  5 ---
 drivers/gpu/drm/drm_plane.c                       | 14 +++---
 drivers/gpu/drm/drm_plane_helper.c                |  4 +-
 drivers/gpu/drm/exynos/exynos_drm_plane.c         |  2 -
 drivers/gpu/drm/i915/intel_atomic_plane.c         | 12 -----
 drivers/gpu/drm/i915/intel_display.c              |  7 ++-
 drivers/gpu/drm/msm/disp/mdp4/mdp4_crtc.c         |  1 -
 drivers/gpu/drm/msm/disp/mdp4/mdp4_plane.c        |  2 -
 drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c         |  1 -
 drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c        |  2 -
 drivers/gpu/drm/vc4/vc4_crtc.c                    |  3 --
 drivers/gpu/drm/virtio/virtgpu_display.c          |  2 -
 drivers/gpu/drm/vmwgfx/vmwgfx_fb.c                | 24 ----------
 drivers/gpu/drm/vmwgfx/vmwgfx_kms.c               | 24 +++++++---
 drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c              |  2 -
 drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c              |  5 +--
 include/drm/drm_atomic.h                          |  3 --
 22 files changed, 46 insertions(+), 154 deletions(-)

-- 
2.16.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] XDP batching for TUN/vhost_net
From: Michael S. Tsirkin @ 2018-05-25 17:53 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>

On Mon, May 21, 2018 at 05:04:21PM +0800, Jason Wang wrote:
> Hi all:
> 
> We do not support XDP batching for TUN since it can only receive one
> packet a time from vhost_net. This series tries to remove this
> limitation by:
> 
> - introduce a TUN specific msg_control that can hold a pointer to an
>   array of XDP buffs
> - try copy and build XDP buff in vhost_net
> - store XDP buffs in an array and submit them once for every N packets
>   from vhost_net
> - since TUN can only do native XDP for datacopy packet, to simplify
>   the logic, split datacopy out logic and only do batching for
>   datacopy.

I like how this rework looks. Pls go ahead and repost as
non-RFC.

> With this series, TX PPS can improve about 34% from 2.9Mpps to
> 3.9Mpps when doing xdp_redirect_map between TAP and ixgbe.
> 
> Thanks
> 
> Jason Wang (12):
>   vhost_net: introduce helper to initialize tx iov iter
>   vhost_net: introduce vhost_exceeds_weight()
>   vhost_net: introduce vhost_has_more_pkts()
>   vhost_net: split out datacopy logic
>   vhost_net: batch update used ring for datacopy TX
>   tuntap: enable premmption early
>   tuntap: simplify error handling in tun_build_skb()
>   tuntap: tweak on the path of non-xdp case in tun_build_skb()
>   tuntap: split out XDP logic
>   vhost_net: build xdp buff
>   vhost_net: passing raw xdp buff to tun
>   vhost_net: batch submitting XDP buffers to underlayer sockets
> 
>  drivers/net/tun.c      | 226 +++++++++++++++++++++++++++----------
>  drivers/vhost/net.c    | 297 ++++++++++++++++++++++++++++++++++++++++++++-----
>  include/linux/if_tun.h |   7 ++
>  3 files changed, 444 insertions(+), 86 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply

* Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices
From: Michael S. Tsirkin @ 2018-05-25 17:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: robh, mpe, linux-kernel, virtualization, hch, joe, david,
	linuxppc-dev, elfring, Anshuman Khandual
In-Reply-To: <f77638ddef3af52dd71341083707c9e3745dd505.camel@kernel.crashing.org>

On Thu, May 24, 2018 at 08:27:04AM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-05-23 at 21:50 +0300, Michael S. Tsirkin wrote:
> 
> > I re-read that discussion and I'm still unclear on the
> > original question, since I got several apparently
> > conflicting answers.
> > 
> > I asked:
> > 
> > 	Why isn't setting VIRTIO_F_IOMMU_PLATFORM on the
> > 	hypervisor side sufficient?
> 
> I thought I had replied to this...
> 
> There are a couple of reasons:
> 
> - First qemu doesn't know that the guest will switch to "secure mode"
> in advance. There is no difference between a normal and a secure
> partition until the partition does the magic UV call to "enter secure
> mode" and qemu doesn't see any of it. So who can set the flag here ?

Not sure I understand. Just set the flag e.g. on qemu command line.
I might be wrong, but these secure mode things usually
a. require hypervisor side tricks anyway

> - Second, when using VIRTIO_F_IOMMU_PLATFORM, we also make qemu (or
> vhost) go through the emulated MMIO for every access to the guest,
> which adds additional overhead.
> 
> Cheers,
> Ben.

Well it's not supposed to be much slower for the static case.

vhost has a cache so should be fine.

A while ago Paolo implemented a translation cache which should be
perfect for this case - most of the code got merged but
never enabled because of stability issues.

If all else fails, we could teach QEMU to handle the no-iommu case
as if VIRTIO_F_IOMMU_PLATFORM was off.



> > 
> > 
> > >  arch/powerpc/include/asm/dma-mapping.h |  6 ++++++
> > >  arch/powerpc/platforms/pseries/iommu.c | 11 +++++++++++
> > >  drivers/virtio/virtio_ring.c           | 10 ++++++++++
> > >  3 files changed, 27 insertions(+)
> > > 
> > > diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
> > > index 8fa3945..056e578 100644
> > > --- a/arch/powerpc/include/asm/dma-mapping.h
> > > +++ b/arch/powerpc/include/asm/dma-mapping.h
> > > @@ -115,4 +115,10 @@ extern u64 __dma_get_required_mask(struct device *dev);
> > >  #define ARCH_HAS_DMA_MMAP_COHERENT
> > >  
> > >  #endif /* __KERNEL__ */
> > > +
> > > +#define platform_forces_virtio_dma platform_forces_virtio_dma
> > > +
> > > +struct virtio_device;
> > > +
> > > +extern bool platform_forces_virtio_dma(struct virtio_device *vdev);
> > >  #endif	/* _ASM_DMA_MAPPING_H */
> > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > index 06f0296..a2ec15a 100644
> > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > @@ -38,6 +38,7 @@
> > >  #include <linux/of.h>
> > >  #include <linux/iommu.h>
> > >  #include <linux/rculist.h>
> > > +#include <linux/virtio.h>
> > >  #include <asm/io.h>
> > >  #include <asm/prom.h>
> > >  #include <asm/rtas.h>
> > > @@ -1396,3 +1397,13 @@ static int __init disable_multitce(char *str)
> > >  __setup("multitce=", disable_multitce);
> > >  
> > >  machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
> > > +
> > > +bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > > +{
> > > +	/*
> > > +	 * On protected guest platforms, force virtio core to use DMA
> > > +	 * MAP API for all virtio devices. But there can also be some
> > > +	 * exceptions for individual devices like virtio balloon.
> > > +	 */
> > > +	return (of_find_compatible_node(NULL, NULL, "ibm,ultravisor") != NULL);
> > > +}
> > 
> > Isn't this kind of slow?  vring_use_dma_api is on
> > data path and supposed to be very fast.
> > 
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 21d464a..47ea6c3 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -141,8 +141,18 @@ struct vring_virtqueue {
> > >   * unconditionally on data path.
> > >   */
> > >  
> > > +#ifndef platform_forces_virtio_dma
> > > +static inline bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > > +{
> > > +	return false;
> > > +}
> > > +#endif
> > > +
> > >  static bool vring_use_dma_api(struct virtio_device *vdev)
> > >  {
> > > +	if (platform_forces_virtio_dma(vdev))
> > > +		return true;
> > > +
> > >  	if (!virtio_has_iommu_quirk(vdev))
> > >  		return true;
> > >  
> > > -- 
> > > 2.9.3

^ permalink raw reply

* Re: [PATCH net-next v12 0/5] Enable virtio_net to act as a standby for a passthru device
From: Michael S. Tsirkin @ 2018-05-25 17:19 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: alexander.h.duyck, virtio-dev, jiri, kubakici, netdev,
	virtualization, loseweigh, anjali.singhai, aaron.f.brown, davem
In-Reply-To: <1527180917-39737-1-git-send-email-sridhar.samudrala@intel.com>

On Thu, May 24, 2018 at 09:55:12AM -0700, Sridhar Samudrala wrote:
> The main motivation for this patch is to enable cloud service providers
> to provide an accelerated datapath to virtio-net enabled VMs in a 
> transparent manner with no/minimal guest userspace changes. This also
> enables hypervisor controlled live migration to be supported with VMs that
> have direct attached SR-IOV VF devices.
> 
> Patch 1 introduces a failover module that provides a generic interface for 
> paravirtual drivers to listen for netdev register/unregister/link change
> events from pci ethernet devices with the same MAC and takeover their
> datapath. The notifier and event handling code is based on the existing
> netvsc implementation. 
> 
> Patch 2 refactors netvsc to use the registration/notification framework
> introduced by failover module.
> 
> Patch 3 introduces a net_failover driver that provides an automated
> failover mechanism to paravirtual drivers via APIs to create and destroy
> a failover master netdev and mananges a primary and standby slave netdevs
> that get registered via the generic failover infrastructure.
> 
> Patch 4 introduces a new feature bit VIRTIO_NET_F_STANDBY to virtio-net
> that can be used by hypervisor to indicate that virtio_net interface
> should act as a standby for another device with the same MAC address.
> 
> Patch 5 extends virtio_net to use alternate datapath when available and
> registered. When STANDBY feature is enabled, virtio_net driver uese the
> net_failover API to create an additional 'failover' netdev that acts as
> a master device and controls 2 slave devices.  The original virtio_net
> netdev is registered as 'standby' netdev and a passthru/vf device with
> the same MAC gets registered as 'primary' netdev. Both 'standby' and
> 'failover' netdevs are associated with the same 'pci' device.  The user
> accesses the network interface via 'failover' netdev. The 'failover'
> netdev chooses 'primary' netdev as default for transmits when it is
> available with link up and running.
> 
> As this patch series is initially focusing on usecases where hypervisor 
> fully controls the VM networking and the guest is not expected to directly 
> configure any hardware settings, it doesn't expose all the ndo/ethtool ops
> that are supported by virtio_net at this time. To support additional usecases,
> it should be possible to enable additional ops later by caching the state
> in failover netdev and replaying when the 'primary' netdev gets registered. 
>  
> At the time of live migration, the hypervisor needs to unplug the VF device
> from the guest on the source host and reset the MAC filter of the VF to
> initiate failover of datapath to virtio before starting the migration. After
> the migration is completed, the destination hypervisor sets the MAC filter
> on the VF and plugs it back to the guest to switch over to VF datapath.
> 
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Series:

Acked-by: Michael S. Tsirkin <mst@redhat.com>


> v12:
> - Tested live migration with virtio-net/AVF(i40evf) configured in failover
>   mode while running iperf in background. Tried static ip and dhcp
>   configurations using 'network' scripts and Network Manager.
> - Build tested netvsc module.
> Updates:
> - Extended generic failover module to do common functions like setting
>   FAILOVER_SLAVE flag, registering rx-handler and linking to upper dev in
>   the generic register/unregister handlers.
>   This required adding 3 additional failover ops pre_register, pre_unregister
>   and handle_frame.  netvsc and net_failover drivers are updated to support
>   these ops.
> 
> v11:
> - Split net_failover module into 2 components.
>   1. 'failover' module that provides generic failover infrastructure
>   to register a failover instance and listen for slave events.
>   2. 'net_failover' driver that provides APIs to create/destroy upper
>   netdev and supports 3-netdev model used by virtio-net.
> - Added documentation
> 
> v10:
> - fix net_failover_open() to update failover CARRIER correctly based on
>   standby and primary states. 
> - fix net_failover_handle_frame() to handle frames received on standby
>   when primary is present.
> - replace netdev_upper_dev_link with netdev_master_upper_dev_link and
>   handle lower dev state changes.
> - fix net_failver_create() and net_failover_register() interfaces to
>   use ERR_PTR and avoid arg **
> - disable setting mac address when virtio-net in STANDBY mode
> - document exported symbols
> - added entry to MAINTAINERS file
> 
> v9:
> Select NET_FAILOVER automatically when VIRTIO_NET/HYPERV_NET 
> are enabled. (stephen)
> 
> v8:
> - Made the failover managment routines more robust by updating the feature 
>   bits/other fields in the failover netdev when slave netdevs are 
>   registered/unregistered. (mst)
> - added support for handling vlans.
> - Limited the changes in netvsc to only use the notifier/event/lookups
>   from the failover module. The slave register/unregister/link-change 
>   handlers are only updated to use the getbymac routine to get the 
>   upper netdev. There is no change in their functionality. (stephen)
> - renamed structs/function/file names to use net_failover prefix. (mst)
> 
> v7
> - Rename 'bypass/active/backup' terminology with 'failover/primary/standy'
>   (jiri, mst)
> - re-arranged dev_open() and dev_set_mtu() calls in the register routines
>   so that they don't get called for 2-netdev model. (stephen)
> - fixed select_queue() routine to do queue selection based on VF if it is
>   registered as primary. (stephen)
> -  minor bugfixes
> 
> v6 RFC:
>   Simplified virtio_net changes by moving all the ndo_ops of the 
>   bypass_netdev and create/destroy of bypass_netdev to 'bypass' module.
>   avoided 2 phase registration(driver + instances).
>   introduced IFF_BYPASS/IFF_BYPASS_SLAVE dev->priv_flags 
>   replaced mutex with a spinlock
> 
> v5 RFC:
>   Based on Jiri's comments, moved the common functionality to a 'bypass'
>   module so that the same notifier and event handlers to handle child
>   register/unregister/link change events can be shared between virtio_net
>   and netvsc.
>   Improved error handling based on Siwei's comments.
> v4:
> - Based on the review comments on the v3 version of the RFC patch and
>   Jakub's suggestion for the naming issue with 3 netdev solution,
>   proposed 3 netdev in-driver bonding solution for virtio-net.
> v3 RFC:
> - Introduced 3 netdev model and pointed out a couple of issues with
>   that model and proposed 2 netdev model to avoid these issues.
> - Removed broadcast/multicast optimization and only use virtio as
>   backup path when VF is unplugged.
> v2 RFC:
> - Changed VIRTIO_NET_F_MASTER to VIRTIO_NET_F_BACKUP (mst)
> - made a small change to the virtio-net xmit path to only use VF datapath
>   for unicasts. Broadcasts/multicasts use virtio datapath. This avoids
>   east-west broadcasts to go over the PCI link.
> - added suppport for the feature bit in qemu
> 
> Sridhar Samudrala (5):
>   net: Introduce generic failover module
>   netvsc: refactor notifier/event handling code to use the failover
>     framework
>   net: Introduce net_failover driver
>   virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit
>   virtio_net: Extend virtio to use VF datapath when available
> 
>  Documentation/networking/failover.rst     |  18 +
>  Documentation/networking/net_failover.rst | 116 +++++
>  MAINTAINERS                               |  16 +
>  drivers/net/Kconfig                       |  13 +
>  drivers/net/Makefile                      |   1 +
>  drivers/net/hyperv/Kconfig                |   1 +
>  drivers/net/hyperv/hyperv_net.h           |   2 +
>  drivers/net/hyperv/netvsc_drv.c           | 222 ++------
>  drivers/net/net_failover.c                | 836 ++++++++++++++++++++++++++++++
>  drivers/net/virtio_net.c                  |  40 +-
>  include/linux/netdevice.h                 |  16 +
>  include/net/failover.h                    |  36 ++
>  include/net/net_failover.h                |  40 ++
>  include/uapi/linux/virtio_net.h           |   3 +
>  net/Kconfig                               |  13 +
>  net/core/Makefile                         |   1 +
>  net/core/failover.c                       | 315 +++++++++++
>  17 files changed, 1522 insertions(+), 167 deletions(-)
>  create mode 100644 Documentation/networking/failover.rst
>  create mode 100644 Documentation/networking/net_failover.rst
>  create mode 100644 drivers/net/net_failover.c
>  create mode 100644 include/net/failover.h
>  create mode 100644 include/net/net_failover.h
>  create mode 100644 net/core/failover.c
> 
> -- 
> 2.14.3

^ permalink raw reply

* Re: [PATCH v3 09/27] x86/acpi: Adapt assembly for PIE support
From: Thomas Garnier via Virtualization @ 2018-05-25 17:00 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Kate Stewart, Nicolas Pitre, the arch/x86 maintainers,
	Sergey Senozhatsky, Petr Mladek, Len Brown, Peter Zijlstra,
	Yonghong Song, Christopher Li, Dave Hansen, Dominik Brodowski,
	LKML, Masahiro Yamada, Jan Beulich, H . Peter Anvin,
	Kernel Hardening, Christoph Lameter, Alok Kataria,
	Linux Doc Mailing List, linux-arch, Jonathan Corbet, Herbert Xu
In-Reply-To: <20180525091447.GC9666@amd>

On Fri, May 25, 2018 at 2:14 AM Pavel Machek <pavel@ucw.cz> wrote:

> On Thu 2018-05-24 09:35:42, Thomas Garnier wrote:
> > On Thu, May 24, 2018 at 4:03 AM Pavel Machek <pavel@ucw.cz> wrote:
> >
> > > On Wed 2018-05-23 12:54:03, Thomas Garnier wrote:
> > > > Change the assembly code to use only relative references of symbols
for
> > the
> > > > kernel to be PIE compatible.
> > > >
> > > > Position Independent Executable (PIE) support will allow to
extended the
> > > > KASLR randomization range below the -2G memory limit.
> >
> > > What testing did this get?
> >
> > Tested boot, hibernation and performance on qemu and dedicated machine.

> Well, this is suspend, not hibernation code.

> So "sudo pm-suspend" or "echo mem > /sys/power/state" would be good
> way to test this.

Thanks, it worked. I added this to the testsuite I use for KASLR.


> Thanks,
>                                                          Pavel

> > > > diff --git a/arch/x86/kernel/acpi/wakeup_64.S
> > b/arch/x86/kernel/acpi/wakeup_64.S
> > > > index 50b8ed0317a3..472659c0f811 100644
> > > > --- a/arch/x86/kernel/acpi/wakeup_64.S
> > > > +++ b/arch/x86/kernel/acpi/wakeup_64.S
> > > > @@ -14,7 +14,7 @@
> > > >        * Hooray, we are in Long 64-bit mode (but still running in
low
> > memory)
> > > >        */
> > > >  ENTRY(wakeup_long64)
> > > > -     movq    saved_magic, %rax
> > > > +     movq    saved_magic(%rip), %rax
> > > >       movq    $0x123456789abcdef0, %rdx
> > > >       cmpq    %rdx, %rax
> > > >       jne     bogus_64_magic
> >
> > > Because, as comment says, this is rather tricky code.
> >
> > I agree, I think maintainers feedback is very important for this
patchset.


> --
> (english) http://www.livejournal.com/~pavelmachek
> (cesky, pictures)
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html



-- 
Thomas

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox